diff --git a/_clang-format b/.clang-format similarity index 93% rename from _clang-format rename to .clang-format index d1af2d18..afb5fa06 100644 --- a/_clang-format +++ b/.clang-format @@ -7,7 +7,7 @@ AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left AlignOperands: true -AlignTrailingComments: true +AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true @@ -130,6 +130,8 @@ RawStringFormats: ReflowComments: true SortIncludes: true SortUsingDeclarations: true + +# spaces SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true @@ -140,12 +142,19 @@ SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 +SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false + +# https://stackoverflow.com/questions/67396557/adding-space-after-in-clang-format +# this is to allow both // and //space comments to be unmangled +SpacesInLineCommentPrefix: + Minimum: 0 + Maximum: 1 + Standard: Auto StatementMacros: - Q_UNUSED diff --git a/.clang-format-ignore b/.clang-format-ignore new file mode 100644 index 00000000..cf89029a --- /dev/null +++ b/.clang-format-ignore @@ -0,0 +1,37 @@ +#comments look like this, so these dirs are processed + +gltf/* + +#hlslparser/* + +#kram-preview/* +#kram-profile/* +#kram-profile/CBA/* +#kram-shader/* +#kram-thumb/* +#kram-thumb-win/* +#kramc +#kramv + +libkram/allocate/* +libkram/astc-encoder/* +libkram/bc7enc/* +libkram/cgltf/* +libkram/compressonator/* +libkram/eastl/* +#libkram/etc2comp/* +libkram/fastl/* +libkram/fmt/* +libkram/heman/* +libkram/json11/* +#libkram/kram/* +libkram/lodepng/* +libkram/miniz/* +libkram/simdjson/* +libkram/squish/* +libkram/tmpfileplus/* +libkram/transcoder/* +#libkram/vectormath/* +libkram/zstd/* + +plugin/* diff --git a/.gitattributes b/.gitattributes index ffd4fb1a..86ca36cb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,8 +1,54 @@ -# Make all line endings match macOS/linux, even on Windows. This allows bash to run. -* text eol=lf +# This controls the line-endings on various file types. +# Most win editors can cope with lf ending files, +# and use binary load to avoid cstdlib replacement. +# txt/sh/py scripts need to be lf to run across wsl/macOS +*.sh text eol=lf +*.py text eol=lf +*.txt text eol=lf + +*.json text eol=lf +*.plist text eol=lf +*.xcconfig text eol=lf + +*.md text eol=lf +LICENSE text eol=lf +meson.build text eol=lf +*.vcproj text eol=crlf + +# what about .cpp/.h files?
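For reference, the SpacesInLineCommentPrefix setting added to .clang-format above (Minimum: 0, Maximum: 1) is what leaves both comment spellings alone; a minimal C++ illustration of the two styles it preserves (hypothetical snippet, not part of the repo):

//comment with no space after the slashes is left as-is (Minimum: 0)
// comment with one space after the slashes is also left as-is (Maximum: 1)
int main()
{
    int unused = 0; // trailing comments get a single leading space (SpacesBeforeTrailingComments: 1)
    return unused;
}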
+ +#------------- # commit various binary file types to git-lfs +# see here https://rehansaeed.com/gitattributes-best-practices/ +# -text means it's not a text file and is binary + +# Archives +*.7z filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text + +*.ico filter=lfs diff=lfs merge=lfs -text +*.jpg filter=lfs diff=lfs merge=lfs -text *.png filter=lfs diff=lfs merge=lfs -text *.ktx filter=lfs diff=lfs merge=lfs -text *.ktx2 filter=lfs diff=lfs merge=lfs -text *.dds filter=lfs diff=lfs merge=lfs -text +*.psd filter=lfs diff=lfs merge=lfs -text + +# Documents +*.pdf filter=lfs diff=lfs merge=lfs -text + +# Models +*.obj filter=lfs diff=lfs merge=lfs +*.gltf filter=lfs diff=lfs merge=lfs +*.glb filter=lfs diff=lfs merge=lfs -text +*.fbx filter=lfs diff=lfs merge=lfs -text +*.usda filter=lfs diff=lfs merge=lfs -text +*.usdc filter=lfs diff=lfs merge=lfs -text +*.usdz filter=lfs diff=lfs merge=lfs -text +*.rkassets filter=lfs diff=lfs merge=lfs -text + +# Other +*.exe filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/pre-release.yml b/.github/workflows/pre-release.yml index 6e23a987..ebb5ef13 100644 --- a/.github/workflows/pre-release.yml +++ b/.github/workflows/pre-release.yml @@ -14,14 +14,15 @@ jobs: matrix: #os: [ubuntu-latest, macos-latest, windows-latest] #os: [macos-latest, windows-latest] - os: [macos-13, windows-latest] + #os: [macos-15, windows-latest] + os: [ubuntu-latest, macos-15, windows-latest] steps: - name: Update CMake uses: lukka/get-cmake@latest - name: Check out code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Build and install to bin/ run: ./scripts/cibuild.sh ${{ matrix.os }} diff --git a/.github/workflows/tagged-release.yml b/.github/workflows/tagged-release.yml index f8af27f1..bb9c3414 100644 --- a/.github/workflows/tagged-release.yml +++ b/.github/workflows/tagged-release.yml @@ -13,14 +13,15 @@ jobs: matrix: #os: [ubuntu-latest, macos-latest, windows-latest] #os: [macos-latest, windows-latest] - os: [macos-13, windows-latest] + #os: [macos-15, windows-latest] + os: [ubuntu-latest, macos-15, windows-latest] steps: - name: Update CMake uses: lukka/get-cmake@latest - name: Check out code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Build and install to bin/ run: ./scripts/cibuild.sh ${{ matrix.os }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d1e7762..7ec939ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,36 +1,26 @@ -# for now don't require high cmake for TravisCI builds on Win, but need 3.19.2 for universal app builds -if (APPLE) - cmake_minimum_required(VERSION 3.19.1 FATAL_ERROR) -else() - cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) -endif() - +cmake_minimum_required(VERSION 3.19.1 FATAL_ERROR) + #----------------------------------------------------- +# really not using cmake for mac, but this was used in the past so leaving it in +# still building Win using Cmake. macOS uses avx2 and Win uses avx2. 
set(BUILD_MAC FALSE) set(BUILD_WIN FALSE) +set(BUILD_LINUX FALSE) if (APPLE) - message("build for macOS") + message(STATUS "build for macOS") set(BUILD_MAC TRUE) elseif (WIN32) - message("build for win x64") + message(STATUS "build for win x64") set(BUILD_WIN TRUE) +elseif (LINUX) + message(STATUS "build for linux x64") + set(BUILD_LINUX TRUE) endif() #----------------------------------------------------- -# SYSROOT must be set before project, -# SYSROOT is max OS, deployment is min OS on Apple. -# If SYSROOT not set, then cmake uses min OS from deployment target. Ugh. -# so have to force SYSROOT to latest SDK. -# Want to set 11.0 here, but Xcode 12.3 ships with 11.1, etc. -# So then cmake breaks when it cannot find the C compiler, etc. -# Setting macosx to take the latest greatest sdk. - -# don't change these to set_property(GLOBAL) or set_target_properties, the need to be set prior to project -# and only seem to work if set() is used to force the global value. - # suppress ZERO_CHECK project set(CMAKE_SUPPRESS_REGENERATION true) @@ -38,137 +28,128 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED YES) set(CMAKE_CXX_EXTENSIONS NO) -# Xcode 12.2 ships with macosx11.0, but 12.3 ships with macosx11.1 -# cmake on 12.3 completely breaks when this is set and can't find c compilers. -# set(CMAKE_OSX_SYSROOT macosx11.0) -# set(CMAKE_OSX_SYSROOT macos) # this doesn't work - -# CMAKE_OSX_DEPLOYMENT_TARGET must be set as a CACHE variable, or it will be stripped -if (BUILD_MAC) - set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS") - set(CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)" CACHE STRING "Architecture macOS") -endif() - set(CMAKE_CONFIGURATION_TYPES "Debug;Release") set(CMAKE_BUILD_TYPE Release) - -if (BUILD_MAC) - set(CMAKE_DEFAULT_STARTUP_PROJECT "kramc") -elseif (BUILD_WIN) - set(CMAKE_DEFAULT_STARTUP_PROJECT "kramc") -endif() - +set(CMAKE_DEFAULT_STARTUP_PROJECT "kramc") #----------------------------------------------------- +if (BUILD_LINUX) + set(myTargetWorkspace kramWorkspace) -# cmake translates project to sln in Win, but to xcode projects on Mac. -# No way to make xcode workspaces, but could do manually. 
-set(myTargetWorkspace kramWorkspace) + # don't want gcc, want clang + SET (CMAKE_CXX_COMPILER "/usr/bin/clang++" CACHE STRING "C++ compiler" FORCE) + SET (CMAKE_C_COMPILER "/usr/bin/clang" CACHE STRING "C compiler" FORCE) -if (BUILD_MAC) - project(${myTargetWorkspace} LANGUAGES C CXX OBJCXX) -elseif (BUILD_WIN) project(${myTargetWorkspace} LANGUAGES C CXX) -endif() -# the kram static library libkram which should build on iOS/Android/Mac/Win -# this doesn't set a project, but maybe it should -add_subdirectory(libkram) + # want to only use clang across all platforms + message(STATUS "Using ${CMAKE_CXX_COMPILER_ID} compiler") + + # the kram static library libkram which should build on iOS/Android/Mac/Win + # this doesn't set a project, but maybe it should + add_subdirectory(libkram) -# the CLI app for Mac/Win that can build content for other platforms, uses libkram -add_subdirectory(kramc) + # the CLI app for Mac/Win that can build content for other platforms, uses libkram + add_subdirectory(kramc) -if (BUILD_MAC) - # the viewer is only written for macOS Intel/ARM currently, uses libkram - add_subdirectory(kramv) + set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin) - # ps plugin that uses libkram - add_subdirectory(plugin) + # need app/libs to be in bin directory to zip archive + install(TARGETS libkram ARCHIVE DESTINATION ${BIN_DIR}) + install(TARGETS kram RUNTIME DESTINATION ${BIN_DIR}) + endif() - -# this is an Explorer thumbnail extension (run script to un/register), uses libkrma + +#----------------------------------------------------- if (BUILD_WIN) - add_subdirectory(kram-thumb-win) -endif() + # cmake translates project to sln in Win, but to xcode projects on Mac. + # No way to make xcode workspaces, but could do manually. + set(myTargetWorkspace kramWorkspace) -# hack hlslparser for win build into kram for now, does not use kram -if (BUILD_WIN) - add_subdirectory(hlslparser) -endif() + project(${myTargetWorkspace} LANGUAGES C CXX) + + # want to only use clang across all platforms + message(STATUS "Using ${CMAKE_CXX_COMPILER_ID} compiler") + + #----------------------------------------------------- -#----------------------------------------------------- + # the kram static library libkram which should build on iOS/Android/Mac/Win + # this doesn't set a project, but maybe it should + add_subdirectory(libkram) -# https://discourse.cmake.org/t/specifying-cmake-osx-sysroot-breaks-xcode-projects-but-no-other-choice/2532/8 -# use snipet from Alian Martin to validate SDK + # the CLI app for Mac/Win that can build content for other platforms, uses libkram + add_subdirectory(kramc) -if (BUILD_MAC) - if(NOT DEFINED CMAKE_OSX_SYSROOT) - message(FATAL_ERROR "Cannot check SDK version if CMAKE_OSX_SYSROOT is not defined." 
- ) - endif() - - # check the Xcode app itself for it's version - set(XCODE_MIN_APP 12.2) - if(XCODE AND XCODE_VERSION VERSION_LESS XCODE_MIN_APP) - message(FATAL_ERROR "This project requires at least Xcode ${XCODE_MIN_APP}") - endif() - - # check the SDK - set(XCODE_MIN_SDK_IOS 14.1) - set(XCODE_MIN_SDK_MACOS 11.0) - - execute_process( - COMMAND xcrun --sdk "${CMAKE_OSX_SYSROOT}" --show-sdk-version - OUTPUT_VARIABLE SDK_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE - ) + # TODO: this needs a shared libkram, but kramc uses static libkram + # this is an Explorer thumbnail extension (run script to un/register), uses libkram + # add_subdirectory(kram-thumb-win) - message("macOS SDK ${SDK_VERSION}") - message("macOS deploy ${CMAKE_OSX_DEPLOYMENT_TARGET}") - message("macOS arch ${CMAKE_OSX_ARCHITECTURES}") - - if (SDK_VERSION VERSION_LESS XCODE_MIN_SDK_MACOS) - message(FATAL_ERROR "This project requires at least macOS SDK ${XCODE_MIN_SDK_MACOS}" - ) - endif() + # hack hlslparser for win build into kram for now, does not use kram + # add_subdirectory(hlslparser) + + #----------------------------------------------------- + + set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin) + + # need app/libs to be in bin directory to zip archive + install(TARGETS libkram ARCHIVE DESTINATION ${BIN_DIR}) + install(TARGETS kram RUNTIME DESTINATION ${BIN_DIR}) + #install(TARGETS kram-thumb-win LIBRARY DESTINATION ${BIN_DIR}) + # hlslparser is also now in the kram build. Keep executables up to date. + # I would use the sln file, but msbuild doesn't like to be called from cibuild.sh + # This builds but has a lot of warnings. When I resume work, will reinstate. + # install(TARGETS hlslparser RUNTIME DESTINATION ${BIN_DIR}) endif() #----------------------------------------------------- +# This part is unmaintained. Couldn't build app extensions via CMake. +# So now just maintain projects. -# was considering platform-specific builds, but mac/win don't conflict -set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin) - -# So by default install depends on ALL_BUILD target, but that will fail if plugin -# does not have everything setup to build (or like now is not building). -# The plugin is currently setting EXCLUDE_FROM_ALL on the target so it's not built. -# https://stackoverflow.com/questions/17164731/installing-only-one-target-and-its-dependencies-out-of-a-complex-project-with +if (BUILD_MAC) + # cmake translates project to sln in Win, but to xcode projects on Mac. + # No way to make xcode workspaces, but could do manually.
+ set(myTargetWorkspace kramWorkspace) -# install doesn't seem to do anything on WIN32, the build elements are not copied -install(TARGETS libkram ARCHIVE DESTINATION ${BIN_DIR}) + project(${myTargetWorkspace} LANGUAGES C CXX OBJCXX) -if (BUILD_MAC OR BUILD_WIN) - install(TARGETS kram RUNTIME DESTINATION ${BIN_DIR}) -endif() + # CMAKE_OSX_DEPLOYMENT_TARGET must be set as a CACHE variable, or it will be stripped + set(CMAKE_OSX_DEPLOYMENT_TARGET "13.0" CACHE STRING "Minimum macOS") + set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Architecture macOS") + + #----------------------------------------------------- -if (BUILD_MAC) - install(TARGETS kramv BUNDLE DESTINATION ${BIN_DIR}) -endif() + # the kram static library libkram which should build on iOS/Android/Mac/Win + # this doesn't set a project, but maybe it should + add_subdirectory(libkram) -if (BUILD_WIN) - install(TARGETS kram-thumb-win LIBRARY DESTINATION ${BIN_DIR}) -endif() + # the CLI app for Mac/Win that can build content for other platforms, uses libkram + add_subdirectory(kramc) -# don't install this yet -#if (BUILD_MAC) -# install(TARGETS kram-ps BUNDLE DESTINATION ${BIN_DIR}) -#endif() + # the viewer is only written for macOS Intel/ARM currently, uses libkram + add_subdirectory(kramv) -# hlslparser is also now in the kram build. Keep executables up to date. -# I would use the sln file, but msbuild doesn't like to be called from cibuld.sh -if (BUILD_WIN) + # ps plugin that uses libkram + add_subdirectory(plugin) + + # hlslparser needs some more work to modernize to a C++ style HLSL syntax + add_subdirectory(hlslparser) + + #----------------------------------------------------- + + # was considering platform-specific builds, but mac/win don't conflict + set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin) + + install(TARGETS libkram ARCHIVE DESTINATION ${BIN_DIR}) + install(TARGETS kram RUNTIME DESTINATION ${BIN_DIR}) + install(TARGETS kramv BUNDLE DESTINATION ${BIN_DIR}) install(TARGETS hlslparser RUNTIME DESTINATION ${BIN_DIR}) + + # photoshop plugin +# install(TARGETS kram-ps BUNDLE DESTINATION ${BIN_DIR}) endif() + diff --git a/README.md b/README.md index 7af9f966..b08acec0 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,15 @@ https://github.com/alecazam/kram/tree/main/kram-profile Parses HLSL syntax and generates readable HLSL/MSL code without transpiling. DXC is then used to compile to spirv. https://github.com/alecazam/kram/tree/main/hlslparser +# vectormath +Fast vector math based around clang vector extensions. Requires clang, and is accelerated for arm64/neon and x64/avx2+fma+f16c. +https://github.com/alecazam/kram/tree/main/libkram/vectormath + # libkram.a, libkram-ios.a, kram.lib -C++11 library from 200 to 800KB in size depending on encoder options. Compiles for iOS (ARM), macOS (ARM/Intel), win (Intel). +C++11 library from 200 to 800KB in size depending on encoder options. Compiles for iOS/macOS (arm64), win/linux (x64). # kram, kram.exe -C++11 main to libkram to create CLI tool. Encode/decode/info on PNG/KTX/KTX2/DDS files with LDR/HDR and BC/ASTC/ETC2. Runs on macOS/win. +C++11 main to libkram to create CLI tool. Encode/decode/info on PNG/KTX/KTX2/DDS files with LDR/HDR and BC/ASTC/ETC2. Runs on macOS (arm64), win/linux (x64). # kram-thumb-win.dll Windows thumbnailer for DDS/KTX/KTX2. Go to build or bin folder.
Install with https://github.com/alecazam/kram/tree/main/kram-thumb-win # kramv.app -ObjC++ viewer for PNG/KTX/KTX2/DDS supported files from kram. Uses Metal compute and shaders, eyedropper, grids, debugging, preview. Supports HDR and all texture types. Mip, face, volume, and array access. No dmg yet, just drop onto /Applications folder. Runs on macOS (arm64/x64). Generates Finder thumbnails and QuickLook previews via modern macOS app extension mechanisms. +ObjC++ viewer for PNG/KTX/KTX2/DDS supported files from kram. Uses Metal compute and shaders, eyedropper, grids, debugging, preview. Supports HDR and all texture types. Mip, face, volume, and array access. No dmg yet, just drop onto /Applications folder. Runs on macOS (arm64). Generates Finder thumbnails and QuickLook previews via modern macOS app extension mechanisms. Diagrams and screenshots can be located here: -https://www.figma.com/file/bPmPSpBGTi2xTVnBDqVEq0/kram +https://www.figma.com/design/bPmPSpBGTi2xTVnBDqVEq0/kram?node-id=0-1&t=OnP0wHcDOmg7b7Vg-1 -#### Releases includes builds for macOS (Xcode 14.3 - arm64/x64/clang) and Windows x64 (VS 2022 - x64/clang). kramv for macOS, kram for macOS/Win, libkram for macOS/iOS/Win, win-thumb-kram for Win. Android library via NDK is possible, but f16 support is spotty on devices. +#### Releases include builds for macOS (Xcode 15.3 - arm64/x64/clang), Windows x64 (VS 2022 - x64/clang), and Linux (ubuntu-x64/clang). kramv for macOS, kram for macOS/Win/Linux, libkram for macOS/iOS/Win/Linux, win-thumb-kram for Win. Android library via NDK is possible, but f16 support is spotty on devices. ### About kram kram is a wrapper to several popular encoders. Most encoders have sources, and have been optimized to use very little memory and generate high quality encodings at all settings. All kram encoders are currently CPU-based. Some of these encoders use SSE, and a SSE to Neon layer translates those. kram was built to be small and used as a library or app. It's also designed for mobile and desktop use. The final size with all encoders is under 1MB, and disabling each encoder chops off around 200KB down to a final 200KB app size via dead-code stripping. The code should compile with C++11 or higher. @@ -37,7 +41,7 @@ Many of the encoder sources can multithread a single image, but that is unused. Similar to a makefile system, the script sample kramtexture.py uses modstamps to skip textures that have already been processed. If the source png/ktx/ktx2 is older than the output, then the file is skipped. Command line options are not yet compared, so if those change then use --force on the python script to rebuild all textures. Also a crc/hash could be used instead when modstamp isn't sufficient or the same data could come from different folders. ### About kramv -kramv is a viewer for the BC/ASTC/ETC2 LDR/HDR KTX/KTX2/DDS textures generated by kram from LDR PNG and LDR/HDR KTX/KTX2/DDS sources. kramv decodes ASTC/ETC2 textures on macOS Intel, where the GPU doesn't support them. macOS with Apple Silicon supports all three formats, and doesn't need to decode. +kramv is a viewer for the BC/ASTC/ETC2 LDR/HDR KTX/KTX2/DDS textures generated by kram from LDR PNG and LDR/HDR KTX/KTX2/DDS sources. kramv decodes ASTC/ETC2 textures on macOS Intel, where the GPU doesn't support them. macOS with Apple Silicon supports all three formats, and doesn't need to decode. I have macOS Intel support disabled as of 1/25, but it can be re-enabled in the xcode project.
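As a note on the kramtexture.py modstamp skip described in the About kram section above, the check is just a source-vs-output timestamp comparison; a rough, hypothetical C++17 equivalent is sketched below (the script itself is Python, and needsRebuild is an illustrative name, not a kram API):

#include <filesystem>

// Returns true when a texture needs re-encoding: the output is missing,
// or the source png/ktx/ktx2 is newer than the output.
bool needsRebuild(const std::filesystem::path& src, const std::filesystem::path& dst)
{
    namespace fs = std::filesystem;
    if (!fs::exists(dst))
        return true; // never encoded
    return fs::last_write_time(src) > fs::last_write_time(dst); // source changed since last encode
}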
kramv uses ObjC++ with the intent to port to Windows C++ as time permits. Uses menus, buttons, and keyboard handling useful for texture triage and analysis. Drag and drop folders, bundles, and click-to-launch are supported. Recently used textures/folders/bundles are listed in the menu. The app currently shows a single document at a time. Subsequent opens reuse the same document Window. With bundles and folders, kramv will attempt to pair albedo and normal maps together by filename for the preview. @@ -636,7 +640,7 @@ kram encourages the use of lossless and hdr source data. There are not many cho KTX is a well-designed format, and KTX2 continues that tradition. It was also fairly easy to convert between these formats. Once mips are decoded, KTX2 looks very much like KTX. -Visually validating and previewing the results is complicated. KTX/2 have few viewers, hence the need for kramv. Apple's Preview can open BC and ASTC files on macOS, but not ETC/PVRTC. And then you can't look at channels or mips, or turn on/off premultiplied alpha, or view signed/unsigned data. Preview premultiplies PNG images, but KTX files aren't. Apple's thumbnails don't work for ETC2 or PVRTC data in KTX files. Windows thumbnails don't work for KTX at all. PVRTexToolGUI 2020R2 applies sRGB incorrectly to images, and can't open BC4/5/7 files on Mac. +Visually validating and previewing the results is complicated. KTX/2 have few viewers, hence the need for kramv. Apple's Preview can open BC/ASTC files on macOS without mips, but not ETC/PVRTC. It quarantines opened files. And then you can't look at channels or mips, or turn on/off premultiplied alpha, or view signed/unsigned data. Preview premultiplies PNG images, but KTX files aren't. Apple's thumbnails don't work for ETC2 or PVRTC data in KTX files. Windows thumbnails don't work for KTX at all. PVRTexToolGUI 2020R2 applies sRGB incorrectly to images, and can't open BC4/5/7 files on Mac. kram adds props to KTX/2 file to store data. Currently props store Metal and Vulkan formats. This is important since GL's ASTC LDR and HDR formats are the same constant. Also props are saved for channel content and post-swizzle. Loaders, viewers, and shaders can utilize this metadata. diff --git a/build2/kram.xcconfig b/build2/kram.xcconfig new file mode 100644 index 00000000..c63effb2 --- /dev/null +++ b/build2/kram.xcconfig @@ -0,0 +1,35 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +// Xcode's AVX2 simd setting doesn't set -mfma or -mf16c. So +// then universal builds throw hundreds of warnings. Ugh. +// Xcode doesn't set NDEBUG=1 in Release builds. Ugh. +// Also turn on -ftime-trace to review build times in kram-profile. + +// This setting only applies to x64, but many macs only have AVX (2019 MBP). +// Note that f16c is supported on AVX, but fma requires AVX2. +// If setting this to AVX, then set GLTF/GLTFMTL too since those don't use config. +// There isn't currently a vectormath fallback to avx (see SIMD_AVX2).
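To make the intent of the x64 flags below concrete, here is a small, hypothetical compile-time check (not part of the repo): with only CLANG_X86_VECTOR_INSTRUCTIONS = avx2, clang defines __AVX2__ but, per the comment above, not __FMA__ or __F16C__; the extra -mfma and -mf16c in KRAM_FLAGS_X64 are what define those.

// avx2_flag_check.cpp - hypothetical sanity check, assuming an x86_64 target
#if defined(__x86_64__)
    #if !defined(__AVX2__)
        #error "expected -mavx2 (CLANG_X86_VECTOR_INSTRUCTIONS = avx2)"
    #endif
    #if !defined(__FMA__) || !defined(__F16C__)
        #error "expected -mfma and -mf16c (KRAM_FLAGS_X64)"
    #endif
#endif

int main() { return 0; }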
+CLANG_X86_VECTOR_INSTRUCTIONS = avx2 + +KRAM_FLAGS_X64 = +KRAM_FLAGS_X64[sdk=*][arch=x86_64] = -mf16c -mfma + +KRAM_FLAGS_RELEASE = +KRAM_FLAGS_RELEASE[sdk=*][config=Release] = -DNDEBUG=1 + +KRAM_FLAGS = -ftime-trace +KRAM_FLAGS = $(KRAM_FLAGS) -DUSE_SIMDLIB=1 -DUSE_SIMDLIBMODULE=1 +KRAM_FLAGS = $(KRAM_FLAGS) -fmodules -fcxx-modules + +// KRAM_FLAGS += -DCOMPILE_EASTL=1 +// TODO: also set include path for eastl + +// configuring all the encoders in kram +// KRAM_FLAGS += -DCOMPILE_ASTCENC=1 -DCOMPILE_ATE=1 -DCOMPILE_ETCENC=1 -DCOMPILE_SQUISH=1 -DCOMPILE_BCENC=1 -DCOMPILE_COMP=1 -DCOMPILE_BASIS=0 -DCOMPILE_EASTL=0 + +// This is killing build times in Xcode16 +ENABLE_MODULE_VERIFIER = NO + +OTHER_CFLAGS = $(inherited) $(KRAM_FLAGS) $(KRAM_FLAGS_RELEASE) $(KRAM_FLAGS_X64) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index f0fc688e..26ee08f5 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -7,22 +7,9 @@ objects = { /* Begin PBXBuildFile section */ - 704738BC289F6AEE00C77A9F /* unordered_map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B1289F6AEE00C77A9F /* unordered_map.h */; }; - 704738BD289F6AEE00C77A9F /* unordered_map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B1289F6AEE00C77A9F /* unordered_map.h */; }; - 704738BE289F6AEE00C77A9F /* falgorithm.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B2289F6AEE00C77A9F /* falgorithm.h */; }; - 704738BF289F6AEE00C77A9F /* falgorithm.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B2289F6AEE00C77A9F /* falgorithm.h */; }; - 704738C0289F6AEE00C77A9F /* map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B3289F6AEE00C77A9F /* map.h */; }; - 704738C1289F6AEE00C77A9F /* map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B3289F6AEE00C77A9F /* map.h */; }; - 704738C2289F6AEE00C77A9F /* pair.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B4289F6AEE00C77A9F /* pair.h */; }; - 704738C3289F6AEE00C77A9F /* pair.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B4289F6AEE00C77A9F /* pair.h */; }; - 704738C6289F6AEE00C77A9F /* unordered_set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B6289F6AEE00C77A9F /* unordered_set.h */; }; - 704738C7289F6AEE00C77A9F /* unordered_set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B6289F6AEE00C77A9F /* unordered_set.h */; }; - 704738C8289F6AEE00C77A9F /* vector.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B7289F6AEE00C77A9F /* vector.h */; }; - 704738C9289F6AEE00C77A9F /* vector.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B7289F6AEE00C77A9F /* vector.h */; }; - 704738CA289F6AEE00C77A9F /* set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B8289F6AEE00C77A9F /* set.h */; }; - 704738CB289F6AEE00C77A9F /* set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B8289F6AEE00C77A9F /* set.h */; }; - 704738CC289F6AEE00C77A9F /* fstring.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B9289F6AEE00C77A9F /* fstring.h */; }; - 704738CD289F6AEE00C77A9F /* fstring.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B9289F6AEE00C77A9F /* fstring.h */; }; + 702E0DB62CA10BC100B652B7 /* astcenc_mathlib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB327DDDBCC00D0B9E1 /* astcenc_mathlib.cpp */; }; + 706178192DE16211001545E1 /* KramFileIO.h in Headers */ = {isa = PBXBuildFile; fileRef = 706178172DE16211001545E1 /* KramFileIO.h */; }; + 7061781A2DE16211001545E1 /* KramFileIO.cpp in Sources */ = {isa = PBXBuildFile; 
fileRef = 706178182DE16211001545E1 /* KramFileIO.cpp */; }; 706EEF7F26D1595D001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */; }; 706EEF8026D1595D001C950E /* EtcImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAC26D1583E001C950E /* EtcImage.cpp */; }; 706EEF8126D1595D001C950E /* EtcDifferentialTrys.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAF26D1583E001C950E /* EtcDifferentialTrys.cpp */; }; @@ -49,7 +36,6 @@ 706EEFB226D1595D001C950E /* KramLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2826D1583F001C950E /* KramLog.cpp */; }; 706EEFB326D1595D001C950E /* KramSDFMipper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2B26D1583F001C950E /* KramSDFMipper.cpp */; }; 706EEFB426D1595D001C950E /* KramMmapHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2C26D1583F001C950E /* KramMmapHelper.cpp */; }; - 706EEFB526D1595D001C950E /* float4a.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2F26D1583F001C950E /* float4a.cpp */; }; 706EEFB626D1595D001C950E /* Kram.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE3526D1583F001C950E /* Kram.cpp */; }; 706EEFB726D1595D001C950E /* squish.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE3D26D1583F001C950E /* squish.cpp */; }; 706EEFB826D1595D001C950E /* colourset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE3E26D1583F001C950E /* colourset.cpp */; }; @@ -97,7 +83,6 @@ 706EEFFE26D15985001C950E /* stb_rect_pack.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE1726D1583F001C950E /* stb_rect_pack.h */; }; 706EEFFF26D15985001C950E /* KramZipHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE1926D1583F001C950E /* KramZipHelper.h */; }; 706EF00026D15985001C950E /* KramSDFMipper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2026D1583F001C950E /* KramSDFMipper.h */; }; - 706EF00126D15985001C950E /* sse2neon.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2226D1583F001C950E /* sse2neon.h */; }; 706EF00226D15985001C950E /* KramConfig.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2326D1583F001C950E /* KramConfig.h */; }; 706EF00326D15985001C950E /* KramLog.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2426D1583F001C950E /* KramLog.h */; }; 706EF00426D15985001C950E /* KramLib.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2726D1583F001C950E /* KramLib.h */; }; @@ -109,7 +94,6 @@ 706EF00A26D15985001C950E /* KramImageInfo.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3126D1583F001C950E /* KramImageInfo.h */; }; 706EF00B26D15985001C950E /* KramTimer.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3226D1583F001C950E /* KramTimer.h */; }; 706EF00C26D15985001C950E /* KramMmapHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3326D1583F001C950E /* KramMmapHelper.h */; }; - 706EF00D26D15985001C950E /* float4a.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3426D1583F001C950E /* float4a.h */; }; 706EF00E26D15985001C950E /* KramFileHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3626D1583F001C950E /* KramFileHelper.h */; }; 706EF00F26D15985001C950E /* KramMipper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3726D1583F001C950E /* KramMipper.h */; }; 706EF01026D15985001C950E /* TaskSystem.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3826D1583F001C950E /* TaskSystem.h */; }; @@ -126,309 +110,107 @@ 706EF01B26D15985001C950E /* lodepng.h in Headers */ = {isa = 
PBXBuildFile; fileRef = 706EEE5426D1583F001C950E /* lodepng.h */; }; 706EF01C26D15985001C950E /* tmpfileplus.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE5926D1583F001C950E /* tmpfileplus.h */; }; 706EF12B26D159F9001C950E /* libate.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 706EF12A26D159F9001C950E /* libate.tbd */; }; - 706EF14B26D166C5001C950E /* EtcErrorMetric.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDAB26D1583E001C950E /* EtcErrorMetric.h */; }; - 706EF14C26D166C5001C950E /* EtcColor.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDAD26D1583E001C950E /* EtcColor.h */; }; - 706EF14D26D166C5001C950E /* EtcDifferentialTrys.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDAE26D1583E001C950E /* EtcDifferentialTrys.h */; }; - 706EF14E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDB026D1583E001C950E /* EtcBlock4x4Encoding_RGB8.h */; }; - 706EF14F26D166C5001C950E /* EtcConfig.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDB426D1583E001C950E /* EtcConfig.h */; }; - 706EF15026D166C5001C950E /* EtcBlock4x4Encoding_R11.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDB526D1583E001C950E /* EtcBlock4x4Encoding_R11.h */; }; - 706EF15126D166C5001C950E /* EtcBlock4x4Encoding_RG11.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDB726D1583E001C950E /* EtcBlock4x4Encoding_RG11.h */; }; - 706EF15226D166C5001C950E /* EtcMath.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDB926D1583E001C950E /* EtcMath.h */; }; - 706EF15326D166C5001C950E /* EtcIndividualTrys.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDBA26D1583E001C950E /* EtcIndividualTrys.h */; }; - 706EF15426D166C5001C950E /* EtcBlock4x4EncodingBits.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDBD26D1583E001C950E /* EtcBlock4x4EncodingBits.h */; }; - 706EF15526D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDBE26D1583E001C950E /* EtcBlock4x4Encoding_RGB8A1.h */; }; - 706EF15626D166C5001C950E /* EtcBlock4x4.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC026D1583E001C950E /* EtcBlock4x4.h */; }; - 706EF15726D166C5001C950E /* Etc.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC126D1583E001C950E /* Etc.h */; }; - 706EF15826D166C5001C950E /* EtcImage.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC226D1583E001C950E /* EtcImage.h */; }; - 706EF15926D166C5001C950E /* EtcBlock4x4Encoding_ETC1.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC326D1583E001C950E /* EtcBlock4x4Encoding_ETC1.h */; }; - 706EF15A26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC426D1583E001C950E /* EtcBlock4x4Encoding_RGBA8.h */; }; - 706EF15B26D166C5001C950E /* EtcColorFloatRGBA.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC726D1583E001C950E /* EtcColorFloatRGBA.h */; }; - 706EF15C26D166C5001C950E /* EtcBlock4x4Encoding.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC826D1583E001C950E /* EtcBlock4x4Encoding.h */; }; - 706EF16C26D166C5001C950E /* ateencoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFA26D1583E001C950E /* ateencoder.h */; }; - 706EF16D26D166C5001C950E /* basisu_transcoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFC26D1583E001C950E /* basisu_transcoder.h */; }; - 706EF16E26D166C5001C950E /* basisu_containers.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFD26D1583E001C950E /* basisu_containers.h */; }; - 706EF16F26D166C5001C950E /* 
basisu_containers_impl.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFF26D1583E001C950E /* basisu_containers_impl.h */; }; - 706EF17026D166C5001C950E /* basisu_transcoder_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE0226D1583F001C950E /* basisu_transcoder_internal.h */; }; - 706EF17126D166C5001C950E /* basisu_global_selector_cb.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE0326D1583F001C950E /* basisu_global_selector_cb.h */; }; - 706EF17226D166C5001C950E /* basisu_transcoder_uastc.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE0526D1583F001C950E /* basisu_transcoder_uastc.h */; }; - 706EF17326D166C5001C950E /* basisu_global_selector_palette.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE0626D1583F001C950E /* basisu_global_selector_palette.h */; }; - 706EF17426D166C5001C950E /* basisu.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE0C26D1583F001C950E /* basisu.h */; }; - 706EF17526D166C5001C950E /* basisu_file_headers.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE0E26D1583F001C950E /* basisu_file_headers.h */; }; - 706EF17626D166C5001C950E /* miniz.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE1226D1583F001C950E /* miniz.h */; }; - 706EF17726D166C5001C950E /* hedistance.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE1526D1583F001C950E /* hedistance.h */; }; - 706EF17826D166C5001C950E /* stb_rect_pack.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE1726D1583F001C950E /* stb_rect_pack.h */; }; - 706EF17926D166C5001C950E /* KramZipHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE1926D1583F001C950E /* KramZipHelper.h */; }; - 706EF17A26D166C5001C950E /* KramSDFMipper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2026D1583F001C950E /* KramSDFMipper.h */; }; - 706EF17B26D166C5001C950E /* sse2neon.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2226D1583F001C950E /* sse2neon.h */; }; - 706EF17C26D166C5001C950E /* KramConfig.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2326D1583F001C950E /* KramConfig.h */; }; - 706EF17D26D166C5001C950E /* KramLog.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2426D1583F001C950E /* KramLog.h */; }; - 706EF17E26D166C5001C950E /* KramLib.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2726D1583F001C950E /* KramLib.h */; }; - 706EF17F26D166C5001C950E /* KramVersion.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2926D1583F001C950E /* KramVersion.h */; }; - 706EF18026D166C5001C950E /* KramImage.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2A26D1583F001C950E /* KramImage.h */; }; - 706EF18126D166C5001C950E /* win_mmap.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2D26D1583F001C950E /* win_mmap.h */; }; - 706EF18226D166C5001C950E /* Kram.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE2E26D1583F001C950E /* Kram.h */; }; - 706EF18326D166C5001C950E /* KTXImage.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3026D1583F001C950E /* KTXImage.h */; }; - 706EF18426D166C5001C950E /* KramImageInfo.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3126D1583F001C950E /* KramImageInfo.h */; }; - 706EF18526D166C5001C950E /* KramTimer.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3226D1583F001C950E /* KramTimer.h */; }; - 706EF18626D166C5001C950E /* KramMmapHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3326D1583F001C950E /* KramMmapHelper.h */; }; - 706EF18726D166C5001C950E /* float4a.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3426D1583F001C950E /* float4a.h 
*/; }; - 706EF18826D166C5001C950E /* KramFileHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3626D1583F001C950E /* KramFileHelper.h */; }; - 706EF18926D166C5001C950E /* KramMipper.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3726D1583F001C950E /* KramMipper.h */; }; - 706EF18A26D166C5001C950E /* TaskSystem.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3826D1583F001C950E /* TaskSystem.h */; }; - 706EF18B26D166C5001C950E /* squish.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3A26D1583F001C950E /* squish.h */; }; - 706EF18C26D166C5001C950E /* clusterfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3B26D1583F001C950E /* clusterfit.h */; }; - 706EF18D26D166C5001C950E /* colourfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3C26D1583F001C950E /* colourfit.h */; }; - 706EF18E26D166C5001C950E /* alpha.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE3F26D1583F001C950E /* alpha.h */; }; - 706EF18F26D166C5001C950E /* singlecolourfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE4126D1583F001C950E /* singlecolourfit.h */; }; - 706EF19026D166C5001C950E /* maths.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE4526D1583F001C950E /* maths.h */; }; - 706EF19126D166C5001C950E /* colourset.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE4826D1583F001C950E /* colourset.h */; }; - 706EF19226D166C5001C950E /* colourblock.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE4A26D1583F001C950E /* colourblock.h */; }; - 706EF19326D166C5001C950E /* rangefit.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE4B26D1583F001C950E /* rangefit.h */; }; - 706EF19426D166C5001C950E /* zstd.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE5226D1583F001C950E /* zstd.h */; }; - 706EF19526D166C5001C950E /* lodepng.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE5426D1583F001C950E /* lodepng.h */; }; - 706EF19626D166C5001C950E /* tmpfileplus.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEE5926D1583F001C950E /* tmpfileplus.h */; }; - 706EF19826D166C5001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */; }; - 706EF19926D166C5001C950E /* EtcImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAC26D1583E001C950E /* EtcImage.cpp */; }; - 706EF19A26D166C5001C950E /* EtcDifferentialTrys.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAF26D1583E001C950E /* EtcDifferentialTrys.cpp */; }; - 706EF19B26D166C5001C950E /* EtcMath.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDB126D1583E001C950E /* EtcMath.cpp */; }; - 706EF19C26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDB226D1583E001C950E /* EtcBlock4x4Encoding_RGBA8.cpp */; }; - 706EF19D26D166C5001C950E /* EtcBlock4x4Encoding_RG11.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDB326D1583E001C950E /* EtcBlock4x4Encoding_RG11.cpp */; }; - 706EF19E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDB626D1583E001C950E /* EtcBlock4x4Encoding_RGB8A1.cpp */; }; - 706EF19F26D166C5001C950E /* EtcIndividualTrys.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDBB26D1583E001C950E /* EtcIndividualTrys.cpp */; }; - 706EF1A026D166C5001C950E /* EtcBlock4x4Encoding_R11.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDBC26D1583E001C950E /* EtcBlock4x4Encoding_R11.cpp */; }; - 706EF1A126D166C5001C950E /* EtcBlock4x4Encoding_ETC1.cpp in 
Sources */ = {isa = PBXBuildFile; fileRef = 706EEDBF26D1583E001C950E /* EtcBlock4x4Encoding_ETC1.cpp */; }; - 706EF1A226D166C5001C950E /* EtcBlock4x4Encoding.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDC526D1583E001C950E /* EtcBlock4x4Encoding.cpp */; }; - 706EF1A326D166C5001C950E /* EtcBlock4x4.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDC626D1583E001C950E /* EtcBlock4x4.cpp */; }; - 706EF1BF26D166C5001C950E /* basisu_transcoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE0426D1583F001C950E /* basisu_transcoder.cpp */; }; - 706EF1C026D166C5001C950E /* miniz.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1126D1583F001C950E /* miniz.cpp */; }; - 706EF1C126D166C5001C950E /* hedistance.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1426D1583F001C950E /* hedistance.cpp */; }; - 706EF1C226D166C5001C950E /* KramTimer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1A26D1583F001C950E /* KramTimer.cpp */; }; - 706EF1C326D166C5001C950E /* KTXImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1B26D1583F001C950E /* KTXImage.cpp */; }; - 706EF1C426D166C5001C950E /* KramMipper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1C26D1583F001C950E /* KramMipper.cpp */; }; - 706EF1C526D166C5001C950E /* KramZipHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1E26D1583F001C950E /* KramZipHelper.cpp */; }; - 706EF1C626D166C5001C950E /* TaskSystem.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1F26D1583F001C950E /* TaskSystem.cpp */; }; - 706EF1C726D166C5001C950E /* KramFileHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2126D1583F001C950E /* KramFileHelper.cpp */; }; - 706EF1C826D166C5001C950E /* KramImageInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2526D1583F001C950E /* KramImageInfo.cpp */; }; - 706EF1C926D166C5001C950E /* KramImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2626D1583F001C950E /* KramImage.cpp */; }; - 706EF1CA26D166C5001C950E /* KramLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2826D1583F001C950E /* KramLog.cpp */; }; - 706EF1CB26D166C5001C950E /* KramSDFMipper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2B26D1583F001C950E /* KramSDFMipper.cpp */; }; - 706EF1CC26D166C5001C950E /* KramMmapHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2C26D1583F001C950E /* KramMmapHelper.cpp */; }; - 706EF1CD26D166C5001C950E /* float4a.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE2F26D1583F001C950E /* float4a.cpp */; }; - 706EF1CE26D166C5001C950E /* Kram.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE3526D1583F001C950E /* Kram.cpp */; }; - 706EF1CF26D166C5001C950E /* squish.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE3D26D1583F001C950E /* squish.cpp */; }; - 706EF1D026D166C5001C950E /* colourset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE3E26D1583F001C950E /* colourset.cpp */; }; - 706EF1D126D166C5001C950E /* clusterfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4226D1583F001C950E /* clusterfit.cpp */; }; - 706EF1D226D166C5001C950E /* rangefit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4426D1583F001C950E /* rangefit.cpp */; }; - 706EF1D326D166C5001C950E /* alpha.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4626D1583F001C950E /* alpha.cpp */; }; - 706EF1D426D166C5001C950E /* colourblock.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4726D1583F001C950E /* colourblock.cpp */; }; - 706EF1D526D166C5001C950E /* 
colourfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4926D1583F001C950E /* colourfit.cpp */; }; - 706EF1D626D166C5001C950E /* maths.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4D26D1583F001C950E /* maths.cpp */; }; - 706EF1D726D166C5001C950E /* singlecolourfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE4E26D1583F001C950E /* singlecolourfit.cpp */; }; - 706EF1D826D166C5001C950E /* zstd.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE5026D1583F001C950E /* zstd.cpp */; }; - 706EF1D926D166C5001C950E /* zstddeclib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE5126D1583F001C950E /* zstddeclib.cpp */; }; - 706EF1DA26D166C5001C950E /* lodepng.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE5626D1583F001C950E /* lodepng.cpp */; }; - 706EF1DB26D166C5001C950E /* tmpfileplus.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE5826D1583F001C950E /* tmpfileplus.cpp */; }; - 706EF1DD26D166C5001C950E /* libate.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 706EF12A26D159F9001C950E /* libate.tbd */; }; 706EF26426D17DCC001C950E /* ateencoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDF926D1583E001C950E /* ateencoder.mm */; }; - 706EFC2426D1C39B001C950E /* ateencoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDF926D1583E001C950E /* ateencoder.mm */; }; 706EFF7326D34740001C950E /* thread_support.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5726D3473F001C950E /* thread_support.cpp */; }; - 706EFF7426D34740001C950E /* thread_support.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5726D3473F001C950E /* thread_support.cpp */; }; 706EFF7526D34740001C950E /* assert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5826D3473F001C950E /* assert.cpp */; }; - 706EFF7626D34740001C950E /* assert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5826D3473F001C950E /* assert.cpp */; }; 706EFF7726D34740001C950E /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5926D3473F001C950E /* string.cpp */; }; - 706EFF7826D34740001C950E /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5926D3473F001C950E /* string.cpp */; }; - 706EFF7A26D34740001C950E /* allocator_eastl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5A26D3473F001C950E /* allocator_eastl.cpp */; }; 706EFF7B26D34740001C950E /* numeric_limits.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5B26D3473F001C950E /* numeric_limits.cpp */; }; - 706EFF7C26D34740001C950E /* numeric_limits.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5B26D3473F001C950E /* numeric_limits.cpp */; }; 706EFF7F26D34740001C950E /* intrusive_list.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5D26D3473F001C950E /* intrusive_list.cpp */; }; - 706EFF8026D34740001C950E /* intrusive_list.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5D26D3473F001C950E /* intrusive_list.cpp */; }; 706EFF8126D34740001C950E /* hashtable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5E26D3473F001C950E /* hashtable.cpp */; }; - 706EFF8226D34740001C950E /* hashtable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5E26D3473F001C950E /* hashtable.cpp */; }; 706EFF8326D34740001C950E /* red_black_tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5F26D3473F001C950E /* red_black_tree.cpp */; }; - 706EFF8426D34740001C950E /* red_black_tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5F26D3473F001C950E /* red_black_tree.cpp */; }; 706EFF8526D34740001C950E /* 
fixed_pool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD6026D3473F001C950E /* fixed_pool.cpp */; }; - 706EFF8626D34740001C950E /* fixed_pool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD6026D3473F001C950E /* fixed_pool.cpp */; }; 707789D52881BA81008A51BC /* bc7enc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789C62881BA81008A51BC /* bc7enc.cpp */; }; - 707789D62881BA81008A51BC /* bc7enc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789C62881BA81008A51BC /* bc7enc.cpp */; }; 707789D72881BA81008A51BC /* bc7enc.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C72881BA81008A51BC /* bc7enc.h */; }; - 707789D82881BA81008A51BC /* bc7enc.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C72881BA81008A51BC /* bc7enc.h */; }; 707789D92881BA81008A51BC /* bc7decomp.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C82881BA81008A51BC /* bc7decomp.h */; }; - 707789DA2881BA81008A51BC /* bc7decomp.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C82881BA81008A51BC /* bc7decomp.h */; }; 707789DB2881BA81008A51BC /* ert.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C92881BA81008A51BC /* ert.h */; }; - 707789DC2881BA81008A51BC /* ert.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C92881BA81008A51BC /* ert.h */; }; 707789DD2881BA81008A51BC /* rgbcx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CA2881BA81008A51BC /* rgbcx.cpp */; }; - 707789DE2881BA81008A51BC /* rgbcx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CA2881BA81008A51BC /* rgbcx.cpp */; }; 707789DF2881BA81008A51BC /* rgbcx_table4.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CB2881BA81008A51BC /* rgbcx_table4.h */; }; - 707789E02881BA81008A51BC /* rgbcx_table4.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CB2881BA81008A51BC /* rgbcx_table4.h */; }; 707789E12881BA81008A51BC /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CC2881BA81008A51BC /* utils.cpp */; }; - 707789E22881BA81008A51BC /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CC2881BA81008A51BC /* utils.cpp */; }; 707789E32881BA81008A51BC /* rgbcx_table4_small.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CD2881BA81008A51BC /* rgbcx_table4_small.h */; }; - 707789E42881BA81008A51BC /* rgbcx_table4_small.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CD2881BA81008A51BC /* rgbcx_table4_small.h */; }; 707789E52881BA81008A51BC /* ert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CE2881BA81008A51BC /* ert.cpp */; }; - 707789E62881BA81008A51BC /* ert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CE2881BA81008A51BC /* ert.cpp */; }; 707789E72881BA81008A51BC /* rgbcx.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CF2881BA81008A51BC /* rgbcx.h */; }; - 707789E82881BA81008A51BC /* rgbcx.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CF2881BA81008A51BC /* rgbcx.h */; }; 707789E92881BA81008A51BC /* bc7decomp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D02881BA81008A51BC /* bc7decomp.cpp */; }; - 707789EA2881BA81008A51BC /* bc7decomp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D02881BA81008A51BC /* bc7decomp.cpp */; }; 707789EB2881BA81008A51BC /* utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789D22881BA81008A51BC /* utils.h */; }; - 707789EC2881BA81008A51BC /* utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789D22881BA81008A51BC /* utils.h */; }; 707789ED2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 
707789D32881BA81008A51BC /* bc7decomp_ref.cpp */; }; - 707789EE2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D32881BA81008A51BC /* bc7decomp_ref.cpp */; }; 707789F12881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */; }; - 707789F22881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */; }; 707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */; }; - 707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */; }; + 707B2AB42D99BF7A00DD3F0B /* KramThreadPool.h in Headers */ = {isa = PBXBuildFile; fileRef = 707B2AB22D99BF7A00DD3F0B /* KramThreadPool.h */; }; + 707B2AB52D99BF7A00DD3F0B /* KramThreadPool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707B2AB32D99BF7A00DD3F0B /* KramThreadPool.cpp */; }; 70871DC927DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DA727DDDBCC00D0B9E1 /* astcenc_vecmathlib_common_4.h */; }; - 70871DCA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DA727DDDBCC00D0B9E1 /* astcenc_vecmathlib_common_4.h */; }; 70871DCB27DDDBCD00D0B9E1 /* astcenc_image.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DA827DDDBCC00D0B9E1 /* astcenc_image.cpp */; }; - 70871DCC27DDDBCD00D0B9E1 /* astcenc_image.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DA827DDDBCC00D0B9E1 /* astcenc_image.cpp */; }; 70871DCD27DDDBCD00D0B9E1 /* astcenc_find_best_partitioning.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DA927DDDBCC00D0B9E1 /* astcenc_find_best_partitioning.cpp */; }; - 70871DCE27DDDBCD00D0B9E1 /* astcenc_find_best_partitioning.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DA927DDDBCC00D0B9E1 /* astcenc_find_best_partitioning.cpp */; }; 70871DCF27DDDBCD00D0B9E1 /* astcenc_symbolic_physical.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAA27DDDBCC00D0B9E1 /* astcenc_symbolic_physical.cpp */; }; - 70871DD027DDDBCD00D0B9E1 /* astcenc_symbolic_physical.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAA27DDDBCC00D0B9E1 /* astcenc_symbolic_physical.cpp */; }; 70871DD127DDDBCD00D0B9E1 /* astcenc_averages_and_directions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAB27DDDBCC00D0B9E1 /* astcenc_averages_and_directions.cpp */; }; - 70871DD227DDDBCD00D0B9E1 /* astcenc_averages_and_directions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAB27DDDBCC00D0B9E1 /* astcenc_averages_and_directions.cpp */; }; 70871DD327DDDBCD00D0B9E1 /* astcenc_partition_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAC27DDDBCC00D0B9E1 /* astcenc_partition_tables.cpp */; }; - 70871DD427DDDBCD00D0B9E1 /* astcenc_partition_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAC27DDDBCC00D0B9E1 /* astcenc_partition_tables.cpp */; }; 70871DD527DDDBCD00D0B9E1 /* astcenc.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DAD27DDDBCC00D0B9E1 /* astcenc.h */; }; - 70871DD627DDDBCD00D0B9E1 /* astcenc.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DAD27DDDBCC00D0B9E1 /* astcenc.h */; }; 70871DD727DDDBCD00D0B9E1 /* astcenc_quantization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAE27DDDBCC00D0B9E1 /* astcenc_quantization.cpp */; }; - 
70871DD827DDDBCD00D0B9E1 /* astcenc_quantization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAE27DDDBCC00D0B9E1 /* astcenc_quantization.cpp */; }; 70871DD927DDDBCD00D0B9E1 /* astcenc_compute_variance.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAF27DDDBCC00D0B9E1 /* astcenc_compute_variance.cpp */; }; - 70871DDA27DDDBCD00D0B9E1 /* astcenc_compute_variance.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DAF27DDDBCC00D0B9E1 /* astcenc_compute_variance.cpp */; }; 70871DDB27DDDBCD00D0B9E1 /* astcenc_percentile_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB027DDDBCC00D0B9E1 /* astcenc_percentile_tables.cpp */; }; - 70871DDC27DDDBCD00D0B9E1 /* astcenc_percentile_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB027DDDBCC00D0B9E1 /* astcenc_percentile_tables.cpp */; }; 70871DDD27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DB127DDDBCC00D0B9E1 /* astcenc_vecmathlib_sse_4.h */; }; - 70871DDE27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DB127DDDBCC00D0B9E1 /* astcenc_vecmathlib_sse_4.h */; }; 70871DDF27DDDBCD00D0B9E1 /* astcenc_mathlib_softfloat.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB227DDDBCC00D0B9E1 /* astcenc_mathlib_softfloat.cpp */; }; - 70871DE027DDDBCD00D0B9E1 /* astcenc_mathlib_softfloat.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB227DDDBCC00D0B9E1 /* astcenc_mathlib_softfloat.cpp */; }; - 70871DE127DDDBCD00D0B9E1 /* astcenc_mathlib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB327DDDBCC00D0B9E1 /* astcenc_mathlib.cpp */; }; - 70871DE227DDDBCD00D0B9E1 /* astcenc_mathlib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB327DDDBCC00D0B9E1 /* astcenc_mathlib.cpp */; }; 70871DE327DDDBCD00D0B9E1 /* astcenc_decompress_symbolic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB427DDDBCC00D0B9E1 /* astcenc_decompress_symbolic.cpp */; }; - 70871DE427DDDBCD00D0B9E1 /* astcenc_decompress_symbolic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB427DDDBCC00D0B9E1 /* astcenc_decompress_symbolic.cpp */; }; 70871DE527DDDBCD00D0B9E1 /* astcenc_compress_symbolic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB527DDDBCC00D0B9E1 /* astcenc_compress_symbolic.cpp */; }; - 70871DE627DDDBCD00D0B9E1 /* astcenc_compress_symbolic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB527DDDBCC00D0B9E1 /* astcenc_compress_symbolic.cpp */; }; 70871DE727DDDBCD00D0B9E1 /* astcenc_entry.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB627DDDBCC00D0B9E1 /* astcenc_entry.cpp */; }; - 70871DE827DDDBCD00D0B9E1 /* astcenc_entry.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB627DDDBCC00D0B9E1 /* astcenc_entry.cpp */; }; 70871DE927DDDBCD00D0B9E1 /* astcenc_integer_sequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB727DDDBCC00D0B9E1 /* astcenc_integer_sequence.cpp */; }; - 70871DEA27DDDBCD00D0B9E1 /* astcenc_integer_sequence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB727DDDBCC00D0B9E1 /* astcenc_integer_sequence.cpp */; }; 70871DEB27DDDBCD00D0B9E1 /* astcenc_block_sizes.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB827DDDBCC00D0B9E1 /* astcenc_block_sizes.cpp */; }; - 70871DEC27DDDBCD00D0B9E1 /* astcenc_block_sizes.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DB827DDDBCC00D0B9E1 /* astcenc_block_sizes.cpp */; }; 70871DED27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */ = {isa = PBXBuildFile; 
fileRef = 70871DB927DDDBCC00D0B9E1 /* astcenc_diagnostic_trace.h */; }; - 70871DEE27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DB927DDDBCC00D0B9E1 /* astcenc_diagnostic_trace.h */; }; 70871DEF27DDDBCD00D0B9E1 /* astcenc_weight_align.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DBA27DDDBCC00D0B9E1 /* astcenc_weight_align.cpp */; }; - 70871DF027DDDBCD00D0B9E1 /* astcenc_weight_align.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DBA27DDDBCC00D0B9E1 /* astcenc_weight_align.cpp */; }; 70871DF127DDDBCD00D0B9E1 /* astcenc_mathlib.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBB27DDDBCC00D0B9E1 /* astcenc_mathlib.h */; }; - 70871DF227DDDBCD00D0B9E1 /* astcenc_mathlib.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBB27DDDBCC00D0B9E1 /* astcenc_mathlib.h */; }; 70871DF327DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBC27DDDBCC00D0B9E1 /* astcenc_internal.h */; }; - 70871DF427DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBC27DDDBCC00D0B9E1 /* astcenc_internal.h */; }; 70871DF527DDDBCD00D0B9E1 /* astcenc_color_quantize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DBD27DDDBCC00D0B9E1 /* astcenc_color_quantize.cpp */; }; - 70871DF627DDDBCD00D0B9E1 /* astcenc_color_quantize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DBD27DDDBCC00D0B9E1 /* astcenc_color_quantize.cpp */; }; 70871DF727DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBE27DDDBCC00D0B9E1 /* astcenc_vecmathlib_neon_4.h */; }; - 70871DF827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBE27DDDBCC00D0B9E1 /* astcenc_vecmathlib_neon_4.h */; }; 70871DF927DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBF27DDDBCC00D0B9E1 /* astcenc_vecmathlib_avx2_8.h */; }; - 70871DFA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DBF27DDDBCC00D0B9E1 /* astcenc_vecmathlib_avx2_8.h */; }; 70871DFB27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DC027DDDBCC00D0B9E1 /* astcenc_vecmathlib_none_4.h */; }; - 70871DFC27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DC027DDDBCC00D0B9E1 /* astcenc_vecmathlib_none_4.h */; }; 70871DFD27DDDBCD00D0B9E1 /* astcenc_vecmathlib.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DC127DDDBCC00D0B9E1 /* astcenc_vecmathlib.h */; }; - 70871DFE27DDDBCD00D0B9E1 /* astcenc_vecmathlib.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DC127DDDBCC00D0B9E1 /* astcenc_vecmathlib.h */; }; 70871DFF27DDDBCD00D0B9E1 /* astcenc_pick_best_endpoint_format.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC227DDDBCC00D0B9E1 /* astcenc_pick_best_endpoint_format.cpp */; }; - 70871E0027DDDBCD00D0B9E1 /* astcenc_pick_best_endpoint_format.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC227DDDBCC00D0B9E1 /* astcenc_pick_best_endpoint_format.cpp */; }; 70871E0127DDDBCD00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC327DDDBCC00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp */; }; - 70871E0227DDDBCD00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC327DDDBCC00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp */; }; 
70871E0327DDDBCD00D0B9E1 /* astcenc_color_unquantize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC427DDDBCC00D0B9E1 /* astcenc_color_unquantize.cpp */; }; - 70871E0427DDDBCD00D0B9E1 /* astcenc_color_unquantize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC427DDDBCC00D0B9E1 /* astcenc_color_unquantize.cpp */; }; 70871E0527DDDBCD00D0B9E1 /* astcenc_platform_isa_detection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC527DDDBCC00D0B9E1 /* astcenc_platform_isa_detection.cpp */; }; - 70871E0627DDDBCD00D0B9E1 /* astcenc_platform_isa_detection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC527DDDBCC00D0B9E1 /* astcenc_platform_isa_detection.cpp */; }; 70871E0727DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC627DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp */; }; - 70871E0827DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC627DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp */; }; 70871E0927DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC727DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp */; }; - 70871E0A27DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC727DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp */; }; 708A6A962708CE4700BA5410 /* bc6h_decode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 708A6A8B2708CE4700BA5410 /* bc6h_decode.cpp */; }; - 708A6A972708CE4700BA5410 /* bc6h_decode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 708A6A8B2708CE4700BA5410 /* bc6h_decode.cpp */; }; 708A6A982708CE4700BA5410 /* bc6h_decode.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A8C2708CE4700BA5410 /* bc6h_decode.h */; }; - 708A6A992708CE4700BA5410 /* bc6h_decode.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A8C2708CE4700BA5410 /* bc6h_decode.h */; }; 708A6A9A2708CE4700BA5410 /* bc6h_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 708A6A8D2708CE4700BA5410 /* bc6h_encode.cpp */; }; - 708A6A9B2708CE4700BA5410 /* bc6h_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 708A6A8D2708CE4700BA5410 /* bc6h_encode.cpp */; }; 708A6A9C2708CE4700BA5410 /* bc6h_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A8E2708CE4700BA5410 /* bc6h_encode.h */; }; - 708A6A9D2708CE4700BA5410 /* bc6h_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A8E2708CE4700BA5410 /* bc6h_encode.h */; }; 708A6AA02708CE4700BA5410 /* bc6h_definitions.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A902708CE4700BA5410 /* bc6h_definitions.h */; }; - 708A6AA12708CE4700BA5410 /* bc6h_definitions.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A902708CE4700BA5410 /* bc6h_definitions.h */; }; 708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A922708CE4700BA5410 /* bc6h_utils.h */; }; - 708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A922708CE4700BA5410 /* bc6h_utils.h */; }; 709B8D2D28D7BCAD0081BD1F /* ostream.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1C28D7BCAD0081BD1F /* ostream.h */; }; - 709B8D2E28D7BCAD0081BD1F /* ostream.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1C28D7BCAD0081BD1F /* ostream.h */; }; 709B8D2F28D7BCAD0081BD1F /* format-inl.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1D28D7BCAD0081BD1F /* format-inl.h */; }; - 709B8D3028D7BCAD0081BD1F /* 
format-inl.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1D28D7BCAD0081BD1F /* format-inl.h */; }; 709B8D3128D7BCAD0081BD1F /* ranges.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1E28D7BCAD0081BD1F /* ranges.h */; }; - 709B8D3228D7BCAD0081BD1F /* ranges.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1E28D7BCAD0081BD1F /* ranges.h */; }; 709B8D3328D7BCAD0081BD1F /* xchar.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1F28D7BCAD0081BD1F /* xchar.h */; }; - 709B8D3428D7BCAD0081BD1F /* xchar.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1F28D7BCAD0081BD1F /* xchar.h */; }; 709B8D3528D7BCAD0081BD1F /* core.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2028D7BCAD0081BD1F /* core.h */; }; - 709B8D3628D7BCAD0081BD1F /* core.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2028D7BCAD0081BD1F /* core.h */; }; 709B8D3728D7BCAD0081BD1F /* os.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2128D7BCAD0081BD1F /* os.cpp */; }; - 709B8D3828D7BCAD0081BD1F /* os.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2128D7BCAD0081BD1F /* os.cpp */; }; 709B8D3928D7BCAD0081BD1F /* format.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2228D7BCAD0081BD1F /* format.cpp */; }; - 709B8D3A28D7BCAD0081BD1F /* format.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2228D7BCAD0081BD1F /* format.cpp */; }; 709B8D3D28D7BCAD0081BD1F /* chrono.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2428D7BCAD0081BD1F /* chrono.h */; }; - 709B8D3E28D7BCAD0081BD1F /* chrono.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2428D7BCAD0081BD1F /* chrono.h */; }; 709B8D3F28D7BCAD0081BD1F /* os.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2528D7BCAD0081BD1F /* os.h */; }; - 709B8D4028D7BCAD0081BD1F /* os.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2528D7BCAD0081BD1F /* os.h */; }; 709B8D4128D7BCAD0081BD1F /* color.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2628D7BCAD0081BD1F /* color.h */; }; - 709B8D4228D7BCAD0081BD1F /* color.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2628D7BCAD0081BD1F /* color.h */; }; 709B8D4328D7BCAD0081BD1F /* args.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2728D7BCAD0081BD1F /* args.h */; }; - 709B8D4428D7BCAD0081BD1F /* args.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2728D7BCAD0081BD1F /* args.h */; }; 709B8D4528D7BCAD0081BD1F /* printf.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2828D7BCAD0081BD1F /* printf.h */; }; - 709B8D4628D7BCAD0081BD1F /* printf.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2828D7BCAD0081BD1F /* printf.h */; }; 709B8D4728D7BCAD0081BD1F /* compile.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2928D7BCAD0081BD1F /* compile.h */; }; - 709B8D4828D7BCAD0081BD1F /* compile.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2928D7BCAD0081BD1F /* compile.h */; }; 709B8D4928D7BCAD0081BD1F /* format.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2B28D7BCAD0081BD1F /* format.h */; }; - 709B8D4A28D7BCAD0081BD1F /* format.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2B28D7BCAD0081BD1F /* format.h */; }; 709B8D4B28D7BCAD0081BD1F /* std.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2C28D7BCAD0081BD1F /* std.h */; }; - 709B8D4C28D7BCAD0081BD1F /* std.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2C28D7BCAD0081BD1F /* std.h */; }; 709B8D4F28D7C15F0081BD1F /* KramFmt.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D4D28D7C15F0081BD1F /* KramFmt.h */; }; - 
709B8D5028D7C15F0081BD1F /* KramFmt.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D4D28D7C15F0081BD1F /* KramFmt.h */; }; 70A7BD3027092A1200DBCCF7 /* hdr_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */; }; - 70A7BD3127092A1200DBCCF7 /* hdr_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */; }; 70A7BD3227092A1200DBCCF7 /* hdr_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */; }; - 70A7BD3327092A1200DBCCF7 /* hdr_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */; }; - 70C6398D289FB234006E7422 /* KramPrefix.pch in Headers */ = {isa = PBXBuildFile; fileRef = 70C6398C289FB234006E7422 /* KramPrefix.pch */; }; - 70C6398E289FB234006E7422 /* KramPrefix.pch in Headers */ = {isa = PBXBuildFile; fileRef = 70C6398C289FB234006E7422 /* KramPrefix.pch */; }; + 70B563A72C857B360089A64F /* KramZipStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B563A52C857B360089A64F /* KramZipStream.cpp */; }; + 70B563A92C857B360089A64F /* KramZipStream.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B563A62C857B360089A64F /* KramZipStream.h */; }; 70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; }; - 70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; }; 70CDB65227A1382700A546C1 /* KramDDSHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */; }; - 70CDB65327A1382700A546C1 /* KramDDSHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */; }; 70D222D82AC800AC00B9EA23 /* json11.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222D62AC800AC00B9EA23 /* json11.h */; }; - 70D222D92AC800AC00B9EA23 /* json11.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222D62AC800AC00B9EA23 /* json11.h */; }; 70D222DA2AC800AC00B9EA23 /* json11.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222D72AC800AC00B9EA23 /* json11.cpp */; }; - 70D222DB2AC800AC00B9EA23 /* json11.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222D72AC800AC00B9EA23 /* json11.cpp */; }; 70D222DE2AD2132300B9EA23 /* ImmutableString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222DC2AD2132300B9EA23 /* ImmutableString.cpp */; }; - 70D222DF2AD2132300B9EA23 /* ImmutableString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222DC2AD2132300B9EA23 /* ImmutableString.cpp */; }; 70D222E02AD2132300B9EA23 /* ImmutableString.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222DD2AD2132300B9EA23 /* ImmutableString.h */; }; - 70D222E12AD2132300B9EA23 /* ImmutableString.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222DD2AD2132300B9EA23 /* ImmutableString.h */; }; 70D222E42AD22BED00B9EA23 /* BlockedLinearAllocator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222E22AD22BED00B9EA23 /* BlockedLinearAllocator.cpp */; }; - 70D222E52AD22BED00B9EA23 /* BlockedLinearAllocator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222E22AD22BED00B9EA23 /* BlockedLinearAllocator.cpp */; }; 70D222E62AD22BED00B9EA23 /* BlockedLinearAllocator.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222E32AD22BED00B9EA23 /* BlockedLinearAllocator.h */; }; - 70D222E72AD22BED00B9EA23 /* BlockedLinearAllocator.h in Headers */ = {isa = PBXBuildFile; fileRef = 
70D222E32AD22BED00B9EA23 /* BlockedLinearAllocator.h */; }; 70D222EB2ADAF25E00B9EA23 /* simdjson.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222E92ADAF25E00B9EA23 /* simdjson.h */; }; - 70D222EC2ADAF25E00B9EA23 /* simdjson.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222E92ADAF25E00B9EA23 /* simdjson.h */; }; 70D222ED2ADAF25E00B9EA23 /* simdjson.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222EA2ADAF25E00B9EA23 /* simdjson.cpp */; }; - 70D222EE2ADAF25E00B9EA23 /* simdjson.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222EA2ADAF25E00B9EA23 /* simdjson.cpp */; }; 70D222F52ADAF78300B9EA23 /* dlmalloc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222F42ADAF78300B9EA23 /* dlmalloc.cpp */; }; - 70D222F62ADAF78300B9EA23 /* dlmalloc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70D222F42ADAF78300B9EA23 /* dlmalloc.cpp */; }; 70D222F82ADAFA1500B9EA23 /* dlmalloc.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222F72ADAFA1500B9EA23 /* dlmalloc.h */; }; - 70D222F92ADAFA1500B9EA23 /* dlmalloc.h in Headers */ = {isa = PBXBuildFile; fileRef = 70D222F72ADAFA1500B9EA23 /* dlmalloc.h */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - 704738B1289F6AEE00C77A9F /* unordered_map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unordered_map.h; sourceTree = ""; }; - 704738B2289F6AEE00C77A9F /* falgorithm.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = falgorithm.h; sourceTree = ""; }; - 704738B3289F6AEE00C77A9F /* map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = map.h; sourceTree = ""; }; - 704738B4289F6AEE00C77A9F /* pair.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pair.h; sourceTree = ""; }; - 704738B6289F6AEE00C77A9F /* unordered_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unordered_set.h; sourceTree = ""; }; - 704738B7289F6AEE00C77A9F /* vector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vector.h; sourceTree = ""; }; - 704738B8289F6AEE00C77A9F /* set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = set.h; sourceTree = ""; }; - 704738B9289F6AEE00C77A9F /* fstring.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fstring.h; sourceTree = ""; }; + 706178172DE16211001545E1 /* KramFileIO.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramFileIO.h; sourceTree = ""; }; + 706178182DE16211001545E1 /* KramFileIO.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramFileIO.cpp; sourceTree = ""; }; 706ECDDE26D1577A001C950E /* libkram.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkram.a; sourceTree = BUILT_PRODUCTS_DIR; }; 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EtcBlock4x4Encoding_RGB8.cpp; sourceTree = ""; }; 706EEDAB26D1583E001C950E /* EtcErrorMetric.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = EtcErrorMetric.h; sourceTree = ""; }; @@ -491,12 +273,10 @@ 706EEE1A26D1583F001C950E /* KramTimer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramTimer.cpp; sourceTree = ""; }; 706EEE1B26D1583F001C950E /* KTXImage.cpp */ = 
{isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KTXImage.cpp; sourceTree = ""; }; 706EEE1C26D1583F001C950E /* KramMipper.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramMipper.cpp; sourceTree = ""; }; - 706EEE1D26D1583F001C950E /* _clang-format */ = {isa = PBXFileReference; lastKnownFileType = text; path = "_clang-format"; sourceTree = ""; }; 706EEE1E26D1583F001C950E /* KramZipHelper.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramZipHelper.cpp; sourceTree = ""; }; 706EEE1F26D1583F001C950E /* TaskSystem.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = TaskSystem.cpp; sourceTree = ""; }; 706EEE2026D1583F001C950E /* KramSDFMipper.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramSDFMipper.h; sourceTree = ""; }; 706EEE2126D1583F001C950E /* KramFileHelper.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramFileHelper.cpp; sourceTree = ""; }; - 706EEE2226D1583F001C950E /* sse2neon.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sse2neon.h; sourceTree = ""; }; 706EEE2326D1583F001C950E /* KramConfig.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramConfig.h; sourceTree = ""; }; 706EEE2426D1583F001C950E /* KramLog.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramLog.h; sourceTree = ""; }; 706EEE2526D1583F001C950E /* KramImageInfo.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramImageInfo.cpp; sourceTree = ""; }; @@ -509,12 +289,10 @@ 706EEE2C26D1583F001C950E /* KramMmapHelper.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramMmapHelper.cpp; sourceTree = ""; }; 706EEE2D26D1583F001C950E /* win_mmap.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = win_mmap.h; sourceTree = ""; }; 706EEE2E26D1583F001C950E /* Kram.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = Kram.h; sourceTree = ""; }; - 706EEE2F26D1583F001C950E /* float4a.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = float4a.cpp; sourceTree = ""; }; 706EEE3026D1583F001C950E /* KTXImage.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KTXImage.h; sourceTree = ""; }; 706EEE3126D1583F001C950E /* KramImageInfo.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramImageInfo.h; sourceTree = ""; }; 706EEE3226D1583F001C950E /* KramTimer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramTimer.h; sourceTree = ""; }; 706EEE3326D1583F001C950E /* KramMmapHelper.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramMmapHelper.h; sourceTree = ""; }; - 706EEE3426D1583F001C950E /* float4a.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = float4a.h; sourceTree = ""; }; 706EEE3526D1583F001C950E /* Kram.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = Kram.cpp; sourceTree = ""; }; 706EEE3626D1583F001C950E /* KramFileHelper.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramFileHelper.h; sourceTree = ""; }; 706EEE3726D1583F001C950E /* KramMipper.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramMipper.h; sourceTree = ""; }; @@ -547,7 +325,6 @@ 706EEE5826D1583F001C950E /* tmpfileplus.cpp */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.cpp.cpp; path = tmpfileplus.cpp; sourceTree = ""; }; 706EEE5926D1583F001C950E /* tmpfileplus.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tmpfileplus.h; sourceTree = ""; }; 706EF12A26D159F9001C950E /* libate.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libate.tbd; path = usr/lib/libate.tbd; sourceTree = SDKROOT; }; - 706EF1E126D166C5001C950E /* libkram-ios.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libkram-ios.a"; sourceTree = BUILT_PRODUCTS_DIR; }; 706EFC4126D3473F001C950E /* eaunits.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eaunits.h; sourceTree = ""; }; 706EFC4226D3473F001C950E /* version.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = version.h; sourceTree = ""; }; 706EFC4426D3473F001C950E /* eacompilertraits.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eacompilertraits.h; sourceTree = ""; }; @@ -702,6 +479,9 @@ 707789D42881BA81008A51BC /* LICENSE */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = LICENSE; sourceTree = ""; }; 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rdo_bc_encoder.cpp; sourceTree = ""; }; 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rdo_bc_encoder.h; sourceTree = ""; }; + 707B2AB22D99BF7A00DD3F0B /* KramThreadPool.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = KramThreadPool.h; sourceTree = ""; }; + 707B2AB32D99BF7A00DD3F0B /* KramThreadPool.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = KramThreadPool.cpp; sourceTree = ""; }; + 707D4C732CC436A000729BE0 /* kram.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = kram.xcconfig; sourceTree = ""; }; 70871DA727DDDBCC00D0B9E1 /* astcenc_vecmathlib_common_4.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = astcenc_vecmathlib_common_4.h; sourceTree = ""; }; 70871DA827DDDBCC00D0B9E1 /* astcenc_image.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_image.cpp; sourceTree = ""; }; 70871DA927DDDBCC00D0B9E1 /* astcenc_find_best_partitioning.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_find_best_partitioning.cpp; sourceTree = ""; }; @@ -760,7 +540,8 @@ 709B8D4D28D7C15F0081BD1F /* KramFmt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramFmt.h; sourceTree = ""; }; 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hdr_encode.cpp; sourceTree = ""; }; 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hdr_encode.h; sourceTree = ""; }; - 70C6398C289FB234006E7422 /* KramPrefix.pch */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramPrefix.pch; sourceTree = ""; }; + 70B563A52C857B360089A64F /* KramZipStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = 
KramZipStream.cpp; sourceTree = ""; }; + 70B563A62C857B360089A64F /* KramZipStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramZipStream.h; sourceTree = ""; }; 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramDDSHelper.h; sourceTree = ""; }; 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramDDSHelper.cpp; sourceTree = ""; }; 70D222D62AC800AC00B9EA23 /* json11.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = json11.h; sourceTree = ""; }; @@ -784,38 +565,16 @@ ); runOnlyForDeploymentPostprocessing = 0; }; - 706EF1DC26D166C5001C950E /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - 706EF1DD26D166C5001C950E /* libate.tbd in Frameworks */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ - 704738AF289F6AEE00C77A9F /* fastl */ = { - isa = PBXGroup; - children = ( - 704738B2289F6AEE00C77A9F /* falgorithm.h */, - 704738B3289F6AEE00C77A9F /* map.h */, - 704738B4289F6AEE00C77A9F /* pair.h */, - 704738B1289F6AEE00C77A9F /* unordered_map.h */, - 704738B6289F6AEE00C77A9F /* unordered_set.h */, - 704738B7289F6AEE00C77A9F /* vector.h */, - 704738B8289F6AEE00C77A9F /* set.h */, - 704738B9289F6AEE00C77A9F /* fstring.h */, - ); - path = fastl; - sourceTree = ""; - }; 706ECDD526D1577A001C950E = { isa = PBXGroup; children = ( 706EEDA826D1583E001C950E /* libkram */, 706ECDDF26D1577A001C950E /* Products */, 706EF12926D159F9001C950E /* Frameworks */, + 707D4C732CC436A000729BE0 /* kram.xcconfig */, ); sourceTree = ""; }; @@ -823,7 +582,6 @@ isa = PBXGroup; children = ( 706ECDDE26D1577A001C950E /* libkram.a */, - 706EF1E126D166C5001C950E /* libkram-ios.a */, ); name = Products; sourceTree = ""; @@ -834,7 +592,6 @@ 70D222F32ADAF78300B9EA23 /* allocate */, 708A6A882708CE4700BA5410 /* compressonator */, 706EFC3E26D3473F001C950E /* eastl */, - 704738AF289F6AEE00C77A9F /* fastl */, 709B8D1B28D7BCAD0081BD1F /* fmt */, 706EEDA926D1583E001C950E /* etc2comp */, 706EEDC926D1583E001C950E /* bc7enc */, @@ -1024,6 +781,8 @@ children = ( 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */, 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */, + 70B563A62C857B360089A64F /* KramZipStream.h */, + 70B563A52C857B360089A64F /* KramZipStream.cpp */, 706EEE1926D1583F001C950E /* KramZipHelper.h */, 706EEE1E26D1583F001C950E /* KramZipHelper.cpp */, 706EEE2326D1583F001C950E /* KramConfig.h */, @@ -1044,24 +803,23 @@ 706EEE1A26D1583F001C950E /* KramTimer.cpp */, 706EEE3326D1583F001C950E /* KramMmapHelper.h */, 706EEE2C26D1583F001C950E /* KramMmapHelper.cpp */, - 70C6398C289FB234006E7422 /* KramPrefix.pch */, 706EEE2E26D1583F001C950E /* Kram.h */, 706EEE3526D1583F001C950E /* Kram.cpp */, 706EEE3626D1583F001C950E /* KramFileHelper.h */, 706EEE2126D1583F001C950E /* KramFileHelper.cpp */, + 706178172DE16211001545E1 /* KramFileIO.h */, + 706178182DE16211001545E1 /* KramFileIO.cpp */, 706EEE3726D1583F001C950E /* KramMipper.h */, 706EEE1C26D1583F001C950E /* KramMipper.cpp */, 706EEE2D26D1583F001C950E /* win_mmap.h */, - 706EEE2226D1583F001C950E /* sse2neon.h */, - 706EEE3426D1583F001C950E /* float4a.h */, - 706EEE2F26D1583F001C950E /* float4a.cpp */, 70D222E32AD22BED00B9EA23 /* BlockedLinearAllocator.h */, 70D222E22AD22BED00B9EA23 /* 
BlockedLinearAllocator.cpp */, 70D222DD2AD2132300B9EA23 /* ImmutableString.h */, 70D222DC2AD2132300B9EA23 /* ImmutableString.cpp */, + 707B2AB22D99BF7A00DD3F0B /* KramThreadPool.h */, + 707B2AB32D99BF7A00DD3F0B /* KramThreadPool.cpp */, 706EEE3826D1583F001C950E /* TaskSystem.h */, 706EEE1F26D1583F001C950E /* TaskSystem.cpp */, - 706EEE1D26D1583F001C950E /* _clang-format */, ); path = kram; sourceTree = ""; @@ -1411,7 +1169,6 @@ files = ( 706EEFD126D15984001C950E /* EtcErrorMetric.h in Headers */, 706EEFD226D15984001C950E /* EtcColor.h in Headers */, - 70C6398D289FB234006E7422 /* KramPrefix.pch in Headers */, 709B8D3D28D7BCAD0081BD1F /* chrono.h in Headers */, 706EEFD326D15984001C950E /* EtcDifferentialTrys.h in Headers */, 706EEFD426D15984001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */, @@ -1422,7 +1179,6 @@ 706EEFD826D15984001C950E /* EtcMath.h in Headers */, 706EEFD926D15984001C950E /* EtcIndividualTrys.h in Headers */, 706EEFDA26D15984001C950E /* EtcBlock4x4EncodingBits.h in Headers */, - 704738BE289F6AEE00C77A9F /* falgorithm.h in Headers */, 706EEFDB26D15984001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */, 706EEFDC26D15984001C950E /* EtcBlock4x4.h in Headers */, 707789E72881BA81008A51BC /* rgbcx.h in Headers */, @@ -1430,6 +1186,7 @@ 707789D72881BA81008A51BC /* bc7enc.h in Headers */, 706EEFDE26D15984001C950E /* EtcImage.h in Headers */, 709B8D4B28D7BCAD0081BD1F /* std.h in Headers */, + 707B2AB42D99BF7A00DD3F0B /* KramThreadPool.h in Headers */, 70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */, 709B8D4328D7BCAD0081BD1F /* args.h in Headers */, 708A6A9C2708CE4700BA5410 /* bc6h_encode.h in Headers */, @@ -1448,7 +1205,6 @@ 706EEFF526D15985001C950E /* basisu_containers_impl.h in Headers */, 707789EB2881BA81008A51BC /* utils.h in Headers */, 706EEFF626D15985001C950E /* basisu_transcoder_internal.h in Headers */, - 704738C0289F6AEE00C77A9F /* map.h in Headers */, 70871DF927DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */, 70871DFB27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */, 706EEFF726D15985001C950E /* basisu_global_selector_cb.h in Headers */, @@ -1459,13 +1215,11 @@ 706EEFFB26D15985001C950E /* basisu_file_headers.h in Headers */, 706EEFFC26D15985001C950E /* miniz.h in Headers */, 706EEFFD26D15985001C950E /* hedistance.h in Headers */, - 704738BC289F6AEE00C77A9F /* unordered_map.h in Headers */, - 704738C2289F6AEE00C77A9F /* pair.h in Headers */, 706EEFFE26D15985001C950E /* stb_rect_pack.h in Headers */, 706EEFFF26D15985001C950E /* KramZipHelper.h in Headers */, 706EF00026D15985001C950E /* KramSDFMipper.h in Headers */, - 706EF00126D15985001C950E /* sse2neon.h in Headers */, 70D222E62AD22BED00B9EA23 /* BlockedLinearAllocator.h in Headers */, + 70B563A92C857B360089A64F /* KramZipStream.h in Headers */, 70871DF127DDDBCD00D0B9E1 /* astcenc_mathlib.h in Headers */, 709B8D3128D7BCAD0081BD1F /* ranges.h in Headers */, 706EF00226D15985001C950E /* KramConfig.h in Headers */, @@ -1479,7 +1233,6 @@ 707789D92881BA81008A51BC /* bc7decomp.h in Headers */, 706EF00826D15985001C950E /* Kram.h in Headers */, 70D222E02AD2132300B9EA23 /* ImmutableString.h in Headers */, - 704738C8289F6AEE00C77A9F /* vector.h in Headers */, 70D222EB2ADAF25E00B9EA23 /* simdjson.h in Headers */, 70871DED27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */, 707789DB2881BA81008A51BC /* ert.h in Headers */, @@ -1488,9 +1241,7 @@ 707789DF2881BA81008A51BC /* rgbcx_table4.h in Headers */, 70871DF727DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */, 
706EF00B26D15985001C950E /* KramTimer.h in Headers */, - 704738C6289F6AEE00C77A9F /* unordered_set.h in Headers */, 706EF00C26D15985001C950E /* KramMmapHelper.h in Headers */, - 706EF00D26D15985001C950E /* float4a.h in Headers */, 706EF00E26D15985001C950E /* KramFileHelper.h in Headers */, 709B8D3F28D7BCAD0081BD1F /* os.h in Headers */, 706EF00F26D15985001C950E /* KramMipper.h in Headers */, @@ -1506,7 +1257,6 @@ 706EF01526D15985001C950E /* singlecolourfit.h in Headers */, 706EF01626D15985001C950E /* maths.h in Headers */, 707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */, - 704738CA289F6AEE00C77A9F /* set.h in Headers */, 70D222D82AC800AC00B9EA23 /* json11.h in Headers */, 706EF01726D15985001C950E /* colourset.h in Headers */, 708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */, @@ -1515,136 +1265,16 @@ 706EF01A26D15985001C950E /* zstd.h in Headers */, 70871DF327DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */, 709B8D2F28D7BCAD0081BD1F /* format-inl.h in Headers */, - 704738CC289F6AEE00C77A9F /* fstring.h in Headers */, 709B8D2D28D7BCAD0081BD1F /* ostream.h in Headers */, 706EF01B26D15985001C950E /* lodepng.h in Headers */, 709B8D4928D7BCAD0081BD1F /* format.h in Headers */, + 706178192DE16211001545E1 /* KramFileIO.h in Headers */, 70D222F82ADAFA1500B9EA23 /* dlmalloc.h in Headers */, 706EF01C26D15985001C950E /* tmpfileplus.h in Headers */, 709B8D3328D7BCAD0081BD1F /* xchar.h in Headers */, ); runOnlyForDeploymentPostprocessing = 0; }; - 706EF14A26D166C5001C950E /* Headers */ = { - isa = PBXHeadersBuildPhase; - buildActionMask = 2147483647; - files = ( - 706EF14B26D166C5001C950E /* EtcErrorMetric.h in Headers */, - 706EF14C26D166C5001C950E /* EtcColor.h in Headers */, - 70C6398E289FB234006E7422 /* KramPrefix.pch in Headers */, - 709B8D3E28D7BCAD0081BD1F /* chrono.h in Headers */, - 706EF14D26D166C5001C950E /* EtcDifferentialTrys.h in Headers */, - 706EF14E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */, - 706EF14F26D166C5001C950E /* EtcConfig.h in Headers */, - 70871DCA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */, - 706EF15026D166C5001C950E /* EtcBlock4x4Encoding_R11.h in Headers */, - 706EF15126D166C5001C950E /* EtcBlock4x4Encoding_RG11.h in Headers */, - 706EF15226D166C5001C950E /* EtcMath.h in Headers */, - 706EF15326D166C5001C950E /* EtcIndividualTrys.h in Headers */, - 706EF15426D166C5001C950E /* EtcBlock4x4EncodingBits.h in Headers */, - 704738BF289F6AEE00C77A9F /* falgorithm.h in Headers */, - 706EF15526D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */, - 706EF15626D166C5001C950E /* EtcBlock4x4.h in Headers */, - 707789E82881BA81008A51BC /* rgbcx.h in Headers */, - 706EF15726D166C5001C950E /* Etc.h in Headers */, - 707789D82881BA81008A51BC /* bc7enc.h in Headers */, - 706EF15826D166C5001C950E /* EtcImage.h in Headers */, - 709B8D4C28D7BCAD0081BD1F /* std.h in Headers */, - 70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */, - 709B8D4428D7BCAD0081BD1F /* args.h in Headers */, - 708A6A9D2708CE4700BA5410 /* bc6h_encode.h in Headers */, - 706EF15926D166C5001C950E /* EtcBlock4x4Encoding_ETC1.h in Headers */, - 706EF15A26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */, - 706EF15B26D166C5001C950E /* EtcColorFloatRGBA.h in Headers */, - 706EF15C26D166C5001C950E /* EtcBlock4x4Encoding.h in Headers */, - 706EF16C26D166C5001C950E /* ateencoder.h in Headers */, - 706EF16D26D166C5001C950E /* basisu_transcoder.h in Headers */, - 70A7BD3327092A1200DBCCF7 /* hdr_encode.h in Headers */, - 709B8D4828D7BCAD0081BD1F /* 
compile.h in Headers */, - 708A6AA12708CE4700BA5410 /* bc6h_definitions.h in Headers */, - 706EF16E26D166C5001C950E /* basisu_containers.h in Headers */, - 70871DD627DDDBCD00D0B9E1 /* astcenc.h in Headers */, - 709B8D4628D7BCAD0081BD1F /* printf.h in Headers */, - 706EF16F26D166C5001C950E /* basisu_containers_impl.h in Headers */, - 707789EC2881BA81008A51BC /* utils.h in Headers */, - 706EF17026D166C5001C950E /* basisu_transcoder_internal.h in Headers */, - 704738C1289F6AEE00C77A9F /* map.h in Headers */, - 70871DFA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */, - 70871DFC27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */, - 706EF17126D166C5001C950E /* basisu_global_selector_cb.h in Headers */, - 706EF17226D166C5001C950E /* basisu_transcoder_uastc.h in Headers */, - 706EF17326D166C5001C950E /* basisu_global_selector_palette.h in Headers */, - 707789E42881BA81008A51BC /* rgbcx_table4_small.h in Headers */, - 706EF17426D166C5001C950E /* basisu.h in Headers */, - 706EF17526D166C5001C950E /* basisu_file_headers.h in Headers */, - 706EF17626D166C5001C950E /* miniz.h in Headers */, - 706EF17726D166C5001C950E /* hedistance.h in Headers */, - 704738BD289F6AEE00C77A9F /* unordered_map.h in Headers */, - 704738C3289F6AEE00C77A9F /* pair.h in Headers */, - 706EF17826D166C5001C950E /* stb_rect_pack.h in Headers */, - 706EF17926D166C5001C950E /* KramZipHelper.h in Headers */, - 706EF17A26D166C5001C950E /* KramSDFMipper.h in Headers */, - 706EF17B26D166C5001C950E /* sse2neon.h in Headers */, - 70D222E72AD22BED00B9EA23 /* BlockedLinearAllocator.h in Headers */, - 70871DF227DDDBCD00D0B9E1 /* astcenc_mathlib.h in Headers */, - 709B8D3228D7BCAD0081BD1F /* ranges.h in Headers */, - 706EF17C26D166C5001C950E /* KramConfig.h in Headers */, - 706EF17D26D166C5001C950E /* KramLog.h in Headers */, - 706EF17E26D166C5001C950E /* KramLib.h in Headers */, - 706EF17F26D166C5001C950E /* KramVersion.h in Headers */, - 706EF18026D166C5001C950E /* KramImage.h in Headers */, - 706EF18126D166C5001C950E /* win_mmap.h in Headers */, - 70871DDE27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, - 709B8D5028D7C15F0081BD1F /* KramFmt.h in Headers */, - 707789DA2881BA81008A51BC /* bc7decomp.h in Headers */, - 706EF18226D166C5001C950E /* Kram.h in Headers */, - 70D222E12AD2132300B9EA23 /* ImmutableString.h in Headers */, - 704738C9289F6AEE00C77A9F /* vector.h in Headers */, - 70D222EC2ADAF25E00B9EA23 /* simdjson.h in Headers */, - 70871DEE27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */, - 707789DC2881BA81008A51BC /* ert.h in Headers */, - 706EF18326D166C5001C950E /* KTXImage.h in Headers */, - 706EF18426D166C5001C950E /* KramImageInfo.h in Headers */, - 707789E02881BA81008A51BC /* rgbcx_table4.h in Headers */, - 70871DF827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */, - 706EF18526D166C5001C950E /* KramTimer.h in Headers */, - 704738C7289F6AEE00C77A9F /* unordered_set.h in Headers */, - 706EF18626D166C5001C950E /* KramMmapHelper.h in Headers */, - 706EF18726D166C5001C950E /* float4a.h in Headers */, - 706EF18826D166C5001C950E /* KramFileHelper.h in Headers */, - 709B8D4028D7BCAD0081BD1F /* os.h in Headers */, - 706EF18926D166C5001C950E /* KramMipper.h in Headers */, - 706EF18A26D166C5001C950E /* TaskSystem.h in Headers */, - 706EF18B26D166C5001C950E /* squish.h in Headers */, - 706EF18C26D166C5001C950E /* clusterfit.h in Headers */, - 709B8D3628D7BCAD0081BD1F /* core.h in Headers */, - 706EF18D26D166C5001C950E /* colourfit.h in Headers */, - 70871DFE27DDDBCD00D0B9E1 /* 
astcenc_vecmathlib.h in Headers */, - 706EF18E26D166C5001C950E /* alpha.h in Headers */, - 709B8D4228D7BCAD0081BD1F /* color.h in Headers */, - 708A6A992708CE4700BA5410 /* bc6h_decode.h in Headers */, - 706EF18F26D166C5001C950E /* singlecolourfit.h in Headers */, - 706EF19026D166C5001C950E /* maths.h in Headers */, - 707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */, - 704738CB289F6AEE00C77A9F /* set.h in Headers */, - 70D222D92AC800AC00B9EA23 /* json11.h in Headers */, - 706EF19126D166C5001C950E /* colourset.h in Headers */, - 708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */, - 706EF19226D166C5001C950E /* colourblock.h in Headers */, - 706EF19326D166C5001C950E /* rangefit.h in Headers */, - 706EF19426D166C5001C950E /* zstd.h in Headers */, - 70871DF427DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */, - 709B8D3028D7BCAD0081BD1F /* format-inl.h in Headers */, - 704738CD289F6AEE00C77A9F /* fstring.h in Headers */, - 709B8D2E28D7BCAD0081BD1F /* ostream.h in Headers */, - 706EF19526D166C5001C950E /* lodepng.h in Headers */, - 709B8D4A28D7BCAD0081BD1F /* format.h in Headers */, - 70D222F92ADAFA1500B9EA23 /* dlmalloc.h in Headers */, - 706EF19626D166C5001C950E /* tmpfileplus.h in Headers */, - 709B8D3428D7BCAD0081BD1F /* xchar.h in Headers */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; /* End PBXHeadersBuildPhase section */ /* Begin PBXNativeTarget section */ @@ -1665,23 +1295,6 @@ productReference = 706ECDDE26D1577A001C950E /* libkram.a */; productType = "com.apple.product-type.library.static"; }; - 706EF14926D166C5001C950E /* kram-ios */ = { - isa = PBXNativeTarget; - buildConfigurationList = 706EF1DE26D166C5001C950E /* Build configuration list for PBXNativeTarget "kram-ios" */; - buildPhases = ( - 706EF14A26D166C5001C950E /* Headers */, - 706EF19726D166C5001C950E /* Sources */, - 706EF1DC26D166C5001C950E /* Frameworks */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = "kram-ios"; - productName = kram; - productReference = 706EF1E126D166C5001C950E /* libkram-ios.a */; - productType = "com.apple.product-type.library.static"; - }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ @@ -1711,7 +1324,6 @@ projectRoot = ""; targets = ( 706ECDDD26D1577A001C950E /* kram */, - 706EF14926D166C5001C950E /* kram-ios */, ); }; /* End PBXProject section */ @@ -1721,6 +1333,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 702E0DB62CA10BC100B652B7 /* astcenc_mathlib.cpp in Sources */, 70871DD727DDDBCD00D0B9E1 /* astcenc_quantization.cpp in Sources */, 70D222F52ADAF78300B9EA23 /* dlmalloc.cpp in Sources */, 707789E52881BA81008A51BC /* ert.cpp in Sources */, @@ -1750,7 +1363,6 @@ 706EEF8A26D1595D001C950E /* EtcBlock4x4Encoding.cpp in Sources */, 706EEF8B26D1595D001C950E /* EtcBlock4x4.cpp in Sources */, 70871DDB27DDDBCD00D0B9E1 /* astcenc_percentile_tables.cpp in Sources */, - 70871DE127DDDBCD00D0B9E1 /* astcenc_mathlib.cpp in Sources */, 708A6A9A2708CE4700BA5410 /* bc6h_encode.cpp in Sources */, 70A7BD3027092A1200DBCCF7 /* hdr_encode.cpp in Sources */, 706EFF7726D34740001C950E /* string.cpp in Sources */, @@ -1784,12 +1396,13 @@ 706EEFB426D1595D001C950E /* KramMmapHelper.cpp in Sources */, 709B8D3928D7BCAD0081BD1F /* format.cpp in Sources */, 70D222DE2AD2132300B9EA23 /* ImmutableString.cpp in Sources */, + 7061781A2DE16211001545E1 /* KramFileIO.cpp in Sources */, 70871DCB27DDDBCD00D0B9E1 /* astcenc_image.cpp in Sources */, - 706EEFB526D1595D001C950E /* float4a.cpp in Sources */, 706EFF7326D34740001C950E /* thread_support.cpp in 
Sources */, 706EEFB626D1595D001C950E /* Kram.cpp in Sources */, 706EEFB726D1595D001C950E /* squish.cpp in Sources */, 706EEFB826D1595D001C950E /* colourset.cpp in Sources */, + 707B2AB52D99BF7A00DD3F0B /* KramThreadPool.cpp in Sources */, 70871DD327DDDBCD00D0B9E1 /* astcenc_partition_tables.cpp in Sources */, 709B8D3728D7BCAD0081BD1F /* os.cpp in Sources */, 706EFF8126D34740001C950E /* hashtable.cpp in Sources */, @@ -1801,6 +1414,7 @@ 706EEFBD26D1595D001C950E /* colourblock.cpp in Sources */, 706EEFBE26D1595E001C950E /* colourfit.cpp in Sources */, 70871DFF27DDDBCD00D0B9E1 /* astcenc_pick_best_endpoint_format.cpp in Sources */, + 70B563A72C857B360089A64F /* KramZipStream.cpp in Sources */, 70871E0927DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */, 70871DCF27DDDBCD00D0B9E1 /* astcenc_symbolic_physical.cpp in Sources */, 70D222DA2AC800AC00B9EA23 /* json11.cpp in Sources */, @@ -1815,112 +1429,15 @@ ); runOnlyForDeploymentPostprocessing = 0; }; - 706EF19726D166C5001C950E /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 70871DD827DDDBCD00D0B9E1 /* astcenc_quantization.cpp in Sources */, - 707789E62881BA81008A51BC /* ert.cpp in Sources */, - 70871E0427DDDBCD00D0B9E1 /* astcenc_color_unquantize.cpp in Sources */, - 70871DD227DDDBCD00D0B9E1 /* astcenc_averages_and_directions.cpp in Sources */, - 70871DE027DDDBCD00D0B9E1 /* astcenc_mathlib_softfloat.cpp in Sources */, - 709B8D3828D7BCAD0081BD1F /* os.cpp in Sources */, - 70D222EE2ADAF25E00B9EA23 /* simdjson.cpp in Sources */, - 706EFC2426D1C39B001C950E /* ateencoder.mm in Sources */, - 707789EE2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */, - 706EF19826D166C5001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */, - 70D222DB2AC800AC00B9EA23 /* json11.cpp in Sources */, - 70871DCE27DDDBCD00D0B9E1 /* astcenc_find_best_partitioning.cpp in Sources */, - 70CDB65327A1382700A546C1 /* KramDDSHelper.cpp in Sources */, - 706EF19926D166C5001C950E /* EtcImage.cpp in Sources */, - 70871DEC27DDDBCD00D0B9E1 /* astcenc_block_sizes.cpp in Sources */, - 706EF19A26D166C5001C950E /* EtcDifferentialTrys.cpp in Sources */, - 706EF19B26D166C5001C950E /* EtcMath.cpp in Sources */, - 706EF19C26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.cpp in Sources */, - 706EF19D26D166C5001C950E /* EtcBlock4x4Encoding_RG11.cpp in Sources */, - 706EF19E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.cpp in Sources */, - 706EF19F26D166C5001C950E /* EtcIndividualTrys.cpp in Sources */, - 707789DE2881BA81008A51BC /* rgbcx.cpp in Sources */, - 706EF1A026D166C5001C950E /* EtcBlock4x4Encoding_R11.cpp in Sources */, - 707789F22881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */, - 70871DF627DDDBCD00D0B9E1 /* astcenc_color_quantize.cpp in Sources */, - 706EF1A126D166C5001C950E /* EtcBlock4x4Encoding_ETC1.cpp in Sources */, - 706EF1A226D166C5001C950E /* EtcBlock4x4Encoding.cpp in Sources */, - 706EF1A326D166C5001C950E /* EtcBlock4x4.cpp in Sources */, - 70871DDC27DDDBCD00D0B9E1 /* astcenc_percentile_tables.cpp in Sources */, - 70871DE227DDDBCD00D0B9E1 /* astcenc_mathlib.cpp in Sources */, - 708A6A9B2708CE4700BA5410 /* bc6h_encode.cpp in Sources */, - 70A7BD3127092A1200DBCCF7 /* hdr_encode.cpp in Sources */, - 706EFF7826D34740001C950E /* string.cpp in Sources */, - 708A6A972708CE4700BA5410 /* bc6h_decode.cpp in Sources */, - 706EFF7626D34740001C950E /* assert.cpp in Sources */, - 706EFF8626D34740001C950E /* fixed_pool.cpp in Sources */, - 706EF1BF26D166C5001C950E /* basisu_transcoder.cpp in Sources */, - 
706EFF8426D34740001C950E /* red_black_tree.cpp in Sources */, - 70871DE427DDDBCD00D0B9E1 /* astcenc_decompress_symbolic.cpp in Sources */, - 70871E0827DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp in Sources */, - 70871E0627DDDBCD00D0B9E1 /* astcenc_platform_isa_detection.cpp in Sources */, - 707789D62881BA81008A51BC /* bc7enc.cpp in Sources */, - 70D222E52AD22BED00B9EA23 /* BlockedLinearAllocator.cpp in Sources */, - 706EFF8026D34740001C950E /* intrusive_list.cpp in Sources */, - 707789EA2881BA81008A51BC /* bc7decomp.cpp in Sources */, - 706EF1C026D166C5001C950E /* miniz.cpp in Sources */, - 70871DE627DDDBCD00D0B9E1 /* astcenc_compress_symbolic.cpp in Sources */, - 706EF1C126D166C5001C950E /* hedistance.cpp in Sources */, - 706EF1C226D166C5001C950E /* KramTimer.cpp in Sources */, - 70871DE827DDDBCD00D0B9E1 /* astcenc_entry.cpp in Sources */, - 706EF1C326D166C5001C950E /* KTXImage.cpp in Sources */, - 706EF1C426D166C5001C950E /* KramMipper.cpp in Sources */, - 706EF1C526D166C5001C950E /* KramZipHelper.cpp in Sources */, - 706EF1C626D166C5001C950E /* TaskSystem.cpp in Sources */, - 706EF1C726D166C5001C950E /* KramFileHelper.cpp in Sources */, - 706EFF7C26D34740001C950E /* numeric_limits.cpp in Sources */, - 706EF1C826D166C5001C950E /* KramImageInfo.cpp in Sources */, - 70871DEA27DDDBCD00D0B9E1 /* astcenc_integer_sequence.cpp in Sources */, - 706EF1C926D166C5001C950E /* KramImage.cpp in Sources */, - 706EF1CA26D166C5001C950E /* KramLog.cpp in Sources */, - 706EF1CB26D166C5001C950E /* KramSDFMipper.cpp in Sources */, - 706EF1CC26D166C5001C950E /* KramMmapHelper.cpp in Sources */, - 70D222DF2AD2132300B9EA23 /* ImmutableString.cpp in Sources */, - 70871DCC27DDDBCD00D0B9E1 /* astcenc_image.cpp in Sources */, - 706EF1CD26D166C5001C950E /* float4a.cpp in Sources */, - 706EFF7426D34740001C950E /* thread_support.cpp in Sources */, - 706EF1CE26D166C5001C950E /* Kram.cpp in Sources */, - 706EF1CF26D166C5001C950E /* squish.cpp in Sources */, - 706EF1D026D166C5001C950E /* colourset.cpp in Sources */, - 70871DD427DDDBCD00D0B9E1 /* astcenc_partition_tables.cpp in Sources */, - 706EFF8226D34740001C950E /* hashtable.cpp in Sources */, - 70871DF027DDDBCD00D0B9E1 /* astcenc_weight_align.cpp in Sources */, - 70871DDA27DDDBCD00D0B9E1 /* astcenc_compute_variance.cpp in Sources */, - 706EF1D126D166C5001C950E /* clusterfit.cpp in Sources */, - 706EF1D226D166C5001C950E /* rangefit.cpp in Sources */, - 706EF1D326D166C5001C950E /* alpha.cpp in Sources */, - 706EF1D426D166C5001C950E /* colourblock.cpp in Sources */, - 706EF1D526D166C5001C950E /* colourfit.cpp in Sources */, - 70871E0027DDDBCD00D0B9E1 /* astcenc_pick_best_endpoint_format.cpp in Sources */, - 70871E0A27DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */, - 70871DD027DDDBCD00D0B9E1 /* astcenc_symbolic_physical.cpp in Sources */, - 706EFF7A26D34740001C950E /* allocator_eastl.cpp in Sources */, - 706EF1D626D166C5001C950E /* maths.cpp in Sources */, - 706EF1D726D166C5001C950E /* singlecolourfit.cpp in Sources */, - 706EF1D826D166C5001C950E /* zstd.cpp in Sources */, - 70D222F62ADAF78300B9EA23 /* dlmalloc.cpp in Sources */, - 706EF1D926D166C5001C950E /* zstddeclib.cpp in Sources */, - 706EF1DA26D166C5001C950E /* lodepng.cpp in Sources */, - 707789E22881BA81008A51BC /* utils.cpp in Sources */, - 706EF1DB26D166C5001C950E /* tmpfileplus.cpp in Sources */, - 709B8D3A28D7BCAD0081BD1F /* format.cpp in Sources */, - 70871E0227DDDBCD00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - 
}; /* End PBXSourcesBuildPhase section */ /* Begin XCBuildConfiguration section */ 706ECDE726D1577A001C950E /* Debug */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C732CC436A000729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = "$(ARCHS_STANDARD)"; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; @@ -1939,6 +1456,7 @@ CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_FALLTHROUGH = YES_ERROR; CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; @@ -1970,10 +1488,6 @@ GCC_OPTIMIZATION_LEVEL = 0; GCC_PRECOMPILE_PREFIX_HEADER = YES; GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramPrefix.h"; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; "GCC_WARN_64_TO_32_BIT_CONVERSION[arch=*64]" = NO; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; @@ -1990,13 +1504,13 @@ HEADER_SEARCH_PATHS = ( "$(PROJECT_DIR)/../libkram/eastl/include", "$(PROJECT_DIR)/../libkram/kram", + "$(PROJECT_DIR)/../libkram/vectormath", ); - IPHONEOS_DEPLOYMENT_TARGET = 16.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - MTL_FAST_MATH = YES; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ( + "$(inherited)", "-DCOMPILE_ASTCENC=1", "-DCOMPILE_ATE=1", "-DCOMPILE_ETCENC=1", @@ -2005,18 +1519,19 @@ "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=0", "-DCOMPILE_EASTL=0", - "-ftime-trace", ); - PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; USER_HEADER_SEARCH_PATHS = ""; + XROS_DEPLOYMENT_TARGET = 2.0; }; name = Debug; }; 706ECDE826D1577A001C950E /* Release */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C732CC436A000729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = "$(ARCHS_STANDARD)"; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; @@ -2035,6 +1550,7 @@ CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_FALLTHROUGH = YES_ERROR; CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; @@ -2080,13 +1596,12 @@ HEADER_SEARCH_PATHS = ( "$(PROJECT_DIR)/../libkram/eastl/include", "$(PROJECT_DIR)/../libkram/kram", + "$(PROJECT_DIR)/../libkram/vectormath", ); - IPHONEOS_DEPLOYMENT_TARGET = 16.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; - MTL_ENABLE_DEBUG_INFO = NO; - MTL_FAST_MATH = YES; OTHER_CFLAGS = ( - "-DNDEBUG=1", + "$(inherited)", "-DCOMPILE_ASTCENC=1", "-DCOMPILE_ATE=1", "-DCOMPILE_ETCENC=1", @@ -2095,71 +1610,46 @@ "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=0", "-DCOMPILE_EASTL=0", - "-ftime-trace", ); - PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; USER_HEADER_SEARCH_PATHS = ""; + XROS_DEPLOYMENT_TARGET = 2.0; }; name = Release; }; 706ECDEA26D1577A001C950E /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { + ALLOW_TARGET_PLATFORM_SPECIALIZATION = YES; CLANG_WARN_OBJC_EXPLICIT_OWNERSHIP_TYPE = YES; CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; - CLANG_X86_VECTOR_INSTRUCTIONS = avx; CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = Automatic; SKIP_INSTALL 
= YES; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx xros xrsimulator"; + SUPPORTS_MACCATALYST = NO; SYSTEM_HEADER_SEARCH_PATHS = ""; + TARGETED_DEVICE_FAMILY = "1,2,7"; }; name = Debug; }; 706ECDEB26D1577A001C950E /* Release */ = { isa = XCBuildConfiguration; buildSettings = { + ALLOW_TARGET_PLATFORM_SPECIALIZATION = YES; CLANG_WARN_OBJC_EXPLICIT_OWNERSHIP_TYPE = YES; CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; - CLANG_X86_VECTOR_INSTRUCTIONS = avx; CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = Automatic; SKIP_INSTALL = YES; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx xros xrsimulator"; + SUPPORTS_MACCATALYST = NO; SYSTEM_HEADER_SEARCH_PATHS = ""; - }; - name = Release; - }; - 706EF1DF26D166C5001C950E /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - CLANG_WARN_DOCUMENTATION_COMMENTS = NO; - CLANG_X86_VECTOR_INSTRUCTIONS = default; - CODE_SIGN_STYLE = Automatic; - EXECUTABLE_PREFIX = lib; - IPHONEOS_DEPLOYMENT_TARGET = 14.1; - PRODUCT_NAME = "$(TARGET_NAME)"; - SDKROOT = iphoneos; - SKIP_INSTALL = YES; - SUPPORTED_PLATFORMS = "iphonesimulator iphoneos"; - SYSTEM_HEADER_SEARCH_PATHS = ""; - }; - name = Debug; - }; - 706EF1E026D166C5001C950E /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - CLANG_WARN_DOCUMENTATION_COMMENTS = NO; - CLANG_X86_VECTOR_INSTRUCTIONS = default; - CODE_SIGN_STYLE = Automatic; - EXECUTABLE_PREFIX = lib; - IPHONEOS_DEPLOYMENT_TARGET = 14.1; - PRODUCT_NAME = "$(TARGET_NAME)"; - SDKROOT = iphoneos; - SKIP_INSTALL = YES; - SUPPORTED_PLATFORMS = "iphonesimulator iphoneos"; - SYSTEM_HEADER_SEARCH_PATHS = ""; + TARGETED_DEVICE_FAMILY = "1,2,7"; }; name = Release; }; @@ -2184,15 +1674,6 @@ defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; }; - 706EF1DE26D166C5001C950E /* Build configuration list for PBXNativeTarget "kram-ios" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 706EF1DF26D166C5001C950E /* Debug */, - 706EF1E026D166C5001C950E /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; /* End XCConfigurationList section */ }; rootObject = 706ECDD626D1577A001C950E /* Project object */; diff --git a/build2/kram.xcworkspace/contents.xcworkspacedata b/build2/kram.xcworkspace/contents.xcworkspacedata index 6c500c6d..c4f3782a 100644 --- a/build2/kram.xcworkspace/contents.xcworkspacedata +++ b/build2/kram.xcworkspace/contents.xcworkspacedata @@ -16,4 +16,7 @@ + + diff --git a/build2/kramc.xcodeproj/project.pbxproj b/build2/kramc.xcodeproj/project.pbxproj index 5242dd82..e422cd4d 100644 --- a/build2/kramc.xcodeproj/project.pbxproj +++ b/build2/kramc.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 701AF1922CAE4F2300BD0886 /* libvectormath.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 70B687272CAD1996007ACA58 /* libvectormath.a */; }; 705F68F82BA2DD2000437FAA /* libcompression.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 705F68F72BA2DD1100437FAA /* libcompression.tbd */; }; 706EF28326D18251001C950E /* libkram.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 706EF28226D18251001C950E /* libkram.a */; }; 706EF28526D1825D001C950E /* libate.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 706EF28426D18257001C950E /* libate.tbd */; }; @@ -33,6 +34,8 @@ 706EF28226D18251001C950E /* libkram.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; path = libkram.a; sourceTree = BUILT_PRODUCTS_DIR; }; 706EF28426D18257001C950E 
/* libate.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libate.tbd; path = usr/lib/libate.tbd; sourceTree = SDKROOT; }; 706EF28A26D182CB001C950E /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; + 707D4C542CC420FE00729BE0 /* kram.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = kram.xcconfig; sourceTree = ""; }; + 70B687272CAD1996007ACA58 /* libvectormath.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; path = libvectormath.a; sourceTree = BUILT_PRODUCTS_DIR; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -40,6 +43,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + 701AF1922CAE4F2300BD0886 /* libvectormath.a in Frameworks */, 706EF28326D18251001C950E /* libkram.a in Frameworks */, 706EF28B26D182CB001C950E /* Foundation.framework in Frameworks */, 706EF28526D1825D001C950E /* libate.tbd in Frameworks */, @@ -56,6 +60,7 @@ 706EF27E26D18223001C950E /* kramc */, 706EF27326D18082001C950E /* Products */, 706EF28126D18251001C950E /* Frameworks */, + 707D4C542CC420FE00729BE0 /* kram.xcconfig */, ); sourceTree = ""; }; @@ -79,6 +84,7 @@ 706EF28126D18251001C950E /* Frameworks */ = { isa = PBXGroup; children = ( + 70B687272CAD1996007ACA58 /* libvectormath.a */, 705F68F72BA2DD1100437FAA /* libcompression.tbd */, 706EF28A26D182CB001C950E /* Foundation.framework */, 706EF28426D18257001C950E /* libate.tbd */, @@ -153,8 +159,10 @@ /* Begin XCBuildConfiguration section */ 706EF27726D18082001C950E /* Debug */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C542CC420FE00729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = "$(ARCHS_STANDARD)"; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; @@ -171,6 +179,7 @@ CLANG_WARN_DOCUMENTATION_COMMENTS = YES; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_FALLTHROUGH = YES_ERROR; CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; @@ -189,7 +198,6 @@ DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; GCC_C_LANGUAGE_STANDARD = gnu11; GCC_DYNAMIC_NO_PIC = NO; GCC_ENABLE_CPP_EXCEPTIONS = NO; @@ -202,12 +210,13 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ( - "-DCOMPILE_EASTL=0", + "$(inherited)", "-include", KramConfig.h, ); @@ -218,8 +227,10 @@ }; 706EF27826D18082001C950E /* Release */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C542CC420FE00729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = "$(ARCHS_STANDARD)"; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; @@ -236,6 +247,7 @@ CLANG_WARN_DOCUMENTATION_COMMENTS = YES; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_FALLTHROUGH = YES_ERROR; CLANG_WARN_INFINITE_RECURSION = YES; 
CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; @@ -254,7 +266,6 @@ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; GCC_C_LANGUAGE_STANDARD = gnu11; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; @@ -265,12 +276,12 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; OTHER_CFLAGS = ( - "-DNDEBUG=1", - "-DCOMPILE_EASTL=0", + "$(inherited)", "-include", KramConfig.h, ); @@ -289,12 +300,18 @@ CLANG_WARN_OBJC_INTERFACE_IVARS = YES_ERROR; CLANG_WARN_OBJC_MISSING_PROPERTY_SYNTHESIS = YES; CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES; GCC_WARN_NON_VIRTUAL_DESTRUCTOR = YES; GCC_WARN_SHADOW = YES; GCC_WARN_STRICT_SELECTOR_MATCH = YES; - HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; + HEADER_SEARCH_PATHS = ( + "$(PROJECT_DIR)/../libkram/kram", + "$(PROJECT_DIR)/../libkram/vectormath", + ); PRODUCT_BUNDLE_IDENTIFIER = com.hialec.kramc; PRODUCT_NAME = kram; }; @@ -310,13 +327,19 @@ CLANG_WARN_OBJC_INTERFACE_IVARS = YES_ERROR; CLANG_WARN_OBJC_MISSING_PROPERTY_SYNTHESIS = YES; CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; DEAD_CODE_STRIPPING = YES; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES; GCC_WARN_NON_VIRTUAL_DESTRUCTOR = YES; GCC_WARN_SHADOW = YES; GCC_WARN_STRICT_SELECTOR_MATCH = YES; - HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; + HEADER_SEARCH_PATHS = ( + "$(PROJECT_DIR)/../libkram/kram", + "$(PROJECT_DIR)/../libkram/vectormath", + ); PRODUCT_BUNDLE_IDENTIFIER = com.hialec.kramc; PRODUCT_NAME = kram; }; diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index 8aa3fc53..5f9c1043 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -37,6 +37,9 @@ 708D44D5272FA4C800783DCE /* piazza_san_marco.ktx in Resources */ = {isa = PBXBuildFile; fileRef = 708D44D3272FA4C800783DCE /* piazza_san_marco.ktx */; }; 7099CFBD28E8319C008D4ABF /* UniformTypeIdentifiers.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 7099CFBC28E8319B008D4ABF /* UniformTypeIdentifiers.framework */; }; 70B5BFF828F5254000CD83D8 /* CoreText.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 70B5BFF728F5253F00CD83D8 /* CoreText.framework */; }; + 70B687222CAD1962007ACA58 /* libvectormath.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 70B687212CAD1962007ACA58 /* libvectormath.a */; }; + 70B687242CAD1976007ACA58 /* libvectormath.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 70B687232CAD1976007ACA58 /* libvectormath.a */; }; + 70B687262CAD197E007ACA58 /* libvectormath.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 70B687252CAD197E007ACA58 /* libvectormath.a */; }; 70E33EC826E536BF00CBA422 /* QuickLookThumbnailing.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 70E33EC726E536BF00CBA422 /* 
QuickLookThumbnailing.framework */; }; 70E33ECA26E536BF00CBA422 /* Quartz.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 70E33EC926E536BF00CBA422 /* Quartz.framework */; }; 70E33ECE26E536BF00CBA422 /* KramThumbnailProvider.mm in Sources */ = {isa = PBXBuildFile; fileRef = 70E33ECD26E536BF00CBA422 /* KramThumbnailProvider.mm */; }; @@ -123,6 +126,7 @@ 706EF25426D17C85001C950E /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; 706EF25626D17C9D001C950E /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; }; 706EF25926D17CAA001C950E /* libate.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libate.tbd; path = usr/lib/libate.tbd; sourceTree = SDKROOT; }; + 707D4C532CC420E700729BE0 /* kram.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = kram.xcconfig; sourceTree = ""; }; 7083365D2715642C0077BCB6 /* GLTF.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = GLTF.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 70833661271564320077BCB6 /* GLTFMTL.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = GLTFMTL.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 70833664271575E50077BCB6 /* GLTF.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = GLTF.framework; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -136,6 +140,9 @@ 708D44D3272FA4C800783DCE /* piazza_san_marco.ktx */ = {isa = PBXFileReference; lastKnownFileType = file; path = piazza_san_marco.ktx; sourceTree = ""; }; 7099CFBC28E8319B008D4ABF /* UniformTypeIdentifiers.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UniformTypeIdentifiers.framework; path = System/Library/Frameworks/UniformTypeIdentifiers.framework; sourceTree = SDKROOT; }; 70B5BFF728F5253F00CD83D8 /* CoreText.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreText.framework; path = System/Library/Frameworks/CoreText.framework; sourceTree = SDKROOT; }; + 70B687212CAD1962007ACA58 /* libvectormath.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; path = libvectormath.a; sourceTree = BUILT_PRODUCTS_DIR; }; + 70B687232CAD1976007ACA58 /* libvectormath.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; path = libvectormath.a; sourceTree = BUILT_PRODUCTS_DIR; }; + 70B687252CAD197E007ACA58 /* libvectormath.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; path = libvectormath.a; sourceTree = BUILT_PRODUCTS_DIR; }; 70E33EC626E536BF00CBA422 /* kram-thumb.appex */ = {isa = PBXFileReference; explicitFileType = "wrapper.app-extension"; includeInIndex = 0; path = "kram-thumb.appex"; sourceTree = BUILT_PRODUCTS_DIR; }; 70E33EC726E536BF00CBA422 /* QuickLookThumbnailing.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = QuickLookThumbnailing.framework; path = System/Library/Frameworks/QuickLookThumbnailing.framework; sourceTree = SDKROOT; }; 70E33EC926E536BF00CBA422 /* Quartz.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Quartz.framework; path = System/Library/Frameworks/Quartz.framework; sourceTree = SDKROOT; }; @@ -158,15 +165,16 @@ isa = PBXFrameworksBuildPhase; 
buildActionMask = 2147483647; files = ( + 70B687222CAD1962007ACA58 /* libvectormath.a in Frameworks */, + 70871D4327CAD3EA00D0B9E1 /* libkram.a in Frameworks */, + 706EF26726D17DFA001C950E /* libate.tbd in Frameworks */, 706EF24D26D17C30001C950E /* ModelIO.framework in Frameworks */, 706EF25226D17C6F001C950E /* MetalKit.framework in Frameworks */, - 70871D4327CAD3EA00D0B9E1 /* libkram.a in Frameworks */, 70B5BFF828F5254000CD83D8 /* CoreText.framework in Frameworks */, 706EF25526D17C85001C950E /* Metal.framework in Frameworks */, 7099CFBD28E8319C008D4ABF /* UniformTypeIdentifiers.framework in Frameworks */, 706EF25726D17C9D001C950E /* AppKit.framework in Frameworks */, 70833669271575EA0077BCB6 /* GLTFMTL.framework in Frameworks */, - 706EF26726D17DFA001C950E /* libate.tbd in Frameworks */, 706EF24F26D17C43001C950E /* Foundation.framework in Frameworks */, 70833665271575E50077BCB6 /* GLTF.framework in Frameworks */, 705F68FA2BA2DD4800437FAA /* libcompression.tbd in Frameworks */, @@ -177,10 +185,11 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + 70B687242CAD1976007ACA58 /* libvectormath.a in Frameworks */, + 70E33ED826E5377000CBA422 /* libkram.a in Frameworks */, 70E33EC826E536BF00CBA422 /* QuickLookThumbnailing.framework in Frameworks */, 705F68FB2BA2DD5900437FAA /* libcompression.tbd in Frameworks */, 70E33ECA26E536BF00CBA422 /* Quartz.framework in Frameworks */, - 70E33ED826E5377000CBA422 /* libkram.a in Frameworks */, 70E33EDD26E537AD00CBA422 /* Accelerate.framework in Frameworks */, 70E33EDB26E5379900CBA422 /* CoreGraphics.framework in Frameworks */, 70E33ED926E5378800CBA422 /* libate.tbd in Frameworks */, @@ -191,12 +200,13 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + 70B687262CAD197E007ACA58 /* libvectormath.a in Frameworks */, + 70E33EF326E548C700CBA422 /* libkram.a in Frameworks */, 70E33EF526E548D800CBA422 /* CoreGraphics.framework in Frameworks */, 705F68FC2BA2DD6200437FAA /* libcompression.tbd in Frameworks */, 70E33EF426E548CF00CBA422 /* libate.tbd in Frameworks */, 70E33EF626E548E200CBA422 /* Accelerate.framework in Frameworks */, 70E33EF726E553B900CBA422 /* AppKit.framework in Frameworks */, - 70E33EF326E548C700CBA422 /* libkram.a in Frameworks */, 70E33EE326E5478900CBA422 /* Quartz.framework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; @@ -214,6 +224,7 @@ 70E33EE426E5478900CBA422 /* kram-preview */, 706EF21026D17A26001C950E /* Products */, 706EF24726D17BC2001C950E /* Frameworks */, + 707D4C532CC420E700729BE0 /* kram.xcconfig */, ); sourceTree = ""; }; @@ -249,6 +260,9 @@ 706EF24726D17BC2001C950E /* Frameworks */ = { isa = PBXGroup; children = ( + 70B687252CAD197E007ACA58 /* libvectormath.a */, + 70B687232CAD1976007ACA58 /* libvectormath.a */, + 70B687212CAD1962007ACA58 /* libvectormath.a */, 705F68F92BA2DD3E00437FAA /* libcompression.tbd */, 70B5BFF728F5253F00CD83D8 /* CoreText.framework */, 7099CFBC28E8319B008D4ABF /* UniformTypeIdentifiers.framework */, @@ -386,7 +400,7 @@ isa = PBXProject; attributes = { BuildIndependentTargetsInParallel = YES; - LastUpgradeCheck = 1500; + LastUpgradeCheck = 1540; TargetAttributes = { 706EF20E26D17A26001C950E = { CreatedOnToolsVersion = 12.4; @@ -519,8 +533,11 @@ /* Begin XCBuildConfiguration section */ 706EF22126D17A2E001C950E /* Debug */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C532CC420E700729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; + 
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; @@ -537,6 +554,7 @@ CLANG_WARN_DOCUMENTATION_COMMENTS = YES; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_FALLTHROUGH = YES_ERROR; CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; @@ -554,6 +572,7 @@ DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = dwarf; DONT_GENERATE_INFOPLIST_FILE = YES; + ENABLE_APP_SANDBOX = YES; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES; @@ -573,18 +592,19 @@ "$(PROJECT_DIR)/../libkram/kram", "$(PROJECT_DIR)/../libkram", "$(PROJECT_DIR)/../libkram/eastl/include", + "$(PROJECT_DIR)/../libkram/vectormath", ); + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; MTL_LANGUAGE_REVISION = UseDeploymentTarget; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ( - "-DCOMPILE_EASTL=0", + "$(inherited)", "-include", KramConfig.h, ); - OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; }; @@ -592,8 +612,11 @@ }; 706EF22226D17A2E001C950E /* Release */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C532CC420E700729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; @@ -610,6 +633,7 @@ CLANG_WARN_DOCUMENTATION_COMMENTS = YES; CLANG_WARN_EMPTY_BODY = YES; CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_FALLTHROUGH = YES_ERROR; CLANG_WARN_INFINITE_RECURSION = YES; CLANG_WARN_INT_CONVERSION = YES; CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; @@ -627,6 +651,8 @@ DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; DONT_GENERATE_INFOPLIST_FILE = YES; + ENABLE_APP_SANDBOX = YES; + ENABLE_HARDENED_RUNTIME = YES; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_USER_SCRIPT_SANDBOXING = YES; @@ -644,18 +670,18 @@ "$(PROJECT_DIR)/../libkram/kram", "$(PROJECT_DIR)/../libkram", "$(PROJECT_DIR)/../libkram/eastl/include", + "$(PROJECT_DIR)/../libkram/vectormath", ); + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; MTL_LANGUAGE_REVISION = UseDeploymentTarget; OTHER_CFLAGS = ( - "-DNDEBUG=1", - "-DCOMPILE_EASTL=0", + "$(inherited)", "-include", KramConfig.h, ); - OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; }; @@ -674,9 +700,12 @@ CLANG_WARN_OBJC_MISSING_PROPERTY_SYNTHESIS = NO; CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; CODE_SIGN_ENTITLEMENTS = "$(PROJECT_DIR)/../kramv/kramv.entitlements"; - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Developer ID Application"; + CODE_SIGN_STYLE = Manual; COMBINE_HIDPI_IMAGES = YES; - DEVELOPMENT_TEAM = ""; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES; GCC_WARN_NON_VIRTUAL_DESTRUCTOR = YES; @@ -693,6 +722,7 @@ MTL_LANGUAGE_REVISION = Metal30; 
PRODUCT_BUNDLE_IDENTIFIER = com.hialec.kramv; PRODUCT_NAME = "$(TARGET_NAME)"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramViewerProvision; }; name = Debug; }; @@ -709,10 +739,13 @@ CLANG_WARN_OBJC_MISSING_PROPERTY_SYNTHESIS = NO; CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; CODE_SIGN_ENTITLEMENTS = "$(PROJECT_DIR)/../kramv/kramv.entitlements"; - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Developer ID Application"; + CODE_SIGN_STYLE = Manual; COMBINE_HIDPI_IMAGES = YES; DEAD_CODE_STRIPPING = YES; - DEVELOPMENT_TEAM = ""; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES; GCC_WARN_NON_VIRTUAL_DESTRUCTOR = YES; @@ -727,15 +760,9 @@ ); MARKETING_VERSION = 2.0.0; MTL_LANGUAGE_REVISION = Metal30; - OTHER_CFLAGS = ( - "-DNDEBUG=1", - "-DCOMPILE_FASTL=0", - "-DCOMPILE_EASTL=0", - "-include", - KramConfig.h, - ); PRODUCT_BUNDLE_IDENTIFIER = com.hialec.kramv; PRODUCT_NAME = "$(TARGET_NAME)"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramViewerProvision; }; name = Release; }; @@ -743,8 +770,10 @@ isa = XCBuildConfiguration; buildSettings = { CODE_SIGN_ENTITLEMENTS = "$(SRCROOT)/../kram-thumb/kram_thumb.entitlements"; - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = ""; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; INFOPLIST_FILE = "../kram-thumb/Info.plist"; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -753,6 +782,7 @@ ); PRODUCT_BUNDLE_IDENTIFIER = "com.hialec.kramv.kram-thumb"; PRODUCT_NAME = "$(TARGET_NAME)"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramViewerThumbProvision; SKIP_INSTALL = YES; }; name = Debug; @@ -761,8 +791,10 @@ isa = XCBuildConfiguration; buildSettings = { CODE_SIGN_ENTITLEMENTS = "$(SRCROOT)/../kram-thumb/kram_thumb.entitlements"; - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = ""; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; INFOPLIST_FILE = "../kram-thumb/Info.plist"; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -771,6 +803,7 @@ ); PRODUCT_BUNDLE_IDENTIFIER = "com.hialec.kramv.kram-thumb"; PRODUCT_NAME = "$(TARGET_NAME)"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramViewerThumbProvision; SKIP_INSTALL = YES; }; name = Release; @@ -779,8 +812,10 @@ isa = XCBuildConfiguration; buildSettings = { CODE_SIGN_ENTITLEMENTS = "$(SRCROOT)/../kram-preview/kram_preview.entitlements"; - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = ""; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; INFOPLIST_FILE = "../kram-preview/Info.plist"; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -789,6 +824,7 @@ ); PRODUCT_BUNDLE_IDENTIFIER = "com.hialec.kramv.kram-preview"; PRODUCT_NAME = "$(TARGET_NAME)"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramViewerPreviewProvision; SKIP_INSTALL = YES; }; name = Debug; @@ -797,8 +833,10 @@ isa = XCBuildConfiguration; buildSettings = { CODE_SIGN_ENTITLEMENTS = "$(SRCROOT)/../kram-preview/kram_preview.entitlements"; - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = ""; + 
CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; INFOPLIST_FILE = "../kram-preview/Info.plist"; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -807,6 +845,7 @@ ); PRODUCT_BUNDLE_IDENTIFIER = "com.hialec.kramv.kram-preview"; PRODUCT_NAME = "$(TARGET_NAME)"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramViewerPreviewProvision; SKIP_INSTALL = YES; }; name = Release; diff --git a/build2/vectormath.xcodeproj/project.pbxproj b/build2/vectormath.xcodeproj/project.pbxproj new file mode 100644 index 00000000..3b4487e0 --- /dev/null +++ b/build2/vectormath.xcodeproj/project.pbxproj @@ -0,0 +1,378 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 77; + objects = { + +/* Begin PBXBuildFile section */ + 70570FE52CB378EE005692BB /* bounds234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70570FE42CB378E7005692BB /* bounds234.h */; }; + 70570FE92CB379C9005692BB /* bounds234.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70570FE72CB37997005692BB /* bounds234.cpp */; }; + 70B6870B2CAD1072007ACA58 /* float234.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B687002CAD1072007ACA58 /* float234.cpp */; }; + 70B6870C2CAD1072007ACA58 /* double234.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B686FC2CAD1072007ACA58 /* double234.cpp */; }; + 70B6870D2CAD1072007ACA58 /* vectormath234.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B687092CAD1072007ACA58 /* vectormath234.cpp */; }; + 70B6870E2CAD1072007ACA58 /* float4a.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70B686FE2CAD1072007ACA58 /* float4a.cpp */; }; + 70B6870F2CAD1072007ACA58 /* vectormath234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687082CAD1072007ACA58 /* vectormath234.h */; }; + 70B687102CAD1072007ACA58 /* long234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687032CAD1072007ACA58 /* long234.h */; }; + 70B687112CAD1072007ACA58 /* sse2neon.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687062CAD1072007ACA58 /* sse2neon.h */; }; + 70B687122CAD1072007ACA58 /* sse_mathfun.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687052CAD1072007ACA58 /* sse_mathfun.h */; }; + 70B687132CAD1072007ACA58 /* float234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686FF2CAD1072007ACA58 /* float234.h */; }; + 70B687142CAD1072007ACA58 /* half234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687012CAD1072007ACA58 /* half234.h */; }; + 70B687152CAD1072007ACA58 /* int234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687022CAD1072007ACA58 /* int234.h */; }; + 70B687162CAD1072007ACA58 /* float4a.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686FD2CAD1072007ACA58 /* float4a.h */; }; + 70B687172CAD1072007ACA58 /* sse2neon-arm64.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B687072CAD1072007ACA58 /* sse2neon-arm64.h */; }; + 70B687182CAD1072007ACA58 /* double234.h in Headers */ = {isa = PBXBuildFile; fileRef = 70B686FB2CAD1072007ACA58 /* double234.h */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + 70570FE42CB378E7005692BB /* bounds234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bounds234.h; sourceTree = ""; }; + 70570FE72CB37997005692BB /* bounds234.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = bounds234.cpp; sourceTree = ""; }; + 70570FEF2CB8C5C6005692BB /* module.modulemap */ = {isa = PBXFileReference; lastKnownFileType = 
"sourcecode.module-map"; path = module.modulemap; sourceTree = ""; }; + 707D4C522CC41F3900729BE0 /* kram.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = kram.xcconfig; sourceTree = ""; }; + 7087895C2CC6C17700E34A6B /* simdk.py */ = {isa = PBXFileReference; lastKnownFileType = text.script.python; name = simdk.py; path = ../scripts/simdk.py; sourceTree = SOURCE_ROOT; }; + 70B686F42CAD1026007ACA58 /* libvectormath.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libvectormath.a; sourceTree = BUILT_PRODUCTS_DIR; }; + 70B686FB2CAD1072007ACA58 /* double234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = double234.h; sourceTree = ""; }; + 70B686FC2CAD1072007ACA58 /* double234.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = double234.cpp; sourceTree = ""; }; + 70B686FD2CAD1072007ACA58 /* float4a.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = float4a.h; sourceTree = ""; }; + 70B686FE2CAD1072007ACA58 /* float4a.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = float4a.cpp; sourceTree = ""; }; + 70B686FF2CAD1072007ACA58 /* float234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = float234.h; sourceTree = ""; }; + 70B687002CAD1072007ACA58 /* float234.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = float234.cpp; sourceTree = ""; }; + 70B687012CAD1072007ACA58 /* half234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = half234.h; sourceTree = ""; }; + 70B687022CAD1072007ACA58 /* int234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = int234.h; sourceTree = ""; }; + 70B687032CAD1072007ACA58 /* long234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = long234.h; sourceTree = ""; }; + 70B687042CAD1072007ACA58 /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; + 70B687052CAD1072007ACA58 /* sse_mathfun.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sse_mathfun.h; sourceTree = ""; }; + 70B687062CAD1072007ACA58 /* sse2neon.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sse2neon.h; sourceTree = ""; }; + 70B687072CAD1072007ACA58 /* sse2neon-arm64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "sse2neon-arm64.h"; sourceTree = ""; }; + 70B687082CAD1072007ACA58 /* vectormath234.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = vectormath234.h; sourceTree = ""; }; + 70B687092CAD1072007ACA58 /* vectormath234.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = vectormath234.cpp; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 70B686F22CAD1026007ACA58 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 70B686EB2CAD1026007ACA58 = { + isa = PBXGroup; + children = ( + 70B6870A2CAD1072007ACA58 /* vectormath */, + 70B686F52CAD1026007ACA58 /* Products */, + 707D4C522CC41F3900729BE0 /* kram.xcconfig */, + ); + sourceTree = ""; + }; + 70B686F52CAD1026007ACA58 /* Products */ = { + isa = PBXGroup; + children = ( + 70B686F42CAD1026007ACA58 /* libvectormath.a */, + ); 
+ name = Products; + sourceTree = ""; + }; + 70B6870A2CAD1072007ACA58 /* vectormath */ = { + isa = PBXGroup; + children = ( + 7087895C2CC6C17700E34A6B /* simdk.py */, + 70570FEF2CB8C5C6005692BB /* module.modulemap */, + 70B687042CAD1072007ACA58 /* README.md */, + 70B686FB2CAD1072007ACA58 /* double234.h */, + 70B686FC2CAD1072007ACA58 /* double234.cpp */, + 70B686FD2CAD1072007ACA58 /* float4a.h */, + 70B686FE2CAD1072007ACA58 /* float4a.cpp */, + 70B686FF2CAD1072007ACA58 /* float234.h */, + 70B687002CAD1072007ACA58 /* float234.cpp */, + 70570FE42CB378E7005692BB /* bounds234.h */, + 70570FE72CB37997005692BB /* bounds234.cpp */, + 70B687012CAD1072007ACA58 /* half234.h */, + 70B687022CAD1072007ACA58 /* int234.h */, + 70B687032CAD1072007ACA58 /* long234.h */, + 70B687052CAD1072007ACA58 /* sse_mathfun.h */, + 70B687062CAD1072007ACA58 /* sse2neon.h */, + 70B687072CAD1072007ACA58 /* sse2neon-arm64.h */, + 70B687082CAD1072007ACA58 /* vectormath234.h */, + 70B687092CAD1072007ACA58 /* vectormath234.cpp */, + ); + name = vectormath; + path = ../libkram/vectormath; + sourceTree = SOURCE_ROOT; + }; +/* End PBXGroup section */ + +/* Begin PBXHeadersBuildPhase section */ + 70B686F02CAD1026007ACA58 /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + 70B6870F2CAD1072007ACA58 /* vectormath234.h in Headers */, + 70570FE52CB378EE005692BB /* bounds234.h in Headers */, + 70B687102CAD1072007ACA58 /* long234.h in Headers */, + 70B687112CAD1072007ACA58 /* sse2neon.h in Headers */, + 70B687122CAD1072007ACA58 /* sse_mathfun.h in Headers */, + 70B687132CAD1072007ACA58 /* float234.h in Headers */, + 70B687142CAD1072007ACA58 /* half234.h in Headers */, + 70B687152CAD1072007ACA58 /* int234.h in Headers */, + 70B687162CAD1072007ACA58 /* float4a.h in Headers */, + 70B687172CAD1072007ACA58 /* sse2neon-arm64.h in Headers */, + 70B687182CAD1072007ACA58 /* double234.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + 70B686F32CAD1026007ACA58 /* vectormath */ = { + isa = PBXNativeTarget; + buildConfigurationList = 70B686F82CAD1026007ACA58 /* Build configuration list for PBXNativeTarget "vectormath" */; + buildPhases = ( + 70B686F02CAD1026007ACA58 /* Headers */, + 70B686F12CAD1026007ACA58 /* Sources */, + 70B686F22CAD1026007ACA58 /* Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = vectormath; + packageProductDependencies = ( + ); + productName = vectormath; + productReference = 70B686F42CAD1026007ACA58 /* libvectormath.a */; + productType = "com.apple.product-type.library.static"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 70B686EC2CAD1026007ACA58 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastUpgradeCheck = 1600; + TargetAttributes = { + 70B686F32CAD1026007ACA58 = { + CreatedOnToolsVersion = 16.0; + }; + }; + }; + buildConfigurationList = 70B686EF2CAD1026007ACA58 /* Build configuration list for PBXProject "vectormath" */; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 70B686EB2CAD1026007ACA58; + minimizedProjectReferenceProxies = 1; + preferredProjectObjectVersion = 77; + productRefGroup = 70B686F52CAD1026007ACA58 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 70B686F32CAD1026007ACA58 /* vectormath */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXSourcesBuildPhase section */ + 
70B686F12CAD1026007ACA58 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 70B6870B2CAD1072007ACA58 /* float234.cpp in Sources */, + 70B6870C2CAD1072007ACA58 /* double234.cpp in Sources */, + 70B6870D2CAD1072007ACA58 /* vectormath234.cpp in Sources */, + 70570FE92CB379C9005692BB /* bounds234.cpp in Sources */, + 70B6870E2CAD1072007ACA58 /* float4a.cpp in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 70B686F62CAD1026007ACA58 /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C522CC41F3900729BE0 /* kram.xcconfig */; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = "$(ARCHS_STANDARD)"; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + DEFINES_MODULE = YES; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 1; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MACOSX_DEPLOYMENT_TARGET = 13.0; + MODULEMAP_FILE = ../libkram/vectormath/module.modulemap; + MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c c++"; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = Automatic; + XROS_DEPLOYMENT_TARGET = 2.0; + }; + name = Debug; + }; + 70B686F72CAD1026007ACA58 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C522CC41F3900729BE0 /* kram.xcconfig */; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = "$(ARCHS_STANDARD)"; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + 
CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + DEFINES_MODULE = YES; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MACOSX_DEPLOYMENT_TARGET = 13.0; + MODULEMAP_FILE = ../libkram/vectormath/module.modulemap; + MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c c++"; + SDKROOT = Automatic; + XROS_DEPLOYMENT_TARGET = 2.0; + }; + name = Release; + }; + 70B686F92CAD1026007ACA58 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALLOW_TARGET_PLATFORM_SPECIALIZATION = YES; + CODE_SIGN_STYLE = Automatic; + EXECUTABLE_PREFIX = lib; + PRODUCT_NAME = "$(TARGET_NAME)"; + REGISTER_APP_GROUPS = NO; + SKIP_INSTALL = YES; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx xros xrsimulator"; + SUPPORTS_MACCATALYST = NO; + TARGETED_DEVICE_FAMILY = "1,2,7"; + }; + name = Debug; + }; + 70B686FA2CAD1026007ACA58 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALLOW_TARGET_PLATFORM_SPECIALIZATION = YES; + CODE_SIGN_STYLE = Automatic; + EXECUTABLE_PREFIX = lib; + PRODUCT_NAME = "$(TARGET_NAME)"; + REGISTER_APP_GROUPS = NO; + SKIP_INSTALL = YES; + SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx xros xrsimulator"; + SUPPORTS_MACCATALYST = NO; + TARGETED_DEVICE_FAMILY = "1,2,7"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 70B686EF2CAD1026007ACA58 /* Build configuration list for PBXProject "vectormath" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 70B686F62CAD1026007ACA58 /* Debug */, + 70B686F72CAD1026007ACA58 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 70B686F82CAD1026007ACA58 /* Build configuration list for PBXNativeTarget "vectormath" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 70B686F92CAD1026007ACA58 /* Debug */, + 70B686FA2CAD1026007ACA58 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 70B686EC2CAD1026007ACA58 /* Project object */; +} diff --git a/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj 
b/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj index 91e2481b..9a63773c 100644 --- a/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj +++ b/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj @@ -56,6 +56,7 @@ /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + 707D4C5C2CC42C2700729BE0 /* kram.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = kram.xcconfig; sourceTree = ""; }; 83319297202589FC00B6C7E9 /* GLTFBinaryChunk.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GLTFBinaryChunk.h; sourceTree = ""; }; 8331929B20258A4000B6C7E9 /* GLTFBinaryChunk.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = GLTFBinaryChunk.m; sourceTree = ""; }; 8331929E2025911D00B6C7E9 /* GLTFExtensionNames.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = GLTFExtensionNames.m; sourceTree = ""; }; @@ -144,6 +145,7 @@ 83D6FF7E1F48BBFA00F71E0C /* Info.plist */, 83D6FF491F48BB3A00F71E0C /* Products */, 83D600391F48C2FF00F71E0C /* Frameworks */, + 707D4C5C2CC42C2700729BE0 /* kram.xcconfig */, ); sourceTree = ""; }; @@ -353,6 +355,7 @@ /* Begin XCBuildConfiguration section */ 83D6FF4E1F48BB3A00F71E0C /* Debug */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C5C2CC42C2700729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; @@ -383,6 +386,7 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CLANG_X86_VECTOR_INSTRUCTIONS = avx2; CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; @@ -403,11 +407,11 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; - OTHER_CFLAGS = "-ftime-trace"; + OTHER_CFLAGS = "$(KRAM_FLAGS)"; SDKROOT = macosx; VERSIONING_SYSTEM = "apple-generic"; VERSION_INFO_PREFIX = ""; @@ -446,6 +450,7 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CLANG_X86_VECTOR_INSTRUCTIONS = avx2; CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; @@ -464,13 +469,10 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = NO; - OTHER_CFLAGS = ( - "-ftime-trace", - "-DNDEBUG=1", - ); + OTHER_CFLAGS = "$(KRAM_FLAGS)"; SDKROOT = macosx; VERSIONING_SYSTEM = "apple-generic"; VERSION_INFO_PREFIX = ""; @@ -480,6 +482,7 @@ 83D6FF511F48BB3A00F71E0C /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { + ARCHS = arm64; CLANG_ENABLE_OBJC_WEAK = YES; CODE_SIGN_IDENTITY = ""; COMBINE_HIDPI_IMAGES = YES; @@ -488,7 +491,6 @@ DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; - ENABLE_MODULE_VERIFIER = YES; FRAMEWORK_VERSION = A; INFOPLIST_FILE = Info.plist; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; @@ -509,7 +511,9 @@ }; 83D6FF521F48BB3A00F71E0C /* Release */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C5C2CC42C2700729BE0 /* kram.xcconfig */; buildSettings = { + ARCHS = arm64; CLANG_ENABLE_OBJC_WEAK = YES; CODE_SIGN_IDENTITY = ""; COMBINE_HIDPI_IMAGES = 
YES; @@ -518,7 +522,6 @@ DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; - ENABLE_MODULE_VERIFIER = YES; FRAMEWORK_VERSION = A; INFOPLIST_FILE = Info.plist; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; diff --git a/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h b/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h index 5044f4e4..2806b2bd 100644 --- a/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h +++ b/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h @@ -15,6 +15,7 @@ // #import +#import //@import simd; diff --git a/gtlf/GLTF/Headers/GLTFCamera.h b/gtlf/GLTF/Headers/GLTFCamera.h index 348a44ca..b88e7ace 100644 --- a/gtlf/GLTF/Headers/GLTFCamera.h +++ b/gtlf/GLTF/Headers/GLTFCamera.h @@ -19,6 +19,8 @@ //@import simd; +#import + NS_ASSUME_NONNULL_BEGIN @class GLTFNode; diff --git a/gtlf/GLTF/Headers/GLTFEnums.h b/gtlf/GLTF/Headers/GLTFEnums.h index 25a8d832..cee4a71a 100644 --- a/gtlf/GLTF/Headers/GLTFEnums.h +++ b/gtlf/GLTF/Headers/GLTFEnums.h @@ -16,7 +16,7 @@ #import -#import +//#import //@import Foundation; typedef NS_ENUM(NSInteger, GLTFDataType) { diff --git a/gtlf/GLTF/Headers/GLTFObject.h b/gtlf/GLTF/Headers/GLTFObject.h index 9d5c7654..7a9eef01 100644 --- a/gtlf/GLTF/Headers/GLTFObject.h +++ b/gtlf/GLTF/Headers/GLTFObject.h @@ -15,7 +15,7 @@ // #import -#import +//#import //@import Foundation; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFTexture.h b/gtlf/GLTF/Headers/GLTFTexture.h index 5cf0357c..b3062500 100644 --- a/gtlf/GLTF/Headers/GLTFTexture.h +++ b/gtlf/GLTF/Headers/GLTFTexture.h @@ -16,6 +16,7 @@ #import #import +#import //@import simd; diff --git a/gtlf/GLTF/Headers/GLTFUtilities.h b/gtlf/GLTF/Headers/GLTFUtilities.h index 88d65925..b41ce931 100644 --- a/gtlf/GLTF/Headers/GLTFUtilities.h +++ b/gtlf/GLTF/Headers/GLTFUtilities.h @@ -16,6 +16,8 @@ #import +#import + //@import Foundation; //@import simd; diff --git a/gtlf/GLTF/Source/GLTFAnimation.m b/gtlf/GLTF/Source/GLTFAnimation.m index 24ccd08f..46d153de 100644 --- a/gtlf/GLTF/Source/GLTFAnimation.m +++ b/gtlf/GLTF/Source/GLTFAnimation.m @@ -19,6 +19,7 @@ #import "GLTFBufferView.h" #import "GLTFBuffer.h" #import "GLTFNode.h" +#import @implementation GLTFAnimationSampler diff --git a/gtlf/GLTF/Source/GLTFCamera.m b/gtlf/GLTF/Source/GLTFCamera.m index fd2d9c27..4b8f09c1 100644 --- a/gtlf/GLTF/Source/GLTFCamera.m +++ b/gtlf/GLTF/Source/GLTFCamera.m @@ -15,6 +15,7 @@ // #import "GLTFCamera.h" +#import @interface GLTFCamera () @property (nonatomic, assign, getter=projectionMatrixIsDirty) BOOL projectionMatrixDirty; diff --git a/gtlf/GLTF/Source/GLTFNode.m b/gtlf/GLTF/Source/GLTFNode.m index 52dd177f..6459d24a 100644 --- a/gtlf/GLTF/Source/GLTFNode.m +++ b/gtlf/GLTF/Source/GLTFNode.m @@ -18,6 +18,7 @@ #import "GLTFAccessor.h" #import "GLTFMesh.h" #import "GLTFVertexDescriptor.h" +#import @interface GLTFNode () @property (nonatomic, assign, getter=localTransformIsDirty) BOOL localTransformDirty; diff --git a/gtlf/GLTF/Source/GLTFUtilities.m b/gtlf/GLTF/Source/GLTFUtilities.m index a21bad4b..aa00c8c9 100644 --- a/gtlf/GLTF/Source/GLTFUtilities.m +++ b/gtlf/GLTF/Source/GLTFUtilities.m @@ -16,6 +16,8 @@ #import "GLTFUtilities.h" +#import + bool GLTFBoundingBoxIsEmpty(GLTFBoundingBox b) { return (b.minPoint.x == b.maxPoint.x) && (b.minPoint.y == b.maxPoint.y) && (b.minPoint.z == b.maxPoint.z); } diff --git a/gtlf/GLTF/kram.xcconfig b/gtlf/GLTF/kram.xcconfig new file mode 120000 index 00000000..66b528eb --- /dev/null +++ b/gtlf/GLTF/kram.xcconfig @@ -0,0 +1 @@ +../../build2/kram.xcconfig 
\ No newline at end of file diff --git a/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj b/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj index 78dc7491..a5ce1a78 100644 --- a/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj +++ b/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj @@ -25,6 +25,7 @@ /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + 707D4C5B2CC42C1100729BE0 /* kram.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = kram.xcconfig; sourceTree = ""; }; 839945C91F641E9000642E68 /* GLTFMTLLightingEnvironment.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GLTFMTLLightingEnvironment.h; sourceTree = ""; }; 839945CA1F641E9000642E68 /* GLTFMTLLightingEnvironment.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = GLTFMTLLightingEnvironment.m; sourceTree = ""; }; 83AF30CA1FC4DB4D00053BED /* GLTFMTLTextureLoader.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = GLTFMTLTextureLoader.h; sourceTree = ""; }; @@ -75,6 +76,7 @@ 83D6FFD91F48BDFB00F71E0C /* Info.plist */, 83D6FFB21F48BCB500F71E0C /* Products */, 83D600341F48C24F00F71E0C /* Frameworks */, + 707D4C5B2CC42C1100729BE0 /* kram.xcconfig */, ); sourceTree = ""; }; @@ -213,8 +215,10 @@ /* Begin XCBuildConfiguration section */ 83D6FFB71F48BCB500F71E0C /* Debug */ = { isa = XCBuildConfiguration; + baseConfigurationReference = 707D4C5B2CC42C1100729BE0 /* kram.xcconfig */; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; @@ -243,6 +247,7 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CLANG_X86_VECTOR_INSTRUCTIONS = avx2; CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; @@ -263,11 +268,11 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; - OTHER_CFLAGS = "-ftime-trace"; + OTHER_CFLAGS = "$(KRAM_FLAGS)"; SDKROOT = macosx; SUPPORTED_PLATFORMS = "macosx iphoneos"; VALID_ARCHS = "i386 x86_64 armv7s armv7 arm64"; @@ -280,6 +285,7 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; @@ -308,6 +314,7 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CLANG_X86_VECTOR_INSTRUCTIONS = avx2; CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; @@ -326,13 +333,10 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; + IPHONEOS_DEPLOYMENT_TARGET = 15.0; MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = NO; - OTHER_CFLAGS = ( - "-DNDEBUG=1", - "-ftime-trace", - ); + OTHER_CFLAGS = "$(KRAM_FLAGS)"; SDKROOT = macosx; SUPPORTED_PLATFORMS = "macosx iphoneos"; VALID_ARCHS = "i386 x86_64 armv7s armv7 arm64"; @@ -352,7 +356,6 @@ DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; - ENABLE_MODULE_VERIFIER = YES; FRAMEWORK_VERSION = A; INFOPLIST_FILE = 
"$(SRCROOT)/Info.plist"; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; @@ -382,7 +385,6 @@ DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; - ENABLE_MODULE_VERIFIER = YES; FRAMEWORK_VERSION = A; INFOPLIST_FILE = "$(SRCROOT)/Info.plist"; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h b/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h index acd40e02..17b7f929 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h @@ -16,7 +16,7 @@ #import #import -#import +//#import #import //@import Foundation; diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h b/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h index 95a99c44..6aefd850 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h @@ -16,7 +16,7 @@ #import #import -#import +//#import #import //@import Foundation; diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h b/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h index afc00fdd..aad79244 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h @@ -16,7 +16,7 @@ #import #import -#import +//#import #import //@import Metal; diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h b/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h index bc0fa5f8..0f183002 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h @@ -16,7 +16,7 @@ #import #import -#import +//#import #import diff --git a/gtlf/GLTFMTL/kram.xcconfig b/gtlf/GLTFMTL/kram.xcconfig new file mode 120000 index 00000000..66b528eb --- /dev/null +++ b/gtlf/GLTFMTL/kram.xcconfig @@ -0,0 +1 @@ +../../build2/kram.xcconfig \ No newline at end of file diff --git a/hlslparser/CMakeLists.txt b/hlslparser/CMakeLists.txt index 4e836fa1..e4c173dd 100644 --- a/hlslparser/CMakeLists.txt +++ b/hlslparser/CMakeLists.txt @@ -1,47 +1,18 @@ -#cmake_minimum_required(VERSION 3.19.1 FATAL_ERROR) - -#----------------------------------------------------- - -set(BUILD_MAC FALSE) -set(BUILD_WIN FALSE) - -if (APPLE) - message("build for macOS") - set(BUILD_MAC TRUE) -elseif (WIN32) - message("build for win x64") - set(BUILD_WIN TRUE) -endif() - #----------------------------------------------------- # hlslparser # now setup the app project set(myTargetApp hlslparser) - -# not using version in cmake anymore, this is pulled in by KramVersion.h -if (BUILD_MAC) - project(${myTargetApp} LANGUAGES C CXX OBJCXX) -elseif (BUILD_WIN) - project(${myTargetApp} LANGUAGES C CXX) -endif() - -# **** this the executable target ****, for a CLI App +project(${myTargetApp}) add_executable(${myTargetApp}) #----------------------------------------------------- if (BUILD_MAC) set_target_properties(${myTargetApp} PROPERTIES - # Note: match this up with CXX version - # c++11 min XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" - - # removed this in case run on Rosetta. Not sure if this is Intel/Arm64 build. 
- # this is Intel specific, and no settings for Neon - # avx1 - #XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx" + XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx2" # turn off exceptions/rtti XCODE_ATTRIBUTE_GCC_ENABLE_CPP_EXCEPTIONS NO @@ -56,6 +27,9 @@ if (BUILD_MAC) XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT "dwarf-with-dsym" XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO" + #------------------------- + # app specific settings + # this drops app from 762KB to 174KB with only ATE enabled # note about needing -gfull instead of -gused here or debug info messed up: # https://gist.github.com/tkersey/39b4fe69e14b859889ffadccb009e397 @@ -64,14 +38,18 @@ if (BUILD_MAC) #------------------------- # for now disable signing, and just "sign to run locally" - XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "com.ba.hlslparser" + XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "com.hialec.hlslparser" XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO" XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "" ) target_compile_options(${myTargetApp} PRIVATE -W -Wall) -elseif (BUILD_WIN) +endif() + +#----------------------------------------------------- + +if (BUILD_WIN) # When Win rebuilds library, it doesn't relink app to correct code when you # build the app target project. Breakpoints stop working after any library source edit, # and you have to rebuild solution to get the correct code to exectute. Since 2014. Try this. @@ -88,8 +66,9 @@ elseif (BUILD_WIN) # don't need force with apps, since they only access kram folder files which include KramConfig - # all warnings, AVX1, and multiprocess compiles - target_compile_options(${myTargetApp} PRIVATE /W3 /arch:AVX -mf16c /MP /GF /FC) + # all warnings, AVX2, and multiprocess compiles, + # eliminate duplicate strings, embed full path + target_compile_options(${myTargetApp} PRIVATE /W3 -march=haswell -mf16c -mfma /MP /GF /FC) # fix STL (don't use -D here, will remove) target_compile_definitions(${myTargetApp} PRIVATE _ITERATOR_DEBUG_LEVEL=0 _HAS_EXCEPTIONS=0) diff --git a/hlslparser/README.md b/hlslparser/README.md index e33b5ec1..b4093f19 100644 --- a/hlslparser/README.md +++ b/hlslparser/README.md @@ -14,12 +14,13 @@ https://github.com/unknownworlds/hlslparser --------------------------------- -Paths to turn HLSL and SPV +Paths to turn HLSL and SPV into MSL * HLSL2021 > hlslparser > HLSL2021 > dxc > SPV * HLSL2021 > hlslparser > MSL > metal > AIR(metallib) * * Reflection: spv > spv-reflect -> refl +* HLSL2021 > dxc -> ? * * Transpiling MSL: HLSL2021 > dxc > SPV > spirv-cross > MSL * Transpiling MSL: HLSL2021 > glslc > SPV > spirv-cross > MSL (fails on simple HLSL) @@ -27,6 +28,9 @@ Paths to turn HLSL and SPV * Variant generation * HLSL2021 + defines > preprocess > HLSL2021 * HLSL2021 + specialization > hlslparser +* +* Note this has no shader sources in gpu capture, nor AIR files to accumulate +* HLSL2021 -> dxc -> DXIL -> metal-shaderconverter -> metallib --------------------------------- @@ -104,6 +108,12 @@ Overview |spirv-opt | spv optimizer | |spirv-cross | transpile spv to MSL, HLSL, and GLSL, but codegen has 100's of temp vars, no comments, can target specific MSL/HLSL models | |spirv-reflect | gens reflection data from spv file | +|metal-shaderconverter | compile dxil to metallib | + +https://github.com/microsoft/DirectXShaderCompiler +https://github.com/KhronosGroup/SPIRV-Cross +https://developer.apple.com/metal/shader-converter/ +https://github.com/google/shaderc Dealing with Half --- @@ -131,10 +141,10 @@ HLSL2021 6.2 includes full half and int support. 
So that is the compilation ta * Adreno also doesn't support half storage, so this limits SSBO and UBO usage. -* macOS on M1 - Rosetta2 lacks AVX and f16c cpu support, so translated x64 apps crash. Build Apple Silicon to fix this. +* macOS on M1 - Rosetta2 lacks AVX and f16c cpu support, so translated x64 apps crash. Build Apple Silicon to fix this. Win on ARM emulation (Qcom X Elite) also has the same limitations. Neon is 16 128-bit registers where AVX needs 16 256-bit registers. * Android missing cpu arm64+f16 support from Redmi Note 8 and other chips. - vcvt_f32_f16 is still present without this. + vcvt_f32_f16 is still present without this. Do math in fp32x4, then converter to fp16x4. Dealing with Double --- @@ -177,6 +187,7 @@ Mali * Sparse index buffer limits * 180MB parameter buffer limit - device lost after exceeded * Missing VK_POLYGON_MODE_LINE (feature.fillModeNonSolid) - affects debug visuals +* Missing shaderClipDistance and shaderCullDistance * ARM licenses mobile cpu reference designs * ARM bought Mali gpu from Falanx Microsystems * Sets maxDrawIndirectCount = 1, limiting MDI utility diff --git a/hlslparser/hlslparser.xcodeproj/project.pbxproj b/hlslparser/hlslparser.xcodeproj/project.pbxproj index b9444ce2..9a2e5bcf 100644 --- a/hlslparser/hlslparser.xcodeproj/project.pbxproj +++ b/hlslparser/hlslparser.xcodeproj/project.pbxproj @@ -174,6 +174,7 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; @@ -242,6 +243,7 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; @@ -302,7 +304,11 @@ 702A2B6129A49DC8007D9A99 /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + MACOSX_DEPLOYMENT_TARGET = 13.0; + PRODUCT_BUNDLE_IDENTIFIER = com.hialec.hlslparser; PRODUCT_NAME = "$(TARGET_NAME)"; }; name = Debug; @@ -310,7 +316,11 @@ 702A2B6229A49DC8007D9A99 /* Release */ = { isa = XCBuildConfiguration; buildSettings = { - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = LDJ95E4NS8; + MACOSX_DEPLOYMENT_TARGET = 13.0; + PRODUCT_BUNDLE_IDENTIFIER = com.hialec.hlslparser; PRODUCT_NAME = "$(TARGET_NAME)"; }; name = Release; diff --git a/hlslparser/src/CodeWriter.cpp b/hlslparser/src/CodeWriter.cpp index 046219f4..c8af3706 100644 --- a/hlslparser/src/CodeWriter.cpp +++ b/hlslparser/src/CodeWriter.cpp @@ -7,52 +7,46 @@ // //============================================================================= -#include "Engine.h" - #include "CodeWriter.h" #include -namespace M4 -{ +#include "Engine.h" + +namespace M4 { CodeWriter::CodeWriter() { - m_currentLine = 1; - m_currentFileName = NULL; - m_spacesPerIndent = 4; - m_writeFileLine = false; + m_currentLine = 1; + m_currentFileName = NULL; + m_spacesPerIndent = 4; + m_writeFileLine = false; } void CodeWriter::BeginLine(int indent, const char* fileName, int lineNumber) { // probably missing an EndLine ASSERT(m_currentIndent == 0); - - if (m_writeFileLine) - { + + if (m_writeFileLine) { bool outputLine = false; bool outputFile = false; 
// Output a line number pragma if necessary. - if (fileName != NULL && m_currentFileName != fileName) - { + if (fileName != NULL && m_currentFileName != fileName) { m_currentFileName = fileName; fileName = m_currentFileName; outputFile = true; } - if (lineNumber != -1 && m_currentLine != lineNumber) - { + if (lineNumber != -1 && m_currentLine != lineNumber) { m_currentLine = lineNumber; outputLine = true; } // if previous filename is same, only output line - if (outputFile) - { + if (outputFile) { String_Printf(m_buffer, "#line %d \"%s\"\n", lineNumber, fileName); } - else if (outputLine) - { + else if (outputLine) { String_Printf(m_buffer, "#line %d\n", lineNumber); } } @@ -60,20 +54,18 @@ void CodeWriter::BeginLine(int indent, const char* fileName, int lineNumber) // Handle the indentation. if (indent) Write("%*s", indent * m_spacesPerIndent, ""); - + m_currentIndent = indent; - } int CodeWriter::EndLine(const char* text) { - if (text != NULL) - { + if (text != NULL) { m_buffer += text; } m_buffer += "\n"; ++m_currentLine; - + // so can EndLine/BeginLine int indent = m_currentIndent; m_currentIndent = 0; @@ -93,13 +85,13 @@ void CodeWriter::WriteLine(int indent, const char* format, ...) { if (indent) Write("%*s", indent * m_spacesPerIndent, ""); - + va_list args; va_start(args, format); int result = String_PrintfArgList(m_buffer, format, args); ASSERT(result != -1); va_end(args); - + EndLine(); } @@ -107,13 +99,13 @@ void CodeWriter::WriteLineTagged(int indent, const char* fileName, int lineNumbe { // TODO: this should make sure that line isn't already Begu BeginLine(indent, fileName, lineNumber); - + va_list args; va_start(args, format); int result = String_PrintfArgList(m_buffer, format, args); ASSERT(result != -1); va_end(args); - + EndLine(); } @@ -127,4 +119,4 @@ void CodeWriter::Reset() m_buffer.clear(); } -} +} //namespace M4 diff --git a/hlslparser/src/CodeWriter.h b/hlslparser/src/CodeWriter.h index f9d27b7e..5454fd71 100644 --- a/hlslparser/src/CodeWriter.h +++ b/hlslparser/src/CodeWriter.h @@ -14,8 +14,7 @@ // stl #include -namespace M4 -{ +namespace M4 { class Allocator; @@ -23,34 +22,29 @@ class Allocator; * This class is used for outputting code. It handles indentation and inserting #line markers * to match the desired output line numbers. */ -class CodeWriter -{ - +class CodeWriter { public: CodeWriter(); void SetWriteFileLine(bool enable) { m_writeFileLine = enable; } - + void BeginLine(int indent, const char* fileName = NULL, int lineNumber = -1); void Write(const char* format, ...) M4_PRINTF_ATTR(2, 3); int EndLine(const char* text = NULL); void WriteLine(int indent, const char* format, ...) M4_PRINTF_ATTR(3, 4); - void WriteLineTagged(int indent, const char* fileName, int lineNumber, const char* format, ...) M4_PRINTF_ATTR(5, 6) ; + void WriteLineTagged(int indent, const char* fileName, int lineNumber, const char* format, ...) 
M4_PRINTF_ATTR(5, 6); const char* GetResult() const; void Reset(); private: - - std::string m_buffer; - int m_currentLine; - const char* m_currentFileName; - int m_spacesPerIndent; - int m_currentIndent; - bool m_writeFileLine; - + std::string m_buffer; + int m_currentLine; + const char* m_currentFileName; + int m_spacesPerIndent; + int m_currentIndent; + bool m_writeFileLine; }; -} - +} //namespace M4 diff --git a/hlslparser/src/Engine.cpp b/hlslparser/src/Engine.cpp old mode 100755 new mode 100644 index 49bdd1c0..85b1c1c2 --- a/hlslparser/src/Engine.cpp +++ b/hlslparser/src/Engine.cpp @@ -1,9 +1,9 @@ #include "Engine.h" -#include // vsnprintf +#include // vsnprintf +#include // strtod, strtol #include // strcmp, strcasecmp -#include // strtod, strtol // this is usually just an unordered_map internally #include @@ -16,23 +16,22 @@ void String_Copy(char* str, const char* b, uint32_t size) { #ifdef WIN32 strncpy(str, b, size); - str[size-1] = 0; + str[size - 1] = 0; #else strlcpy(str, b, size); #endif } // This version doesn't truncate and is simpler -int String_PrintfArgList(std::string& buffer, const char * format, va_list args) { +int String_PrintfArgList(std::string& buffer, const char* format, va_list args) +{ int n = 0; - - if (!String_HasChar(format, '%')) - { + + if (!String_HasChar(format, '%')) { buffer += format; n = (uint32_t)strlen(format); } - else if (String_Equal(format, "%s")) - { + else if (String_Equal(format, "%s")) { va_list tmp; va_copy(tmp, args); const char* text = va_arg(args, const char*); @@ -40,50 +39,46 @@ int String_PrintfArgList(std::string& buffer, const char * format, va_list args) buffer += text; va_end(tmp); } - else - { + else { va_list tmp; va_copy(tmp, args); - + int len = vsnprintf(nullptr, 0, format, tmp); - if (len >= 0) - { + if (len >= 0) { size_t bufferLength = buffer.length(); - buffer.resize(bufferLength+len); - vsnprintf((char*)buffer.data() + bufferLength, len+1, format, tmp); - + buffer.resize(bufferLength + len); + vsnprintf((char*)buffer.data() + bufferLength, len + 1, format, tmp); + n = len; } va_end(tmp); } - + return n; } // This version truncates but works on stack -int String_PrintfArgList(char* buffer, int size, const char * format, va_list args) { +int String_PrintfArgList(char* buffer, int size, const char* format, va_list args) +{ int n; - - if (!String_HasChar(format, '%')) - { + + if (!String_HasChar(format, '%')) { String_Copy(buffer, format, size); - + // truncation or not n = (int)strlen(format); } - else if (String_Equal(format, "%s")) - { + else if (String_Equal(format, "%s")) { va_list tmp; va_copy(tmp, args); const char* text = va_arg(args, const char*); n = (int)strlen(text); - + // truncation String_Copy(buffer, text, size); va_end(tmp); } - else - { + else { va_list tmp; va_copy(tmp, args); @@ -92,15 +87,15 @@ int String_PrintfArgList(char* buffer, int size, const char * format, va_list ar n = vsnprintf(buffer, size, format, tmp); va_end(tmp); } - - if (n < 0 || (n+1) > size) + + if (n < 0 || (n + 1) > size) return -1; - + return n; } -int String_Printf(std::string& buffer, const char * format, ...) { - +int String_Printf(std::string& buffer, const char* format, ...) +{ va_list args; va_start(args, format); @@ -111,8 +106,8 @@ int String_Printf(std::string& buffer, const char * format, ...) { return n; } -int String_Printf(char * buffer, int size, const char * format, ...) { - +int String_Printf(char* buffer, int size, const char* format, ...) 
+{ va_list args; va_start(args, format); @@ -120,109 +115,112 @@ int String_Printf(char * buffer, int size, const char * format, ...) { va_end(args); - return n; + return n; } -int String_FormatFloat(char * buffer, int size, float value) { +int String_FormatFloat(char* buffer, int size, float value) +{ return String_Printf(buffer, size, "%.6f", value); } -bool String_HasChar(const char* str, char c) { +bool String_HasChar(const char* str, char c) +{ return strchr(str, c) != NULL; } -bool String_HasString(const char* str, const char* search) { +bool String_HasString(const char* str, const char* search) +{ return strstr(str, search) != NULL; } -bool String_Equal(const char * a, const char * b) { - if (a == b) return true; - if (a == NULL || b == NULL) return false; - return strcmp(a, b) == 0; +bool String_Equal(const char* a, const char* b) +{ + if (a == b) return true; + if (a == NULL || b == NULL) return false; + return strcmp(a, b) == 0; } -bool String_EqualNoCase(const char * a, const char * b) { - if (a == b) return true; - if (a == NULL || b == NULL) return false; +bool String_EqualNoCase(const char* a, const char* b) +{ + if (a == b) return true; + if (a == NULL || b == NULL) return false; #if _MSC_VER - return _stricmp(a, b) == 0; + return _stricmp(a, b) == 0; #else - return strcasecmp(a, b) == 0; + return strcasecmp(a, b) == 0; #endif } -double String_ToDouble(const char * str, char ** endptr) { - return strtod(str, endptr); +double String_ToDouble(const char* str, char** endptr) +{ + return strtod(str, endptr); } -float String_ToFloat(const char * str, char ** endptr) { +float String_ToFloat(const char* str, char** endptr) +{ return strtof(str, endptr); } static const int kBase10 = 10; static const int kBase16 = 16; -int32_t String_ToIntHex(const char * str, char ** endptr) { +int32_t String_ToIntHex(const char* str, char** endptr) +{ return (int)strtol(str, endptr, kBase16); } -int32_t String_ToInt(const char * str, char ** endptr) { - return (int)strtol(str, endptr, kBase10); +int32_t String_ToInt(const char* str, char** endptr) +{ + return (int)strtol(str, endptr, kBase10); } -uint32_t String_ToUint(const char * str, char ** endptr) { +uint32_t String_ToUint(const char* str, char** endptr) +{ return (int)strtoul(str, endptr, kBase10); } -uint64_t String_ToUlong(const char * str, char ** endptr) { +uint64_t String_ToUlong(const char* str, char** endptr) +{ return (int)strtoull(str, endptr, kBase10); } -int64_t String_ToLong(const char * str, char ** endptr) { +int64_t String_ToLong(const char* str, char** endptr) +{ return (int)strtoll(str, endptr, kBase10); } - - - - - - void String_StripTrailingFloatZeroes(char* buffer) { const char* dotPos = strrchr(buffer, '.'); if (dotPos == nullptr) return; - + uint32_t bufferLen = (uint32_t)strlen(buffer); - + // strip trailing zeroes - while (bufferLen > 0) - { - char& c = buffer[bufferLen-1]; - if (c == '0') - { + while (bufferLen > 0) { + char& c = buffer[bufferLen - 1]; + if (c == '0') { c = 0; bufferLen--; } - else - { + else { break; } } - + // This breaks appending h to a number in MSL // strip the period (only for MSL) -// char& c = buffer[bufferLen-1]; -// if (dotPos == &c) -// { -// c = 0; -// bufferLen--; -// } + // char& c = buffer[bufferLen-1]; + // if (dotPos == &c) + // { + // c = 0; + // bufferLen--; + // } } // Engine/Log.cpp -void Log_Error(const char * format, ...) +void Log_Error(const char* format, ...) { va_list args; va_start(args, format); @@ -230,82 +228,83 @@ void Log_Error(const char * format, ...) 
va_end(args); } -void Log_ErrorArgList(const char * format, va_list args, const char* filename, uint32_t line) +void Log_ErrorArgList(const char* format, va_list args, const char* filename, uint32_t line) { va_list tmp; va_copy(tmp, args); - + // Not thread-safe static std::string buffer; buffer.clear(); String_PrintfArgList(buffer, format, tmp); - + // TODO: this doesn't work on Win/Android // use a real log abstraction to ODS/etc from Kram if (filename) - fprintf( stderr, "%s:%d: error: %s", filename, line, buffer.c_str()); + fprintf(stderr, "%s:%d: error: %s", filename, line, buffer.c_str()); else - fprintf( stderr, "error: %s", buffer.c_str()); - - va_end(tmp); + fprintf(stderr, "error: %s", buffer.c_str()); + va_end(tmp); } - // Engine/StringPool.cpp using StringPoolSet = std::unordered_set; #define CastImpl(imp) (StringPoolSet*)imp -StringPool::StringPool(Allocator * allocator) { +StringPool::StringPool(Allocator* allocator) +{ // NOTE: allocator not used - + m_impl = new StringPoolSet(); } -StringPool::~StringPool() { +StringPool::~StringPool() +{ auto* impl = CastImpl(m_impl); - + // delete the strings for (auto it : *impl) { const char* text = it; free((char*)text); } - + delete impl; } -const char * StringPool::AddString(const char * text) { +const char* StringPool::AddString(const char* text) +{ auto* impl = CastImpl(m_impl); auto it = impl->find(text); if (it != impl->end()) return *it; - + // _strdup doesn't go through allocator either #if _MSC_VER - const char * dup = _strdup(text); + const char* dup = _strdup(text); #else - const char * dup = strdup(text); + const char* dup = strdup(text); #endif - + impl->insert(dup); return dup; } -const char* StringPool::PrintFormattedVaList(const char* fmt, va_list args) { +const char* StringPool::PrintFormattedVaList(const char* fmt, va_list args) +{ char* res = nullptr; - + va_list tmp; // va_copy needed? va_copy(tmp, args); - + // just call 2x, once for len int len = vsnprintf(nullptr, 0, fmt, tmp); - if (len >= 0) - { - res = (char*)malloc(len+1); - vsnprintf(res, len+1, fmt, tmp); + if (len >= 0) { + res = (char*)malloc(len + 1); + vsnprintf(res, len + 1, fmt, tmp); } va_end(tmp); @@ -313,41 +312,43 @@ const char* StringPool::PrintFormattedVaList(const char* fmt, va_list args) { return res; } -const char * StringPool::AddStringFormatList(const char * format, va_list args) { +const char* StringPool::AddStringFormatList(const char* format, va_list args) +{ // don't format if no tokens va_list tmp; va_copy(tmp, args); - const char * text = PrintFormattedVaList(format, tmp); + const char* text = PrintFormattedVaList(format, tmp); va_end(tmp); auto* impl = CastImpl(m_impl); - + // add it if not found auto it = impl->find(text); - if (it == impl->end()) - { + if (it == impl->end()) { impl->insert(text); return text; } - + // allocated inside PrintFormattedVaList free((char*)text); return *it; } -const char * StringPool::AddStringFormat(const char * format, ...) { +const char* StringPool::AddStringFormat(const char* format, ...) 
+{ // TODO: don't format if no tokens va_list args; va_start(args, format); - const char * string = AddStringFormatList(format, args); + const char* string = AddStringFormatList(format, args); va_end(args); return string; } -bool StringPool::GetContainsString(const char * text) const { +bool StringPool::GetContainsString(const char* text) const +{ const auto* impl = CastImpl(m_impl); return impl->find(text) != impl->end(); } -} // M4 namespace +} //namespace M4 diff --git a/hlslparser/src/Engine.h b/hlslparser/src/Engine.h old mode 100755 new mode 100644 index ea3eeb2c..43535cd9 --- a/hlslparser/src/Engine.h +++ b/hlslparser/src/Engine.h @@ -6,13 +6,14 @@ #include // va_list, vsnprintf #include // malloc + #include // for placement new // stl #include #ifndef NULL -#define NULL 0 +#define NULL 0 #endif #ifndef va_copy @@ -33,7 +34,6 @@ namespace M4 { - // Engine/Allocator.h // This doesn't do placement new/delete, but is only @@ -42,50 +42,55 @@ namespace M4 { // there default ctor variable initializers are safe to use. class Allocator { public: - template T * New() { - return (T *)malloc(sizeof(T)); + template + T* New() + { + return (T*)malloc(sizeof(T)); } - template T * New(size_t count) { - return (T *)malloc(sizeof(T) * count); + template + T* New(size_t count) + { + return (T*)malloc(sizeof(T) * count); } - template void Delete(T * ptr) { - free((void *)ptr); + template + void Delete(T* ptr) + { + free((void*)ptr); } - template T * Realloc(T * ptr, size_t count) { - return (T *)realloc(ptr, sizeof(T) * count); + template + T* Realloc(T* ptr, size_t count) + { + return (T*)realloc(ptr, sizeof(T) * count); } }; - // Engine/String.h +int String_FormatFloat(char* buffer, int size, float value); +bool String_Equal(const char* a, const char* b); +bool String_EqualNoCase(const char* a, const char* b); - -int String_FormatFloat(char * buffer, int size, float value); -bool String_Equal(const char * a, const char * b); -bool String_EqualNoCase(const char * a, const char * b); - -double String_ToDouble(const char * str, char ** end); -float String_ToFloat(const char * str, char ** end); +double String_ToDouble(const char* str, char** end); +float String_ToFloat(const char* str, char** end); // no half -int32_t String_ToIntHex(const char * str, char ** end); -int32_t String_ToInt(const char * str, char ** end); -uint32_t String_ToUint(const char * str, char ** end); +int32_t String_ToIntHex(const char* str, char** end); +int32_t String_ToInt(const char* str, char** end); +uint32_t String_ToUint(const char* str, char** end); -uint64_t String_ToUlong(const char * str, char ** end); -int64_t String_ToLong(const char * str, char ** end); +uint64_t String_ToUlong(const char* str, char** end); +int64_t String_ToLong(const char* str, char** end); bool String_HasChar(const char* str, char c); bool String_HasString(const char* str, const char* search); // just use these, it's way easier than using fixed buffers -int String_PrintfArgList(std::string& buffer, const char * format, va_list args); -int String_Printf(std::string& buffer, const char * format, ...) M4_PRINTF_ATTR(2, 3); +int String_PrintfArgList(std::string& buffer, const char* format, va_list args); +int String_Printf(std::string& buffer, const char* format, ...) M4_PRINTF_ATTR(2, 3); // These 3 calls have truncation issues -int String_Printf(char * buffer, int size, const char * format, ...) 
M4_PRINTF_ATTR(3, 4); -int String_PrintfArgList(char * buffer, int size, const char * format, va_list args); +int String_Printf(char* buffer, int size, const char* format, ...) M4_PRINTF_ATTR(3, 4); +int String_PrintfArgList(char* buffer, int size, const char* format, va_list args); void String_Copy(char* str, const char* b, uint32_t size); void String_StripTrailingFloatZeroes(char* buffer); @@ -94,23 +99,24 @@ void String_StripTrailingFloatZeroes(char* buffer); // case sensitive fnv1a hash, can pass existing hash to continue a hash inline uint32_t HashFnv1a(const char* val, uint32_t hash = 0x811c9dc5) { - const uint32_t prime = 0x01000193; // 16777619 (32-bit) - while (*val) - { + const uint32_t prime = 0x01000193; // 16777619 (32-bit) + while (*val) { hash = (hash * prime) ^ (uint32_t)*val++; } return hash; } // this compares string stored as const char* -struct CompareAndHandStrings -{ +struct CompareAndHandStrings { template bool operator()(const _Tp& __x, const _Tp& __y) const - { return String_Equal( __x, __y ); } - + { + return String_Equal(__x, __y); + } + template - size_t operator()(const _Tp& __x) const { + size_t operator()(const _Tp& __x) const + { // assumes 32-bit hash to int64 conversion here return (size_t)HashFnv1a(__x); } @@ -118,42 +124,44 @@ struct CompareAndHandStrings // Engine/Log.h -void Log_Error(const char * format, ...) M4_PRINTF_ATTR(1, 2); - -void Log_ErrorArgList(const char * format, va_list args, const char* filename = NULL, uint32_t line = 0); +void Log_Error(const char* format, ...) M4_PRINTF_ATTR(1, 2); +void Log_ErrorArgList(const char* format, va_list args, const char* filename = NULL, uint32_t line = 0); // Engine/Array.h template -void ConstructRange(T * buffer, int new_size, int old_size) { +void ConstructRange(T* buffer, int new_size, int old_size) +{ for (int i = old_size; i < new_size; i++) { - new(buffer+i) T; // placement new + new (buffer + i) T; // placement new } } template -void ConstructRange(T * buffer, int new_size, int old_size, const T & val) { +void ConstructRange(T* buffer, int new_size, int old_size, const T& val) +{ for (int i = old_size; i < new_size; i++) { - new(buffer+i) T(val); // placement new + new (buffer + i) T(val); // placement new } } template -void DestroyRange(T * buffer, int new_size, int old_size) { +void DestroyRange(T* buffer, int new_size, int old_size) +{ for (int i = new_size; i < old_size; i++) { - (buffer+i)->~T(); // Explicit call to the destructor + (buffer + i)->~T(); // Explicit call to the destructor } } - template class Array { public: - Array(Allocator * allocator) : allocator(allocator), buffer(NULL), size(0), capacity(0) {} + Array(Allocator* allocator) : allocator(allocator), buffer(NULL), size(0), capacity(0) {} - void PushBack(const T & val) { - ASSERT(&val < buffer || &val >= buffer+size); + void PushBack(const T& val) + { + ASSERT(&val < buffer || &val >= buffer + size); int old_size = size; int new_size = size + 1; @@ -162,7 +170,8 @@ class Array { ConstructRange(buffer, new_size, old_size, val); } - T & PushBackNew() { + T& PushBackNew() + { int old_size = size; int new_size = size + 1; @@ -172,7 +181,8 @@ class Array { return buffer[old_size]; } - void Resize(int new_size) { + void Resize(int new_size) + { int old_size = size; DestroyRange(buffer, new_size, old_size); @@ -183,13 +193,21 @@ class Array { } int GetSize() const { return size; } - const T & operator[](int i) const { ASSERT(i < size); return buffer[i]; } - T & operator[](int i) { ASSERT(i < size); return buffer[i]; } + const T& 
operator[](int i) const + { + ASSERT(i < size); + return buffer[i]; + } + T& operator[](int i) + { + ASSERT(i < size); + return buffer[i]; + } private: - // Change array size. - void SetSize(int new_size) { + void SetSize(int new_size) + { size = new_size; if (new_size > capacity) { @@ -208,7 +226,8 @@ class Array { } // Change array capacity. - void SetCapacity(int new_capacity) { + void SetCapacity(int new_capacity) + { ASSERT(new_capacity >= size); if (new_capacity == 0) { @@ -226,30 +245,28 @@ class Array { capacity = new_capacity; } - private: - Allocator * allocator; // @@ Do we really have to keep a pointer to this? - T * buffer; + Allocator* allocator; // @@ Do we really have to keep a pointer to this? + T* buffer; int size; int capacity; }; - // Engine/StringPool.h // @@ Implement this with a hash table! struct StringPool { - StringPool(Allocator * allocator); + StringPool(Allocator* allocator); ~StringPool(); - const char * AddString(const char * text); - const char * AddStringFormat(const char * fmt, ...) M4_PRINTF_ATTR(2, 3); - const char * AddStringFormatList(const char * fmt, va_list args); - bool GetContainsString(const char * text) const; + const char* AddString(const char* text); + const char* AddStringFormat(const char* fmt, ...) M4_PRINTF_ATTR(2, 3); + const char* AddStringFormatList(const char* fmt, va_list args); + bool GetContainsString(const char* text) const; + private: - const char*PrintFormattedVaList(const char* fmt, va_list args); + const char* PrintFormattedVaList(const char* fmt, va_list args); void* m_impl = NULL; }; - -} // M4 namespace +} //namespace M4 diff --git a/hlslparser/src/HLSLGenerator.cpp b/hlslparser/src/HLSLGenerator.cpp index 2565098c..9c78e26c 100644 --- a/hlslparser/src/HLSLGenerator.cpp +++ b/hlslparser/src/HLSLGenerator.cpp @@ -13,8 +13,7 @@ #include "HLSLParser.h" #include "HLSLTree.h" -namespace M4 -{ +namespace M4 { const char* HLSLGenerator::GetTypeName(const HLSLType& type) { @@ -23,45 +22,42 @@ const char* HLSLGenerator::GetTypeName(const HLSLType& type) // number bool isHalfNumerics = promote && !m_options.treatHalfAsFloat; HLSLBaseType baseType = type.baseType; - + // Note: these conversions should really be done during parsing // so that casting gets applied. if (!isHalfNumerics) baseType = HalfToFloatBaseType(baseType); - + // MSL doesn't support double, and many HLSL cards don't either. //if (IsDouble(baseType)) // baseType = DoubleToFloatBaseType(baseType); - + HLSLType remappedType(type); remappedType.baseType = baseType; - + // DONE: these can all just use a table entry, have another slot for MSL // Functions can return void, especially with compute if (IsTextureType(baseType) || IsSamplerType(baseType) || IsNumericType(baseType) || baseType == HLSLBaseType_Void || baseType == HLSLBaseType_UserDefined) return GetTypeNameHLSL(remappedType); - + Error("Unknown type"); return NULL; } // TODO: copied from MSLGenerator // @@ We could be a lot smarter removing parenthesis based on the operator precedence of the parent expression. -static bool NeedsParenthesis(HLSLExpression* expression, HLSLExpression* parentExpression) { - +static bool NeedsParenthesis(HLSLExpression* expression, HLSLExpression* parentExpression) +{ // For now we just omit the parenthesis if there's no parent expression. - if (parentExpression == NULL) - { + if (parentExpression == NULL) { return false; } // One more special case that's pretty common. 
- if (parentExpression->nodeType == HLSLNodeType_MemberAccess) - { + if (parentExpression->nodeType == HLSLNodeType_MemberAccess) { if (expression->nodeType == HLSLNodeType_IdentifierExpression || expression->nodeType == HLSLNodeType_ArrayAccess || - expression->nodeType == HLSLNodeType_MemberAccess) - { + expression->nodeType == HLSLNodeType_MemberAccess) { return false; } } @@ -89,62 +85,49 @@ static int GetFunctionArguments(HLSLFunctionCall* functionCall, HLSLExpression* HLSLGenerator::HLSLGenerator() { - m_tree = NULL; - m_entryName = NULL; - m_target = HLSLTarget_VertexShader; - m_isInsideBuffer = false; - m_error = false; + m_tree = NULL; + m_entryName = NULL; + m_target = HLSLTarget_VertexShader; + m_isInsideBuffer = false; + m_error = false; } // @@ We need a better way of doing semantic replacement: // - Look at the function being generated. // - Return semantic, semantics associated to fields of the return structure, or output arguments, or fields of structures associated to output arguments -> output semantic replacement. // - Semantics associated input arguments or fields of the input arguments -> input semantic replacement. -static const char * TranslateSemantic(const char* semantic, bool output, HLSLTarget target) +static const char* TranslateSemantic(const char* semantic, bool output, HLSLTarget target) { // Note: these are all just passthrough of the DX10 semantics // except for BASEVERTEX/INSTANCE which doesn't seem to dxc compile. - - if (target == HLSLTarget_VertexShader) - { - if (output) - { + if (target == HLSLTarget_VertexShader) { + if (output) { } else { // see here for sample of builtin notation // https://github.com/microsoft/DirectXShaderCompiler/commit/b6fe9886ad - + // Vulkan/MSL only, requires ext DrawParameters // [[vk::builtin(\"BaseVertex\")]] uint baseVertex : // [[vk::builtin(\"BaseInstance\")]] uint instance : SV_BaseInstance - + if (String_Equal(semantic, "BASEVERTEX")) - return "BaseVertex"; // vulkan only + return "BaseVertex"; // vulkan only if (String_Equal(semantic, "BASEINSTANCE")) - return "BaseInstance"; // vulkan only + return "BaseInstance"; // vulkan only } } - else if (target == HLSLTarget_PixelShader) - { - if (output) - { - + else if (target == HLSLTarget_PixelShader) { + if (output) { } - else - { - + else { } } - else if (target == HLSLTarget_ComputeShader) - { - if (output) - { - + else if (target == HLSLTarget_ComputeShader) { + if (output) { } - else - { - + else { } } return NULL; @@ -155,8 +138,7 @@ void HLSLGenerator::Error(const char* format, ...) // It's not always convenient to stop executing when an error occurs, // so just track once we've hit an error and stop reporting them until // we successfully bail out of execution. - if (m_error) - { + if (m_error) { return; } m_error = true; @@ -169,21 +151,19 @@ void HLSLGenerator::Error(const char* format, ...) 
bool HLSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entryName, const HLSLOptions& options) { - - m_tree = tree; + m_tree = tree; m_entryName = entryName; - m_target = target; + m_target = target; m_isInsideBuffer = false; - m_options = options; + m_options = options; m_writer.SetWriteFileLine(options.writeFileLine); - + m_writer.Reset(); // Find entry point function HLSLFunction* entryFunction = tree->FindFunction(entryName); - if (entryFunction == NULL) - { + if (entryFunction == NULL) { Error("Entry point '%s' doesn't exist\n", entryName); return false; } @@ -191,88 +171,90 @@ bool HLSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entr // PruneTree resets hidden flags to true, then marks visible elements // based on whether entry point visits them. PruneTree(tree, entryFunction->name); // Note: takes second entry - + // This sorts tree by type, but keeps ordering SortTree(tree); - + // This strips any unused inputs to the entry point function HideUnusedArguments(entryFunction); - + // Is this needed FlattenExpressions(tree); - + m_writer.WriteLine(0, "#include \"ShaderHLSL.h\""); - + // @@ Should we generate an entirely new copy of the tree so that we can modify it in place? //if (!legacy) { - HLSLFunction * function = tree->FindFunction(entryName); + HLSLFunction* function = tree->FindFunction(entryName); // Handle return value semantics if (function->semantic != NULL) { function->sv_semantic = TranslateSemantic(function->semantic, /*output=*/true, target); } if (function->returnType.baseType == HLSLBaseType_UserDefined) { - HLSLStruct * s = tree->FindGlobalStruct(function->returnType.typeName); + HLSLStruct* s = tree->FindGlobalStruct(function->returnType.typeName); - HLSLStructField * sv_fields = NULL; + HLSLStructField* sv_fields = NULL; - HLSLStructField * lastField = NULL; - HLSLStructField * field = s->field; + HLSLStructField* lastField = NULL; + HLSLStructField* field = s->field; while (field) { - HLSLStructField * nextField = field->nextField; + HLSLStructField* nextField = field->nextField; // TODO: may have to be careful with SV_Position, since this puts // those last. SSBO won't use those semantics, so should be okay. - + if (field->semantic) { - field->hidden = false; + field->hidden = false; field->sv_semantic = TranslateSemantic(field->semantic, /*output=*/true, target); - // Fields with SV semantics are stored at the end to avoid linkage problems. - if (field->sv_semantic != NULL) { - // Unlink from last. - if (lastField != NULL) lastField->nextField = nextField; - else s->field = nextField; - - // Add to sv_fields. - field->nextField = sv_fields; - sv_fields = field; - } + // Fields with SV semantics are stored at the end to avoid linkage problems. + if (field->sv_semantic != NULL) { + // Unlink from last. + if (lastField != NULL) + lastField->nextField = nextField; + else + s->field = nextField; + + // Add to sv_fields. + field->nextField = sv_fields; + sv_fields = field; + } } - if (field != sv_fields) lastField = field; + if (field != sv_fields) lastField = field; field = nextField; } - // Append SV fields at the end. - if (sv_fields != NULL) { - if (lastField == NULL) { - s->field = sv_fields; - } - else { - ASSERT(lastField->nextField == NULL); - lastField->nextField = sv_fields; - } - } + // Append SV fields at the end. 
+ if (sv_fields != NULL) { + if (lastField == NULL) { + s->field = sv_fields; + } + else { + ASSERT(lastField->nextField == NULL); + lastField->nextField = sv_fields; + } + } } // Handle argument semantics. // @@ It would be nice to flag arguments that are used by the program and skip or hide the unused ones. - HLSLArgument * argument = function->argument; + HLSLArgument* argument = function->argument; while (argument) { bool output = argument->modifier == HLSLArgumentModifier_Out; if (argument->semantic) { - argument->sv_semantic = TranslateSemantic(argument->semantic, output, target); + argument->sv_semantic = TranslateSemantic(argument->semantic, output, target); } if (argument->type.baseType == HLSLBaseType_UserDefined) { - HLSLStruct * s = tree->FindGlobalStruct(argument->type.typeName); + HLSLStruct* s = tree->FindGlobalStruct(argument->type.typeName); - HLSLStructField * field = s->field; + HLSLStructField* field = s->field; while (field) { if (field->semantic) { - field->hidden = false; + field->hidden = false; field->sv_semantic = TranslateSemantic(field->semantic, output, target); } @@ -283,7 +265,7 @@ bool HLSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entr argument = argument->nextArgument; } } - + HLSLRoot* root = m_tree->GetRoot(); OutputStatements(0, root->statement); @@ -299,10 +281,8 @@ const char* HLSLGenerator::GetResult() const void HLSLGenerator::OutputExpressionList(HLSLExpression* expression) { int numExpressions = 0; - while (expression != NULL) - { - if (numExpressions > 0) - { + while (expression != NULL) { + if (numExpressions > 0) { m_writer.Write(", "); } OutputExpression(expression); @@ -311,154 +291,196 @@ void HLSLGenerator::OutputExpressionList(HLSLExpression* expression) } } - - void HLSLGenerator::OutputExpression(HLSLExpression* expression) { - if (expression->nodeType == HLSLNodeType_IdentifierExpression) - { + if (expression->nodeType == HLSLNodeType_IdentifierExpression) { HLSLIdentifierExpression* identifierExpression = static_cast(expression); const char* name = identifierExpression->name; - + m_writer.Write("%s", name); } - else if (expression->nodeType == HLSLNodeType_CastingExpression) - { + else if (expression->nodeType == HLSLNodeType_CastingExpression) { HLSLCastingExpression* castingExpression = static_cast(expression); m_writer.Write("("); // OutputDeclaration(castingExpression->type, ""); // old - adds space after type - OutputDeclarationType(castingExpression->type, true/*isTypeCast*/); // new + OutputDeclarationType(castingExpression->type, true /*isTypeCast*/); // new m_writer.Write(")"); - + // These parens may not be needed m_writer.Write("("); OutputExpression(castingExpression->expression); m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_ConstructorExpression) - { + else if (expression->nodeType == HLSLNodeType_ConstructorExpression) { HLSLConstructorExpression* constructorExpression = static_cast(expression); m_writer.Write("%s(", GetTypeName(constructorExpression->type)); OutputExpressionList(constructorExpression->argument); m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_LiteralExpression) - { + else if (expression->nodeType == HLSLNodeType_LiteralExpression) { HLSLLiteralExpression* literalExpression = static_cast(expression); - + HLSLBaseType type = literalExpression->type; if (m_options.treatHalfAsFloat && IsHalf(type)) type = HLSLBaseType_Float; - - switch (type) - { - case HLSLBaseType_Half: - case HLSLBaseType_Float: - case HLSLBaseType_Double: - { + + 
switch (type) { + case HLSLBaseType_Half: + case HLSLBaseType_Float: + case HLSLBaseType_Double: { // Don't use printf directly so that we don't use the system locale. char buffer[64]; String_FormatFloat(buffer, sizeof(buffer), literalExpression->fValue); String_StripTrailingFloatZeroes(buffer); - m_writer.Write("%s%s", buffer, type == HLSLBaseType_Half ? "h" : "" ); - } - break; - - case HLSLBaseType_Short: - case HLSLBaseType_Ulong: - case HLSLBaseType_Int: - m_writer.Write("%d", literalExpression->iValue); - break; - // TODO: missing uint, u/short, u/long double - - case HLSLBaseType_Bool: - m_writer.Write("%s", literalExpression->bValue ? "true" : "false"); - break; - default: - Error("Unhandled literal"); - //ASSERT(false); + m_writer.Write("%s%s", buffer, type == HLSLBaseType_Half ? "h" : ""); + } break; + + case HLSLBaseType_Short: + case HLSLBaseType_Ulong: + case HLSLBaseType_Int: + m_writer.Write("%d", literalExpression->iValue); + break; + // TODO: missing uint, u/short, u/long double + + case HLSLBaseType_Bool: + m_writer.Write("%s", literalExpression->bValue ? "true" : "false"); + break; + default: + Error("Unhandled literal"); + //ASSERT(false); } } - else if (expression->nodeType == HLSLNodeType_UnaryExpression) - { + else if (expression->nodeType == HLSLNodeType_UnaryExpression) { HLSLUnaryExpression* unaryExpression = static_cast(expression); const char* op = "?"; bool pre = true; - switch (unaryExpression->unaryOp) - { - case HLSLUnaryOp_Negative: op = "-"; break; - case HLSLUnaryOp_Positive: op = "+"; break; - case HLSLUnaryOp_Not: op = "!"; break; - case HLSLUnaryOp_PreIncrement: op = "++"; break; - case HLSLUnaryOp_PreDecrement: op = "--"; break; - case HLSLUnaryOp_PostIncrement: op = "++"; pre = false; break; - case HLSLUnaryOp_PostDecrement: op = "--"; pre = false; break; - case HLSLUnaryOp_BitNot: op = "~"; break; + switch (unaryExpression->unaryOp) { + case HLSLUnaryOp_Negative: + op = "-"; + break; + case HLSLUnaryOp_Positive: + op = "+"; + break; + case HLSLUnaryOp_Not: + op = "!"; + break; + case HLSLUnaryOp_PreIncrement: + op = "++"; + break; + case HLSLUnaryOp_PreDecrement: + op = "--"; + break; + case HLSLUnaryOp_PostIncrement: + op = "++"; + pre = false; + break; + case HLSLUnaryOp_PostDecrement: + op = "--"; + pre = false; + break; + case HLSLUnaryOp_BitNot: + op = "~"; + break; } - + // eliminate () if pure characters bool addParenthesis = NeedsParenthesis(unaryExpression->expression, expression); if (addParenthesis) m_writer.Write("("); - - if (pre) - { + + if (pre) { m_writer.Write("%s", op); OutputExpression(unaryExpression->expression); } - else - { + else { OutputExpression(unaryExpression->expression); m_writer.Write("%s", op); } if (addParenthesis) m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_BinaryExpression) - { + else if (expression->nodeType == HLSLNodeType_BinaryExpression) { HLSLBinaryExpression* binaryExpression = static_cast(expression); - + // TODO: to fix this need to pass in parentExpression to // the call. And MSLGenerator passes NULL for most of these. 
// TODO: eliminate () if pure characters - + bool addParenthesis = false; // NeedsParenthesis(expression, parentExpression); if (addParenthesis) m_writer.Write("("); - + OutputExpression(binaryExpression->expression1); const char* op = "?"; - switch (binaryExpression->binaryOp) - { - case HLSLBinaryOp_Add: op = " + "; break; - case HLSLBinaryOp_Sub: op = " - "; break; - case HLSLBinaryOp_Mul: op = " * "; break; - case HLSLBinaryOp_Div: op = " / "; break; - case HLSLBinaryOp_Less: op = " < "; break; - case HLSLBinaryOp_Greater: op = " > "; break; - case HLSLBinaryOp_LessEqual: op = " <= "; break; - case HLSLBinaryOp_GreaterEqual: op = " >= "; break; - case HLSLBinaryOp_Equal: op = " == "; break; - case HLSLBinaryOp_NotEqual: op = " != "; break; - case HLSLBinaryOp_Assign: op = " = "; break; - case HLSLBinaryOp_AddAssign: op = " += "; break; - case HLSLBinaryOp_SubAssign: op = " -= "; break; - case HLSLBinaryOp_MulAssign: op = " *= "; break; - case HLSLBinaryOp_DivAssign: op = " /= "; break; - case HLSLBinaryOp_And: op = " && "; break; - case HLSLBinaryOp_Or: op = " || "; break; - case HLSLBinaryOp_BitAnd: op = " & "; break; - case HLSLBinaryOp_BitOr: op = " | "; break; - case HLSLBinaryOp_BitXor: op = " ^ "; break; - default: - Error("Unhandled binary op"); - //ASSERT(false); + switch (binaryExpression->binaryOp) { + case HLSLBinaryOp_Add: + op = " + "; + break; + case HLSLBinaryOp_Sub: + op = " - "; + break; + case HLSLBinaryOp_Mul: + op = " * "; + break; + case HLSLBinaryOp_Div: + op = " / "; + break; + case HLSLBinaryOp_Less: + op = " < "; + break; + case HLSLBinaryOp_Greater: + op = " > "; + break; + case HLSLBinaryOp_LessEqual: + op = " <= "; + break; + case HLSLBinaryOp_GreaterEqual: + op = " >= "; + break; + case HLSLBinaryOp_Equal: + op = " == "; + break; + case HLSLBinaryOp_NotEqual: + op = " != "; + break; + case HLSLBinaryOp_Assign: + op = " = "; + break; + case HLSLBinaryOp_AddAssign: + op = " += "; + break; + case HLSLBinaryOp_SubAssign: + op = " -= "; + break; + case HLSLBinaryOp_MulAssign: + op = " *= "; + break; + case HLSLBinaryOp_DivAssign: + op = " /= "; + break; + case HLSLBinaryOp_And: + op = " && "; + break; + case HLSLBinaryOp_Or: + op = " || "; + break; + case HLSLBinaryOp_BitAnd: + op = " & "; + break; + case HLSLBinaryOp_BitOr: + op = " | "; + break; + case HLSLBinaryOp_BitXor: + op = " ^ "; + break; + default: + Error("Unhandled binary op"); + //ASSERT(false); } m_writer.Write("%s", op); OutputExpression(binaryExpression->expression2); if (addParenthesis) m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_ConditionalExpression) - { + else if (expression->nodeType == HLSLNodeType_ConditionalExpression) { HLSLConditionalExpression* conditionalExpression = static_cast(expression); - + // TODO: eliminate () if pure characters m_writer.Write("(("); OutputExpression(conditionalExpression->condition); @@ -468,56 +490,50 @@ void HLSLGenerator::OutputExpression(HLSLExpression* expression) OutputExpression(conditionalExpression->falseExpression); m_writer.Write("))"); } - else if (expression->nodeType == HLSLNodeType_MemberAccess) - { + else if (expression->nodeType == HLSLNodeType_MemberAccess) { HLSLMemberAccess* memberAccess = static_cast(expression); - + bool addParenthesis = NeedsParenthesis(memberAccess->object, expression); - + // eliminate () if pure characters - if ( addParenthesis ) m_writer.Write("("); + if (addParenthesis) m_writer.Write("("); OutputExpression(memberAccess->object); - if ( addParenthesis ) m_writer.Write(")"); + if 
(addParenthesis) m_writer.Write(")"); m_writer.Write(".%s", memberAccess->field); } - else if (expression->nodeType == HLSLNodeType_ArrayAccess) - { + else if (expression->nodeType == HLSLNodeType_ArrayAccess) { HLSLArrayAccess* arrayAccess = static_cast(expression); OutputExpression(arrayAccess->array); m_writer.Write("["); OutputExpression(arrayAccess->index); m_writer.Write("]"); } - else if (expression->nodeType == HLSLNodeType_FunctionCall) - { + else if (expression->nodeType == HLSLNodeType_FunctionCall) { HLSLFunctionCall* functionCall = static_cast(expression); const char* name = functionCall->function->name; m_writer.Write("%s(", name); OutputExpressionList(functionCall->argument); m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_MemberFunctionCall) - { + else if (expression->nodeType == HLSLNodeType_MemberFunctionCall) { HLSLMemberFunctionCall* functionCall = static_cast(expression); - + // Spriv only supports fp32 or i32/i64 OpTypeImage - if (IsHalf(functionCall->function->returnType.baseType) && m_options.writeVulkan) - { + if (IsHalf(functionCall->function->returnType.baseType) && m_options.writeVulkan) { // TODO: may need parens m_writer.Write("(half4)"); } - + // Write out the member identifier m_writer.Write("%s.", functionCall->memberIdentifier->name); - + // Same as FunctionCall const char* name = functionCall->function->name; m_writer.Write("%s(", name); OutputExpressionList(functionCall->argument); m_writer.Write(")"); } - else - { + else { Error("unknown expression"); } } @@ -525,19 +541,16 @@ void HLSLGenerator::OutputExpression(HLSLExpression* expression) void HLSLGenerator::OutputArguments(HLSLArgument* argument) { int numArgs = 0; - while (argument != NULL) - { - if (numArgs > 0) - { + while (argument != NULL) { + if (numArgs > 0) { int indent = m_writer.EndLine(","); m_writer.BeginLine(indent); } - const char * semantic = argument->sv_semantic ? argument->sv_semantic : argument->semantic; + const char* semantic = argument->sv_semantic ? 
argument->sv_semantic : argument->semantic; // Have to inject vulkan - if (semantic && m_options.writeVulkan) - { + if (semantic && m_options.writeVulkan) { if (String_Equal(semantic, "PSIZE")) m_writer.Write("%s ", "[[vk::builtin(\"PointSize\")]]"); else if (String_Equal(semantic, "BaseVertex")) @@ -545,34 +558,33 @@ void HLSLGenerator::OutputArguments(HLSLArgument* argument) else if (String_Equal(semantic, "BaseInstance")) m_writer.Write("%s ", "[[vk::builtin(\"BaseInstance\")]]"); } - + // Then modifier - switch (argument->modifier) - { - case HLSLArgumentModifier_In: - m_writer.Write("in "); - break; - case HLSLArgumentModifier_Out: - m_writer.Write("out "); - break; - case HLSLArgumentModifier_Inout: - m_writer.Write("inout "); - break; - case HLSLArgumentModifier_Uniform: - m_writer.Write("uniform "); - break; - default: - break; + switch (argument->modifier) { + case HLSLArgumentModifier_In: + m_writer.Write("in "); + break; + case HLSLArgumentModifier_Out: + m_writer.Write("out "); + break; + case HLSLArgumentModifier_Inout: + m_writer.Write("inout "); + break; + case HLSLArgumentModifier_Uniform: + m_writer.Write("uniform "); + break; + default: + break; } - + OutputDeclaration(argument->type, argument->name, semantic, /*registerName=*/NULL, argument->defaultValue); - + argument = argument->nextArgument; ++numArgs; } } -static const char * GetAttributeName(HLSLAttributeType attributeType) +static const char* GetAttributeName(HLSLAttributeType attributeType) { if (attributeType == HLSLAttributeType_Unroll) return "unroll"; if (attributeType == HLSLAttributeType_Branch) return "branch"; @@ -582,12 +594,10 @@ static const char * GetAttributeName(HLSLAttributeType attributeType) void HLSLGenerator::OutputAttributes(int indent, HLSLAttribute* attribute) { - while (attribute != NULL) - { - const char * attributeName = GetAttributeName(attribute->attributeType); - - if (attributeName != NULL) - { + while (attribute != NULL) { + const char* attributeName = GetAttributeName(attribute->attributeType); + + if (attributeName != NULL) { m_writer.WriteLineTagged(indent, attribute->fileName, attribute->line, "[%s]", attributeName); } @@ -598,93 +608,95 @@ void HLSLGenerator::OutputAttributes(int indent, HLSLAttribute* attribute) static const char* BufferTypeToName(HLSLBufferType bufferType) { const char* name = ""; - switch(bufferType) - { - case HLSLBufferType_CBuffer: name = "cbuffer"; break; - case HLSLBufferType_TBuffer: name = "tbuffer"; break; - - case HLSLBufferType_ConstantBuffer: name = "ConstantBuffer"; break; - case HLSLBufferType_StructuredBuffer: name = "StructuredBuffer"; break; - case HLSLBufferType_RWStructuredBuffer: name = "RWStructuredBuffer"; break; - case HLSLBufferType_ByteAddressBuffer: name = "ByteAddressBuffer"; break; - case HLSLBufferType_RWByteAddressBuffer: name = "RWByteAddresssBuffer"; break; + switch (bufferType) { + case HLSLBufferType_CBuffer: + name = "cbuffer"; + break; + case HLSLBufferType_TBuffer: + name = "tbuffer"; + break; + + case HLSLBufferType_ConstantBuffer: + name = "ConstantBuffer"; + break; + case HLSLBufferType_StructuredBuffer: + name = "StructuredBuffer"; + break; + case HLSLBufferType_RWStructuredBuffer: + name = "RWStructuredBuffer"; + break; + case HLSLBufferType_ByteAddressBuffer: + name = "ByteAddressBuffer"; + break; + case HLSLBufferType_RWByteAddressBuffer: + name = "RWByteAddresssBuffer"; + break; } - + return name; } bool HLSLGenerator::CanSkipWrittenStatement(const HLSLStatement* statement) const { if (!statement->written) return 
false; - + // only write these once for multi-entrypoint if (statement->nodeType == HLSLNodeType_Comment || - statement->nodeType == HLSLNodeType_Buffer || - statement->nodeType == HLSLNodeType_Struct) + statement->nodeType == HLSLNodeType_Buffer || + statement->nodeType == HLSLNodeType_Struct) return true; - + // only write const scalars out once, so they don't conflict - if (statement->nodeType == HLSLNodeType_Declaration) - { + if (statement->nodeType == HLSLNodeType_Declaration) { const HLSLDeclaration* decl = (const HLSLDeclaration*)statement; - if (IsScalarType(decl->type.baseType) && decl->type.flags & HLSLTypeFlag_Const) - { + if (IsScalarType(decl->type.baseType) && decl->type.flags & HLSLTypeFlag_Const) { return true; } } - + // Helper functions should be skipped once written out - if (statement->nodeType == HLSLNodeType_Function) - { + if (statement->nodeType == HLSLNodeType_Function) { return true; } - + return false; } void HLSLGenerator::OutputStatements(int indent, HLSLStatement* statement) { - while (statement != NULL) - { + while (statement != NULL) { // skip pruned statements - if (statement->hidden) - { + if (statement->hidden) { statement = statement->nextStatement; continue; } // skip writing some types across multiple entry points - if (CanSkipWrittenStatement(statement)) - { + if (CanSkipWrittenStatement(statement)) { statement = statement->nextStatement; continue; } statement->written = true; - + OutputAttributes(indent, statement->attributes); - if (statement->nodeType == HLSLNodeType_Comment) - { + if (statement->nodeType == HLSLNodeType_Comment) { HLSLComment* comment = static_cast(statement); m_writer.WriteLine(indent, "//%s", comment->text); } - else if (statement->nodeType == HLSLNodeType_Declaration) - { + else if (statement->nodeType == HLSLNodeType_Declaration) { HLSLDeclaration* declaration = static_cast(statement); m_writer.BeginLine(indent, declaration->fileName, declaration->line); OutputDeclaration(declaration); m_writer.EndLine(";"); } - else if (statement->nodeType == HLSLNodeType_Struct) - { + else if (statement->nodeType == HLSLNodeType_Struct) { HLSLStruct* structure = static_cast(statement); m_writer.WriteLineTagged(indent, structure->fileName, structure->line, "struct %s {", structure->name); HLSLStructField* field = structure->field; - while (field != NULL) - { - if (!field->hidden) - { + while (field != NULL) { + if (!field->hidden) { m_writer.BeginLine(indent + 1, field->fileName, field->line); - const char * semantic = field->sv_semantic ? field->sv_semantic : field->semantic; + const char* semantic = field->sv_semantic ? field->sv_semantic : field->semantic; OutputDeclaration(field->type, field->name, semantic); m_writer.Write(";"); m_writer.EndLine(); @@ -693,95 +705,83 @@ void HLSLGenerator::OutputStatements(int indent, HLSLStatement* statement) } m_writer.WriteLine(indent, "};"); } - else if (statement->nodeType == HLSLNodeType_Buffer) - { + else if (statement->nodeType == HLSLNodeType_Buffer) { HLSLBuffer* buffer = static_cast(statement); HLSLDeclaration* field = buffer->field; - if (!buffer->IsGlobalFields()) - { + if (!buffer->IsGlobalFields()) { // Constant/Structured/ByteAdddressBuffer m_writer.BeginLine(indent, buffer->fileName, buffer->line); - + // Handle push constant for Vulkan. // This is just a buffer to MSL. // VK is limited to 1 buffer as a result. Cannot contain half on AMD. 
- if (buffer->bufferType == HLSLBufferType_ConstantBuffer) - { + if (buffer->bufferType == HLSLBufferType_ConstantBuffer) { if (m_options.writeVulkan && (String_HasString(buffer->name, "Push") || - String_HasString(buffer->name, "push"))) - { + String_HasString(buffer->name, "push"))) { m_writer.Write("[[vk::push_constant]] "); } } - + // write out template m_writer.Write("%s<%s> %s", BufferTypeToName(buffer->bufferType), buffer->bufferStruct->name, buffer->name); - + // write out optinal register - if (buffer->registerName != NULL) - { - m_writer.Write(" : register(%s)", buffer->registerName); + if (buffer->registerName != NULL) { + m_writer.Write(" : register(%s)", buffer->registerName); } - + m_writer.Write(";"); m_writer.EndLine(); } - else - { + else { // c/tbuffer m_writer.BeginLine(indent, buffer->fileName, buffer->line); - + // not templated m_writer.Write("%s %s", BufferTypeToName(buffer->bufferType), buffer->name); - + // write out optional register - if (buffer->registerName != NULL) - { - m_writer.Write(" : register(%s)", buffer->registerName); + if (buffer->registerName != NULL) { + m_writer.Write(" : register(%s)", buffer->registerName); } - + m_writer.EndLine(" {"); m_isInsideBuffer = true; - - while (field != NULL) - { - if (!field->hidden) - { + + while (field != NULL) { + if (!field->hidden) { m_writer.BeginLine(indent + 1, field->fileName, field->line); - OutputDeclaration(field->type, field->name, /*semantic=*/NULL, /*registerName*/field->registerName, field->assignment); + OutputDeclaration(field->type, field->name, /*semantic=*/NULL, /*registerName*/ field->registerName, field->assignment); m_writer.Write(";"); m_writer.EndLine(); } field = (HLSLDeclaration*)field->nextStatement; } - + m_isInsideBuffer = false; - + m_writer.WriteLine(indent, "};"); } } - else if (statement->nodeType == HLSLNodeType_Function) - { + else if (statement->nodeType == HLSLNodeType_Function) { HLSLFunction* function = static_cast(statement); // Use an alternate name for the function which is supposed to be entry point // so that we can supply our own function which will be the actual entry point. - const char* functionName = function->name; + const char* functionName = function->name; const char* returnTypeName = GetTypeName(function->returnType); bool isEntryPoint = String_Equal(functionName, m_entryName); - if (isEntryPoint) - { + if (isEntryPoint) { // This is a SM6.x construct for tagging entry points - switch(m_target) - { + switch (m_target) { case HLSLTarget_VertexShader: m_writer.WriteLine(indent, "[shader(\"vertex\")] "); break; @@ -795,19 +795,17 @@ void HLSLGenerator::OutputStatements(int indent, HLSLStatement* statement) break; } } - + m_writer.BeginLine(indent, function->fileName, function->line); m_writer.Write("%s %s(", returnTypeName, functionName); OutputArguments(function->argument); - const char * semantic = function->sv_semantic ? function->sv_semantic : function->semantic; - if (semantic != NULL) - { + const char* semantic = function->sv_semantic ? 
function->sv_semantic : function->semantic; + if (semantic != NULL) { m_writer.Write(") : %s {", semantic); } - else - { + else { m_writer.Write(") {"); } @@ -816,45 +814,37 @@ void HLSLGenerator::OutputStatements(int indent, HLSLStatement* statement) OutputStatements(indent + 1, function->statement); m_writer.WriteLine(indent, "};"); } - else if (statement->nodeType == HLSLNodeType_ExpressionStatement) - { + else if (statement->nodeType == HLSLNodeType_ExpressionStatement) { HLSLExpressionStatement* expressionStatement = static_cast(statement); m_writer.BeginLine(indent, statement->fileName, statement->line); OutputExpression(expressionStatement->expression); m_writer.EndLine(";"); } - else if (statement->nodeType == HLSLNodeType_ReturnStatement) - { + else if (statement->nodeType == HLSLNodeType_ReturnStatement) { HLSLReturnStatement* returnStatement = static_cast(statement); - if (returnStatement->expression != NULL) - { + if (returnStatement->expression != NULL) { m_writer.BeginLine(indent, returnStatement->fileName, returnStatement->line); m_writer.Write("return "); OutputExpression(returnStatement->expression); m_writer.EndLine(";"); } - else - { + else { m_writer.WriteLineTagged(indent, returnStatement->fileName, returnStatement->line, "return;"); } } - else if (statement->nodeType == HLSLNodeType_DiscardStatement) - { + else if (statement->nodeType == HLSLNodeType_DiscardStatement) { HLSLDiscardStatement* discardStatement = static_cast(statement); m_writer.WriteLineTagged(indent, discardStatement->fileName, discardStatement->line, "discard;"); } - else if (statement->nodeType == HLSLNodeType_BreakStatement) - { + else if (statement->nodeType == HLSLNodeType_BreakStatement) { HLSLBreakStatement* breakStatement = static_cast(statement); m_writer.WriteLineTagged(indent, breakStatement->fileName, breakStatement->line, "break;"); } - else if (statement->nodeType == HLSLNodeType_ContinueStatement) - { + else if (statement->nodeType == HLSLNodeType_ContinueStatement) { HLSLContinueStatement* continueStatement = static_cast(statement); m_writer.WriteLineTagged(indent, continueStatement->fileName, continueStatement->line, "continue;"); } - else if (statement->nodeType == HLSLNodeType_IfStatement) - { + else if (statement->nodeType == HLSLNodeType_IfStatement) { HLSLIfStatement* ifStatement = static_cast(statement); m_writer.BeginLine(indent, ifStatement->fileName, ifStatement->line); m_writer.Write("if ("); @@ -863,15 +853,13 @@ void HLSLGenerator::OutputStatements(int indent, HLSLStatement* statement) m_writer.EndLine(); OutputStatements(indent + 1, ifStatement->statement); m_writer.WriteLine(indent, "}"); - if (ifStatement->elseStatement != NULL) - { + if (ifStatement->elseStatement != NULL) { m_writer.WriteLine(indent, "else {"); OutputStatements(indent + 1, ifStatement->elseStatement); m_writer.WriteLine(indent, "}"); } } - else if (statement->nodeType == HLSLNodeType_ForStatement) - { + else if (statement->nodeType == HLSLNodeType_ForStatement) { HLSLForStatement* forStatement = static_cast(statement); m_writer.BeginLine(indent, forStatement->fileName, forStatement->line); m_writer.Write("for ("); @@ -885,24 +873,22 @@ void HLSLGenerator::OutputStatements(int indent, HLSLStatement* statement) OutputStatements(indent + 1, forStatement->statement); m_writer.WriteLine(indent, "}"); } - else if (statement->nodeType == HLSLNodeType_BlockStatement) - { + else if (statement->nodeType == HLSLNodeType_BlockStatement) { HLSLBlockStatement* blockStatement = static_cast(statement); 
m_writer.WriteLineTagged(indent, blockStatement->fileName, blockStatement->line, "{"); OutputStatements(indent + 1, blockStatement->statement); m_writer.WriteLine(indent, "}"); } // FX file constructs -// else if (statement->nodeType == HLSLNodeType_Technique) -// { -// // Techniques are ignored. -// } -// else if (statement->nodeType == HLSLNodeType_Pipeline) -// { -// // Pipelines are ignored. -// } - else - { + // else if (statement->nodeType == HLSLNodeType_Technique) + // { + // // Techniques are ignored. + // } + // else if (statement->nodeType == HLSLNodeType_Pipeline) + // { + // // Pipelines are ignored. + // } + else { // Unhanded statement type. Error("Unhandled statement"); //ASSERT(false); @@ -917,73 +903,62 @@ const char* HLSLGenerator::GetFormatName(HLSLBaseType bufferOrTextureType, HLSLB { // TODO: have a way to disable use of half (like on MSLGenerator) bool isHalf = IsHalf(formatType); - + // Can't use half4 textures with spirv. Can only cast from full float sampler. // Can tell Vulkan was written by/for desktop IHVs. // https://github.com/microsoft/DirectXShaderCompiler/issues/2711 bool isSpirvTarget = m_options.writeVulkan; if (isSpirvTarget) isHalf = false; - + const char* formatName = isHalf ? "half4" : "float4"; - + // MSL only uses half/float mostly. With HLSL, this is a full // template format of float/2/3/4. - + return formatName; } - void HLSLGenerator::OutputDeclaration(HLSLDeclaration* declaration) { - if (IsSamplerType(declaration->type)) - { + if (IsSamplerType(declaration->type)) { int reg = -1; - if (declaration->registerName != NULL) - { + if (declaration->registerName != NULL) { sscanf(declaration->registerName, "s%d", ®); } - + // sampler const char* samplerTypeName = GetTypeName(declaration->type); - if (samplerTypeName) - { - if (reg != -1) - { + if (samplerTypeName) { + if (reg != -1) { m_writer.Write("%s %s : register(s%d)", samplerTypeName, declaration->name, reg); } - else - { + else { m_writer.Write("%s %s", samplerTypeName, declaration->name); } } return; } - if (IsTextureType(declaration->type)) - { + if (IsTextureType(declaration->type)) { int reg = -1; - if (declaration->registerName != NULL) - { + if (declaration->registerName != NULL) { sscanf(declaration->registerName, "t%d", ®); } HLSLBaseType formatType = declaration->type.formatType; if (m_options.treatHalfAsFloat && IsHalf(formatType)) formatType = HalfToFloatBaseType(formatType); - + const char* formatTypeName = GetFormatName(declaration->type.baseType, formatType); - + // texture carts the dimension and format const char* textureTypeName = GetTypeName(declaration->type); - - if (textureTypeName != NULL) - { - if (reg != -1) - { + + if (textureTypeName != NULL) { + if (reg != -1) { m_writer.Write("%s<%s> %s : register(t%d)", textureTypeName, formatTypeName, declaration->name, reg); } - else - { + else { m_writer.Write("%s<%s> %s", textureTypeName, formatTypeName, declaration->name); } } @@ -994,7 +969,7 @@ void HLSLGenerator::OutputDeclaration(HLSLDeclaration* declaration) OutputDeclarationBody(declaration->type, declaration->name, declaration->semantic, declaration->registerName, declaration->assignment); declaration = declaration->nextDeclaration; - while(declaration != NULL) { + while (declaration != NULL) { m_writer.Write(", "); OutputDeclarationBody(declaration->type, declaration->name, declaration->semantic, declaration->registerName, declaration->assignment); declaration = declaration->nextDeclaration; @@ -1005,39 +980,32 @@ void HLSLGenerator::OutputDeclarationType(const 
HLSLType& type, bool isTypeCast) { const char* typeName = GetTypeName(type); - if (isTypeCast) - { + if (isTypeCast) { m_writer.Write("%s", typeName); return; } - - if (type.flags & HLSLTypeFlag_Static) - { + + if (type.flags & HLSLTypeFlag_Static) { m_writer.Write("static "); } - if (type.flags & HLSLTypeFlag_Const) - { + if (type.flags & HLSLTypeFlag_Const) { m_writer.Write("const "); } - + // Interpolation modifiers. - if (type.flags & HLSLTypeFlag_Centroid) - { + if (type.flags & HLSLTypeFlag_Centroid) { m_writer.Write("centroid "); } - if (type.flags & HLSLTypeFlag_Linear) - { + if (type.flags & HLSLTypeFlag_Linear) { m_writer.Write("linear "); } - if (type.flags & HLSLTypeFlag_NoInterpolation) - { + if (type.flags & HLSLTypeFlag_NoInterpolation) { m_writer.Write("nointerpolation "); } - if (type.flags & HLSLTypeFlag_NoPerspective) - { + if (type.flags & HLSLTypeFlag_NoPerspective) { m_writer.Write("noperspective "); } - if (type.flags & HLSLTypeFlag_Sample) // @@ Only in shader model >= 4.1 + if (type.flags & HLSLTypeFlag_Sample) // @@ Only in shader model >= 4.1 { m_writer.Write("sample "); } @@ -1045,55 +1013,46 @@ void HLSLGenerator::OutputDeclarationType(const HLSLType& type, bool isTypeCast) m_writer.Write("%s ", typeName); } -void HLSLGenerator::OutputDeclarationBody(const HLSLType& type, const char* name, const char* semantic/*=NULL*/, const char* registerName/*=NULL*/, HLSLExpression * assignment/*=NULL*/) +void HLSLGenerator::OutputDeclarationBody(const HLSLType& type, const char* name, const char* semantic /*=NULL*/, const char* registerName /*=NULL*/, HLSLExpression* assignment /*=NULL*/) { m_writer.Write("%s", name); - if (type.array) - { + if (type.array) { ASSERT(semantic == NULL); m_writer.Write("["); - if (type.arraySize != NULL) - { + if (type.arraySize != NULL) { OutputExpression(type.arraySize); } m_writer.Write("]"); } - if (semantic != NULL) - { + if (semantic != NULL) { m_writer.Write(" : %s", semantic); } - if (registerName != NULL) - { - if (m_isInsideBuffer) - { + if (registerName != NULL) { + if (m_isInsideBuffer) { m_writer.Write(" : packoffset(%s)", registerName); } - else - { + else { m_writer.Write(" : register(%s)", registerName); } } - if (assignment != NULL && !IsSamplerType(type)) - { + if (assignment != NULL && !IsSamplerType(type)) { m_writer.Write(" = "); - if (type.array) - { + if (type.array) { m_writer.Write("{ "); OutputExpressionList(assignment); m_writer.Write(" }"); } - else - { + else { OutputExpression(assignment); } } } -void HLSLGenerator::OutputDeclaration(const HLSLType& type, const char* name, const char* semantic/*=NULL*/, const char* registerName/*=NULL*/, HLSLExpression * assignment/*=NULL*/) +void HLSLGenerator::OutputDeclaration(const HLSLType& type, const char* name, const char* semantic /*=NULL*/, const char* registerName /*=NULL*/, HLSLExpression* assignment /*=NULL*/) { OutputDeclarationType(type); OutputDeclarationBody(type, name, semantic, registerName, assignment); @@ -1103,20 +1062,17 @@ bool HLSLGenerator::ChooseUniqueName(const char* base, char* dst, int dstLength) { // IC: Try without suffix first. 
String_Printf(dst, dstLength, "%s", base); - if (!m_tree->GetContainsString(base)) - { + if (!m_tree->GetContainsString(base)) { return true; } - for (int i = 1; i < 1024; ++i) - { + for (int i = 1; i < 1024; ++i) { String_Printf(dst, dstLength, "%s%d", base, i); - if (!m_tree->GetContainsString(dst)) - { + if (!m_tree->GetContainsString(dst)) { return true; } } return false; } -} +} //namespace M4 diff --git a/hlslparser/src/HLSLGenerator.h b/hlslparser/src/HLSLGenerator.h index a909d0ca..5ad6b711 100644 --- a/hlslparser/src/HLSLGenerator.h +++ b/hlslparser/src/HLSLGenerator.h @@ -12,25 +12,23 @@ #include "CodeWriter.h" #include "HLSLTree.h" -namespace M4 -{ +namespace M4 { -class HLSLTree; +class HLSLTree; struct HLSLFunction; struct HLSLStruct; // TODO: try to unify some options with MSLGenerator -struct HLSLOptions -{ +struct HLSLOptions { // int (*attributeCallback)(const char* name, uint32_t index) = NULL; // uint32_t bufferRegisterOffset = 0; - + bool writeFileLine = false; - + bool treatHalfAsFloat = false; // TODO: hook this up // bool treatDoubleAsFloat = true; - + // add vk constructions to HLSL source to convert to Spriv bool writeVulkan = false; }; @@ -39,19 +37,14 @@ struct HLSLOptions * This class is used to generate HLSL which is compatible with the D3D9 * compiler (i.e. no cbuffers). */ -class HLSLGenerator -{ - +class HLSLGenerator { public: HLSLGenerator(); - - - - bool Generate(HLSLTree* tree, HLSLTarget target, const char* entryName, const HLSLOptions& options = HLSLOptions() ); + + bool Generate(HLSLTree* tree, HLSLTarget target, const char* entryName, const HLSLOptions& options = HLSLOptions()); const char* GetResult() const; private: - void OutputExpressionList(HLSLExpression* expression); void OutputExpression(HLSLExpression* expression); void OutputArguments(HLSLArgument* argument); @@ -60,7 +53,7 @@ class HLSLGenerator void OutputDeclaration(HLSLDeclaration* declaration); void OutputDeclaration(const HLSLType& type, const char* name, const char* semantic = NULL, const char* registerName = NULL, HLSLExpression* defaultValue = NULL); void OutputDeclarationType(const HLSLType& type, bool isTypeCast = false); - void OutputDeclarationBody(const HLSLType& type, const char* name, const char* semantic =NULL, const char* registerName = NULL, HLSLExpression * assignment = NULL); + void OutputDeclarationBody(const HLSLType& type, const char* name, const char* semantic = NULL, const char* registerName = NULL, HLSLExpression* assignment = NULL); /** Generates a name of the format "base+n" where n is an integer such that the name * isn't used in the syntax tree. */ @@ -69,20 +62,19 @@ class HLSLGenerator const char* GetTypeName(const HLSLType& type); void Error(const char* format, ...) 
M4_PRINTF_ATTR(2, 3); - + const char* GetFormatName(HLSLBaseType bufferOrTextureType, HLSLBaseType formatType); bool CanSkipWrittenStatement(const HLSLStatement* statement) const; private: - - CodeWriter m_writer; + CodeWriter m_writer; const HLSLTree* m_tree; - const char* m_entryName; - HLSLTarget m_target; - bool m_isInsideBuffer; - bool m_error; - HLSLOptions m_options; + const char* m_entryName; + HLSLTarget m_target; + bool m_isInsideBuffer; + bool m_error; + HLSLOptions m_options; }; -} // M4 +} //namespace M4 diff --git a/hlslparser/src/HLSLParser.cpp b/hlslparser/src/HLSLParser.cpp index b3dd5086..46580c57 100644 --- a/hlslparser/src/HLSLParser.cpp +++ b/hlslparser/src/HLSLParser.cpp @@ -10,7 +10,6 @@ #include "HLSLParser.h" #include "Engine.h" - #include "HLSLTree.h" #ifdef _WIN32 @@ -21,27 +20,24 @@ // stl #include -#include #include +#include -namespace M4 -{ +namespace M4 { -enum CompareFunctionsResult -{ +enum CompareFunctionsResult { FunctionsEqual, Function1Better, Function2Better }; -enum CoreType -{ +enum CoreType { CoreType_None, - + CoreType_Scalar, CoreType_Vector, CoreType_Matrix, - + CoreType_Sampler, CoreType_Texture, CoreType_Struct, @@ -49,12 +45,11 @@ enum CoreType CoreType_Expression, CoreType_Comment, CoreType_Buffer, - + CoreType_Count // must be last }; -enum DimensionType -{ +enum DimensionType { DimensionType_None, DimensionType_Scalar, @@ -66,29 +61,28 @@ enum DimensionType DimensionType_Matrix2x2, DimensionType_Matrix3x3, DimensionType_Matrix4x4, - + //DimensionType_Matrix4x3, // TODO: no 3x4 //DimensionType_Matrix4x2 }; // Can use this to break apart type to useful constructs -struct BaseTypeDescription -{ - const char* typeName = ""; - const char* typeNameMetal = ""; - - HLSLBaseType baseType = HLSLBaseType_Unknown; - CoreType coreType = CoreType_None; - DimensionType dimensionType = DimensionType_None; - NumericType numericType = NumericType_NaN; - +struct BaseTypeDescription { + const char* typeName = ""; + const char* typeNameMetal = ""; + + HLSLBaseType baseType = HLSLBaseType_Unknown; + CoreType coreType = CoreType_None; + DimensionType dimensionType = DimensionType_None; + NumericType numericType = NumericType_NaN; + // TODO: is this useful ? 
- // int numDimensions; // scalar = 0, vector = 1, matrix = 2 - uint8_t numDimensions = 0; - uint8_t numComponents = 0; - uint8_t height = 0; - - int8_t binaryOpRank = -1; // or was this supposed to be max (-1 in uint8_t) + // int numDimensions; // scalar = 0, vector = 1, matrix = 2 + uint8_t numDimensions = 0; + uint8_t numComponents = 0; + uint8_t height = 0; + + int8_t binaryOpRank = -1; // or was this supposed to be max (-1 in uint8_t) }; // really const @@ -122,18 +116,16 @@ bool IsTextureType(HLSLBaseType baseType) bool IsDepthTextureType(HLSLBaseType baseType) { // return baseTypeDescriptions[baseType].coreType == CoreType_DepthTexture; - return baseType == HLSLBaseType_Depth2D || - baseType == HLSLBaseType_Depth2DArray || - baseType == HLSLBaseType_DepthCube; + return baseType == HLSLBaseType_Depth2D || + baseType == HLSLBaseType_Depth2DArray || + baseType == HLSLBaseType_DepthCube; } - bool IsBufferType(HLSLBaseType baseType) { return baseTypeDescriptions[baseType].coreType == CoreType_Buffer; } - bool IsCoreTypeEqual(HLSLBaseType lhsType, HLSLBaseType rhsType) { return baseTypeDescriptions[lhsType].coreType == @@ -143,9 +135,9 @@ bool IsCoreTypeEqual(HLSLBaseType lhsType, HLSLBaseType rhsType) bool IsDimensionEqual(HLSLBaseType lhsType, HLSLBaseType rhsType) { return baseTypeDescriptions[lhsType].numComponents == - baseTypeDescriptions[rhsType].numComponents && + baseTypeDescriptions[rhsType].numComponents && baseTypeDescriptions[lhsType].height == - baseTypeDescriptions[rhsType].height; + baseTypeDescriptions[rhsType].height; } bool IsCrossDimensionEqual(HLSLBaseType lhsType, HLSLBaseType rhsType) @@ -154,7 +146,6 @@ bool IsCrossDimensionEqual(HLSLBaseType lhsType, HLSLBaseType rhsType) baseTypeDescriptions[rhsType].numComponents; } - bool IsNumericTypeEqual(HLSLBaseType lhsType, HLSLBaseType rhsType) { return baseTypeDescriptions[lhsType].numericType == @@ -191,7 +182,6 @@ bool IsIntegerType(HLSLBaseType type) n == NumericType_Long || n == NumericType_Ulong; } - bool IsInt(HLSLBaseType type) { return baseTypeDescriptions[type].numericType == NumericType_Int; @@ -225,31 +215,27 @@ bool IsBool(HLSLBaseType type) return baseTypeDescriptions[type].numericType == NumericType_Bool; } - - - - -bool IsSamplerType(const HLSLType & type) +bool IsSamplerType(const HLSLType& type) { return IsSamplerType(type.baseType); } -bool IsScalarType(const HLSLType & type) +bool IsScalarType(const HLSLType& type) { return IsScalarType(type.baseType); } -bool IsVectorType(const HLSLType & type) +bool IsVectorType(const HLSLType& type) { return IsVectorType(type.baseType); } -bool IsMatrixType(const HLSLType & type) +bool IsMatrixType(const HLSLType& type) { return IsMatrixType(type.baseType); } -bool IsTextureType(const HLSLType & type) +bool IsTextureType(const HLSLType& type) { return IsTextureType(type.baseType); } @@ -262,55 +248,80 @@ bool IsNumericType(HLSLBaseType baseType) HLSLBufferType ConvertTokenToBufferType(HLSLToken token) { HLSLBufferType type = HLSLBufferType_CBuffer; - - switch(token) - { + + switch (token) { // DX9 case HLSLToken_CBuffer: - type = HLSLBufferType_CBuffer; break; + type = HLSLBufferType_CBuffer; + break; case HLSLToken_TBuffer: - type = HLSLBufferType_TBuffer; break; - + type = HLSLBufferType_TBuffer; + break; + // DX10 case HLSLToken_ConstantBuffer: - type = HLSLBufferType_ConstantBuffer; break; + type = HLSLBufferType_ConstantBuffer; + break; case HLSLToken_StructuredBuffer: - type = HLSLBufferType_StructuredBuffer; break; + type = HLSLBufferType_StructuredBuffer; + 
break; case HLSLToken_RWStructuredBuffer: - type = HLSLBufferType_RWStructuredBuffer; break; + type = HLSLBufferType_RWStructuredBuffer; + break; case HLSLToken_ByteAddressBuffer: - type = HLSLBufferType_ByteAddressBuffer; break; + type = HLSLBufferType_ByteAddressBuffer; + break; case HLSLToken_RWByteAddressBuffer: - type = HLSLBufferType_RWByteAddressBuffer; break; - + type = HLSLBufferType_RWByteAddressBuffer; + break; + default: break; } - + return type; } HLSLBaseType NumericToBaseType(NumericType numericType) { HLSLBaseType baseType = HLSLBaseType_Unknown; - switch(numericType) - { - case NumericType_Float: baseType = HLSLBaseType_Float; break; - case NumericType_Half: baseType = HLSLBaseType_Half; break; - case NumericType_Double: baseType = HLSLBaseType_Bool; break; - - case NumericType_Int: baseType = HLSLBaseType_Int; break; - case NumericType_Uint: baseType = HLSLBaseType_Uint; break; - case NumericType_Ushort: baseType = HLSLBaseType_Ushort; break; - case NumericType_Short: baseType = HLSLBaseType_Short; break; - case NumericType_Ulong: baseType = HLSLBaseType_Ulong; break; - case NumericType_Long: baseType = HLSLBaseType_Long; break; - case NumericType_Bool: baseType = HLSLBaseType_Bool; break; - - // MSL has 8-bit, but HLSL/Vulkan don't - //case NumericType_Uint8: baseType = HLSLBaseType_Uint8; break; - //case NumericType_Int8: baseType = HLSLBaseType_Int8; break; - + switch (numericType) { + case NumericType_Float: + baseType = HLSLBaseType_Float; + break; + case NumericType_Half: + baseType = HLSLBaseType_Half; + break; + case NumericType_Double: + baseType = HLSLBaseType_Bool; + break; + + case NumericType_Int: + baseType = HLSLBaseType_Int; + break; + case NumericType_Uint: + baseType = HLSLBaseType_Uint; + break; + case NumericType_Ushort: + baseType = HLSLBaseType_Ushort; + break; + case NumericType_Short: + baseType = HLSLBaseType_Short; + break; + case NumericType_Ulong: + baseType = HLSLBaseType_Ulong; + break; + case NumericType_Long: + baseType = HLSLBaseType_Long; + break; + case NumericType_Bool: + baseType = HLSLBaseType_Bool; + break; + + // MSL has 8-bit, but HLSL/Vulkan don't + //case NumericType_Uint8: baseType = HLSLBaseType_Uint8; break; + //case NumericType_Int8: baseType = HLSLBaseType_Int8; break; + default: break; } @@ -327,7 +338,7 @@ int32_t GetVectorDimension(HLSLBaseType type) { if (IsScalarType(type)) return 1; if (!IsVectorType(type)) return 0; - + return baseTypeDescriptions[type].numComponents; } @@ -345,14 +356,13 @@ HLSLBaseType DoubleToFloatBaseType(HLSLBaseType type) return type; } - static HLSLBaseType ArithmeticOpResultType(HLSLBinaryOp binaryOp, HLSLBaseType t1, HLSLBaseType t2); const char* GetNumericTypeName(HLSLBaseType type) { if (!IsNumericType(type)) return nullptr; - + // MSL/HLSL share the same type names const auto& b = baseTypeDescriptions[type]; return b.typeName; @@ -364,47 +374,41 @@ HLSLBaseType PromoteType(HLSLBaseType toType, HLSLBaseType type) baseTypeDescriptions[type].dimensionType - DimensionType_Scalar); } - - /** This structure stores a HLSLFunction-like declaration for an intrinsic function */ -struct Intrinsic -{ +struct Intrinsic { explicit Intrinsic(const char* name, uint32_t numArgs) { - function.name = name; + function.name = name; function.numArguments = numArgs; - + if (numArgs == 0) return; - - for (uint32_t i = 0; i < numArgs; ++i) - { + + for (uint32_t i = 0; i < numArgs; ++i) { argument[i].type.flags = HLSLTypeFlag_Const; } } - + void ChainArgumentPointers() { function.argument = argument + 0; - + 
uint32_t numArgs = function.numArguments; // This chain pf pointers won't surive copy - for (uint32_t i = 0; i < numArgs; ++i) - { + for (uint32_t i = 0; i < numArgs; ++i) { if (i < numArgs - 1) argument[i].nextArgument = argument + i + 1; } } - + void SetArgumentTypes(HLSLBaseType returnType, HLSLBaseType args[4]) { function.returnType.baseType = returnType; - for (uint32_t i = 0; i < function.numArguments; ++i) - { + for (uint32_t i = 0; i < function.numArguments; ++i) { ASSERT(args[i] != HLSLBaseType_Unknown); argument[i].type.baseType = args[i]; } } - + void ArgsToArray(HLSLBaseType args[4], uint32_t& numArgs, HLSLBaseType arg1, HLSLBaseType arg2, HLSLBaseType arg3, HLSLBaseType arg4) { numArgs = 0; @@ -417,24 +421,24 @@ struct Intrinsic if (arg4 == HLSLBaseType_Unknown) return; args[numArgs++] = arg4; } - + explicit Intrinsic(const char* name, HLSLBaseType returnType, HLSLBaseType arg1 = HLSLBaseType_Unknown, HLSLBaseType arg2 = HLSLBaseType_Unknown, HLSLBaseType arg3 = HLSLBaseType_Unknown, HLSLBaseType arg4 = HLSLBaseType_Unknown) { - function.name = name; - + function.name = name; + HLSLBaseType argumentTypes[4]; uint32_t numArgs = 0; ArgsToArray(argumentTypes, numArgs, arg1, arg2, arg3, arg4); - + *this = Intrinsic(name, numArgs); SetArgumentTypes(returnType, argumentTypes); } - + // TODO: allow member function intrinsices on buffers/textures - HLSLFunction function; - HLSLArgument argument[4]; + HLSLFunction function; + HLSLArgument argument[4]; }; - + // So many calls are member functions in modern HLSL/MSL. // This means the parser has to work harder to write out these intrinsics // since some have default args, and some need level(), bias() wrappers in MSL. @@ -449,12 +453,12 @@ void AddTextureLoadIntrinsic(const char* name, HLSLBaseType returnType, HLSLBase Intrinsic i(name, returnType, uvType, arg3, arg4); i.function.memberType = textureType; // extract formatType from return type #else -// Intrinsic i(name, returnType, textureType, uvType); +// Intrinsic i(name, returnType, textureType, uvType); // -// // classify textureType subtype off scalar -// i.argument[0].type.formatType = GetScalarType(returnType); +// // classify textureType subtype off scalar +// i.argument[0].type.formatType = GetScalarType(returnType); #endif - + AddIntrinsic(i); } @@ -464,37 +468,36 @@ void AddTextureIntrinsic(const char* name, HLSLBaseType returnType, HLSLBaseType Intrinsic i(name, returnType, HLSLBaseType_SamplerState, uvType, arg3, arg4); i.function.memberType = textureType; #else -// Intrinsic i(name, returnType, textureType, HLSLBaseType_SamplerState, uvType); +// Intrinsic i(name, returnType, textureType, HLSLBaseType_SamplerState, uvType); // -// // classify textureType subtype off scalar -// i.argument[0].type.formatType = GetScalarType(returnType); +// // classify textureType subtype off scalar +// i.argument[0].type.formatType = GetScalarType(returnType); #endif - + AddIntrinsic(i); } void AddTextureIntrinsics(const char* name, HLSLBaseType textureType, HLSLBaseType uvType, HLSLBaseType arg3 = HLSLBaseType_Unknown, HLSLBaseType arg4 = HLSLBaseType_Unknown) { - AddTextureIntrinsic( name, HLSLBaseType_Float4, textureType, uvType, arg3, arg4); - AddTextureIntrinsic( name, HLSLBaseType_Half4, textureType, uvType, arg3, arg4); + AddTextureIntrinsic(name, HLSLBaseType_Float4, textureType, uvType, arg3, arg4); + AddTextureIntrinsic(name, HLSLBaseType_Half4, textureType, uvType, arg3, arg4); } - // DepthCmp takes additional arg for comparison value, but this rolls it into uv void 
AddDepthIntrinsic(const char* name, HLSLBaseType returnType, HLSLBaseType textureType, HLSLBaseType uvType, HLSLBaseType arg3 = HLSLBaseType_Unknown, HLSLBaseType arg4 = HLSLBaseType_Unknown) { // ComparisonState is only for SampleCmp/GatherCmp bool isCompare = String_Equal(name, "GatherCmp") || String_Equal(name, "SampleCmp"); HLSLBaseType samplerType = isCompare ? HLSLBaseType_SamplerComparisonState : HLSLBaseType_SamplerState; - + #if USE_MEMBER_FUNCTIONS Intrinsic i(name, returnType, samplerType, uvType, arg3, arg4); i.function.memberType = textureType; #else -// Intrinsic i(name, returnType, textureType, samplerType, uvType); -// i.argument[0].type.formatType = GetScalarType(returnType); +// Intrinsic i(name, returnType, textureType, samplerType, uvType); +// i.argument[0].type.formatType = GetScalarType(returnType); #endif - + AddIntrinsic(i); } @@ -503,23 +506,22 @@ void AddDepthIntrinsic(const char* name, HLSLBaseType returnType, HLSLBaseType t // AddTextureIntrinsic( name, HLSLBaseType_Float4, textureType, uvType) \ // AddTextureIntrinsic( name, HLSLBaseType_Half4, textureType, uvType ) +static const int _numberTypeRank[NumericType_Count][NumericType_Count] = + { + // across is what type list on right is converted into (5 means don't, 0 means best) + //F H D B I UI S US L UL + {0, 3, 3, 4, 4, 4, 4, 4, 4, 4}, // NumericType_Float + {2, 0, 4, 4, 4, 4, 4, 4, 4, 4}, // NumericType_Half + {1, 4, 0, 4, 4, 4, 4, 4, 4, 4}, // NumericType_Double -static const int _numberTypeRank[NumericType_Count][NumericType_Count] = -{ - // across is what type list on right is converted into (5 means don't, 0 means best) - //F H D B I UI S US L UL - { 0, 3, 3, 4, 4, 4, 4, 4, 4, 4 }, // NumericType_Float - { 2, 0, 4, 4, 4, 4, 4, 4, 4, 4 }, // NumericType_Half - { 1, 4, 0, 4, 4, 4, 4, 4, 4, 4 }, // NumericType_Double - - { 5, 5, 5, 0, 5, 5, 5, 5, 5, 5 }, // NumericType_Bool - { 5, 5, 5, 4, 0, 3, 4, 3, 5, 5 }, // NumericType_Int - { 5, 5, 5, 4, 2, 0, 3, 4, 5, 5 }, // NumericType_Uint - { 5, 5, 5, 4, 0, 3, 0, 5, 5, 5 }, // NumericType_Short - { 5, 5, 5, 4, 2, 0, 5, 0, 5, 5 }, // NumericType_Ushort - - { 5, 5, 5, 4, 0, 3, 5, 5, 0, 5 }, // NumericType_Long - { 5, 5, 5, 4, 2, 0, 5, 5, 5, 0 }, // NumericType_Ulong + {5, 5, 5, 0, 5, 5, 5, 5, 5, 5}, // NumericType_Bool + {5, 5, 5, 4, 0, 3, 4, 3, 5, 5}, // NumericType_Int + {5, 5, 5, 4, 2, 0, 3, 4, 5, 5}, // NumericType_Uint + {5, 5, 5, 4, 0, 3, 0, 5, 5, 5}, // NumericType_Short + {5, 5, 5, 4, 2, 0, 5, 0, 5, 5}, // NumericType_Ushort + + {5, 5, 5, 4, 0, 3, 5, 5, 0, 5}, // NumericType_Long + {5, 5, 5, 4, 2, 0, 5, 5, 5, 0}, // NumericType_Ulong }; /* All FX state @@ -774,34 +776,32 @@ static const EffectState pipelineStates[] = { // Note: these strings need to live until end of the app StringPool gStringPool(NULL); -enum All -{ - AllHalf = (1<<0), - AllFloat = (1<<1), - AllDouble = (1<<2), - +enum All { + AllHalf = (1 << 0), + AllFloat = (1 << 1), + AllDouble = (1 << 2), + AllFloats = AllHalf | AllFloat | AllDouble, - - AllUint = (1<<3), - AllInt = (1<<4), - AllShort = (1<<5), - AllUshort = (1<<6), - AllLong = (1<<7), - AllUlong = (1<<8), - AllBool = (1<<9), - + + AllUint = (1 << 3), + AllInt = (1 << 4), + AllShort = (1 << 5), + AllUshort = (1 << 6), + AllLong = (1 << 7), + AllUlong = (1 << 8), + AllBool = (1 << 9), + AllInts = AllUint | AllInt | AllShort | AllUshort | AllLong | AllUlong | AllBool, - + //AllScalar = (1<<15), - AllVecs = (1<<16), - AllMats = (1<<17), + AllVecs = (1 << 16), + AllMats = (1 << 17), AllDims = AllVecs | AllMats, }; using AllMask = 
uint32_t; // TODO: want to use Array, but it needs Allocator passed -struct Range -{ +struct Range { uint32_t start; uint32_t count; }; @@ -815,30 +815,28 @@ static IntrinsicRangeMap _intrinsicRangeMap; static void AddIntrinsic(const Intrinsic& intrinsic) { const char* name = intrinsic.function.name; - + // Put in string pool since using this as a key. Also means equals just ptr compar. - name = gStringPool.AddString(name); - + name = gStringPool.AddString(name); + // track intrinsic range in a map, also the name lookup helps speed the parser up auto it = _intrinsicRangeMap.find(name); - if (it != _intrinsicRangeMap.end()) - { + if (it != _intrinsicRangeMap.end()) { it->second.count++; } - else - { - _intrinsicRangeMap[name] = { (uint32_t)_intrinsics.size(), 1 }; + else { + _intrinsicRangeMap[name] = {(uint32_t)_intrinsics.size(), 1}; } - + // To avoid having growth destroy the argument chains const uint32_t kMaxIntrinsics = 10000; // TODO: reduce once count is known if (_intrinsics.empty()) _intrinsics.reserve(kMaxIntrinsics); ASSERT(_intrinsics.size() < kMaxIntrinsics); - + _intrinsics.push_back(intrinsic); _intrinsics.back().function.name = name; - + // These pointers change when copied or when vector grows, so do a reserve _intrinsics.back().ChainArgumentPointers(); } @@ -852,16 +850,15 @@ void AddIntrinsic(const char* name, HLSLBaseType returnType, HLSLBaseType arg1 = void RegisterBaseTypeIntrinsic(Intrinsic& intrinsic, uint32_t numArgs, HLSLBaseType returnType, HLSLBaseType baseType, uint32_t start, uint32_t end) { HLSLBaseType args[4] = {}; - - for (uint32_t i = start; i < end; ++i) - { + + for (uint32_t i = start; i < end; ++i) { HLSLBaseType baseTypeIter = (HLSLBaseType)(baseType + i); - + HLSLBaseType newReturnType = (returnType == HLSLBaseType_Unknown) ? 
baseTypeIter : returnType; - + for (uint32_t a = 0; a < numArgs; ++a) args[a] = baseTypeIter; - + intrinsic.SetArgumentTypes(newReturnType, args); AddIntrinsic(intrinsic); } @@ -878,8 +875,8 @@ void RegisterIntrinsics(const char* name, uint32_t numArgs, AllMask mask, HLSLBa { const uint32_t kNumTypes = 3; - HLSLBaseType baseTypes[kNumTypes] = { HLSLBaseType_Float, HLSLBaseType_Half, HLSLBaseType_Double }; - + HLSLBaseType baseTypes[kNumTypes] = {HLSLBaseType_Float, HLSLBaseType_Half, HLSLBaseType_Double}; + bool skip[kNumTypes] = {}; if (!TestBits(mask, AllFloat)) skip[0] = true; @@ -887,12 +884,11 @@ void RegisterIntrinsics(const char* name, uint32_t numArgs, AllMask mask, HLSLBa skip[1] = true; if (!TestBits(mask, AllDouble)) skip[2] = true; - - for (uint32_t i = 0; i < kNumTypes; ++i) - { + + for (uint32_t i = 0; i < kNumTypes; ++i) { if (skip[i]) continue; HLSLBaseType baseType = baseTypes[i]; - + if (mask & AllVecs) RegisterBaseTypeIntrinsic(intrinsic, numArgs, returnType, baseType, 0, 4); if (mask & AllMats) @@ -900,16 +896,14 @@ void RegisterIntrinsics(const char* name, uint32_t numArgs, AllMask mask, HLSLBa } } - if ((mask & AllInts) == AllInts) - { + if ((mask & AllInts) == AllInts) { const uint32_t kNumTypes = 7; HLSLBaseType baseTypes[kNumTypes] = { HLSLBaseType_Long, HLSLBaseType_Ulong, - HLSLBaseType_Int, HLSLBaseType_Uint, + HLSLBaseType_Int, HLSLBaseType_Uint, HLSLBaseType_Short, HLSLBaseType_Ushort, - HLSLBaseType_Bool - }; - + HLSLBaseType_Bool}; + bool skip[kNumTypes] = {}; if (!TestBits(mask, AllLong)) skip[0] = true; @@ -925,15 +919,14 @@ void RegisterIntrinsics(const char* name, uint32_t numArgs, AllMask mask, HLSLBa skip[5] = true; if (!TestBits(mask, AllBool)) skip[6] = true; - - for (uint32_t i = 0; i < kNumTypes; ++i) - { + + for (uint32_t i = 0; i < kNumTypes; ++i) { if (skip[i]) continue; HLSLBaseType baseType = baseTypes[i]; - + if (mask & AllVecs) RegisterBaseTypeIntrinsic(intrinsic, numArgs, returnType, baseType, 0, 4); - + // TODO: No int matrices yet, but could add them //if (mask & AllMats) // RegisterBaseTypeIntrinsic(intrinsic, numArgs, returnType, 4, 7); @@ -941,7 +934,7 @@ void RegisterIntrinsics(const char* name, uint32_t numArgs, AllMask mask, HLSLBa } } -#define ArrayCount(array) (sizeof(array) / sizeof(array[0]) ) +#define ArrayCount(array) (sizeof(array) / sizeof(array[0])) bool InitIntrinsics() { @@ -949,7 +942,7 @@ bool InitIntrinsics() // since an unordered map is used for lookup. But do need // all intrinsics of the same name to be defined together in // a single range. - + const char* kVecOps1[] = { "acos", "asin", "atan", "cos", "sin", "tan", @@ -962,33 +955,38 @@ bool InitIntrinsics() "isnan", "isinf", "isfinite", "degrees", "radians" // emulated in MSL }; - + // apply to float/int const char* kVecOps1All[] = { "abs", }; - + const char* kVecOps2[] = { - "atan2", "pow", // can't pow take scalar? - "step", "frexp", + "atan2", + "pow", // can't pow take scalar? + "step", + "frexp", }; - + // apply to float/int const char* kVecOps2All[] = { - "min", "max", + "min", + "max", }; - + const char* kVecOps3[] = { "lerp", // can clamp and lerp take a scalar for last args/arg? 
- "smoothstep", "fma", + "smoothstep", + "fma", }; // apply to float/int const char* kVecOps3All[] = { "clamp", - "min3", "max3", + "min3", + "max3", }; - + // HLSL intrinsics // // not going to support due to swizzle, just have similar routine for half @@ -1012,156 +1010,147 @@ bool InitIntrinsics() // absdiff, hadd(x,y), // is_null_texture(tex) // tex.fence() - - - AllMask mask = AllFloats | AllVecs; - for (uint32_t i = 0, iEnd = ArrayCount(kVecOps1); i < iEnd; ++i) - { - RegisterIntrinsics( kVecOps1[i], 1, mask ); + for (uint32_t i = 0, iEnd = ArrayCount(kVecOps1); i < iEnd; ++i) { + RegisterIntrinsics(kVecOps1[i], 1, mask); } - for (uint32_t i = 0, iEnd = ArrayCount(kVecOps2); i < iEnd; ++i) - { - RegisterIntrinsics( kVecOps2[i], 2, mask ); + for (uint32_t i = 0, iEnd = ArrayCount(kVecOps2); i < iEnd; ++i) { + RegisterIntrinsics(kVecOps2[i], 2, mask); } - for (uint32_t i = 0, iEnd = ArrayCount(kVecOps3); i < iEnd; ++i) - { - RegisterIntrinsics( kVecOps3[i], 3, mask ); + for (uint32_t i = 0, iEnd = ArrayCount(kVecOps3); i < iEnd; ++i) { + RegisterIntrinsics(kVecOps3[i], 3, mask); } - + mask = AllFloats | AllInts | AllVecs; - for (uint32_t i = 0, iEnd = ArrayCount(kVecOps1All); i < iEnd; ++i) - { - RegisterIntrinsics( kVecOps1All[i], 1, mask ); + for (uint32_t i = 0, iEnd = ArrayCount(kVecOps1All); i < iEnd; ++i) { + RegisterIntrinsics(kVecOps1All[i], 1, mask); } - for (uint32_t i = 0, iEnd = ArrayCount(kVecOps2All); i < iEnd; ++i) - { - RegisterIntrinsics( kVecOps2All[i], 2, mask ); + for (uint32_t i = 0, iEnd = ArrayCount(kVecOps2All); i < iEnd; ++i) { + RegisterIntrinsics(kVecOps2All[i], 2, mask); } - for (uint32_t i = 0, iEnd = ArrayCount(kVecOps3All); i < iEnd; ++i) - { - RegisterIntrinsics( kVecOps3All[i], 3, mask ); + for (uint32_t i = 0, iEnd = ArrayCount(kVecOps3All); i < iEnd; ++i) { + RegisterIntrinsics(kVecOps3All[i], 3, mask); } - + // bit counting - RegisterIntrinsics( "countbits", 1, AllInts | AllVecs); // popcount in MSL - RegisterIntrinsics( "firstbithigh", 1, AllInts | AllVecs); // clz in MSL - RegisterIntrinsics( "firstbitlow", 1, AllInts | AllVecs); // ctz in MSL - RegisterIntrinsics( "reversebits", 1, AllInts | AllVecs); // ctz in MSL - - RegisterIntrinsics( "sincos", 2, AllFloats | AllVecs, HLSLBaseType_Void); + RegisterIntrinsics("countbits", 1, AllInts | AllVecs); // popcount in MSL + RegisterIntrinsics("firstbithigh", 1, AllInts | AllVecs); // clz in MSL + RegisterIntrinsics("firstbitlow", 1, AllInts | AllVecs); // ctz in MSL + RegisterIntrinsics("reversebits", 1, AllInts | AllVecs); // ctz in MSL + + RegisterIntrinsics("sincos", 2, AllFloats | AllVecs, HLSLBaseType_Void); + + RegisterIntrinsics("mad", 3, AllFloats | AllVecs); + + RegisterIntrinsics("any", 1, AllFloats | AllInts | AllVecs, HLSLBaseType_Bool); + RegisterIntrinsics("all", 1, AllFloats | AllInts | AllVecs, HLSLBaseType_Bool); + + RegisterIntrinsics("clip", 1, AllFloats | AllVecs, HLSLBaseType_Void); + + RegisterIntrinsics("dot", 2, AllHalf | AllVecs, HLSLBaseType_Half); + RegisterIntrinsics("dot", 2, AllFloat | AllVecs, HLSLBaseType_Float); + RegisterIntrinsics("dot", 2, AllDouble | AllVecs, HLSLBaseType_Double); - RegisterIntrinsics( "mad", 3, AllFloats | AllVecs); - - RegisterIntrinsics( "any", 1, AllFloats | AllInts | AllVecs, HLSLBaseType_Bool); - RegisterIntrinsics( "all", 1, AllFloats | AllInts | AllVecs, HLSLBaseType_Bool); - - RegisterIntrinsics( "clip", 1, AllFloats | AllVecs, HLSLBaseType_Void); - - RegisterIntrinsics( "dot", 2, AllHalf | AllVecs, HLSLBaseType_Half); - RegisterIntrinsics( 
"dot", 2, AllFloat | AllVecs, HLSLBaseType_Float); - RegisterIntrinsics( "dot", 2, AllDouble | AllVecs, HLSLBaseType_Double); - // 3d cross product only - AddIntrinsic( "cross", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3 ); - AddIntrinsic( "cross", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3 ); - AddIntrinsic( "cross", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3 ); - - AddIntrinsic( "reflect", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3 ); - AddIntrinsic( "reflect", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3 ); - AddIntrinsic( "reflect", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3 ); - - AddIntrinsic( "refract", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float ); - AddIntrinsic( "refract", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half ); - AddIntrinsic( "refract", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double ); - - RegisterIntrinsics( "length", 1, AllHalf | AllVecs, HLSLBaseType_Half); - RegisterIntrinsics( "length", 1, AllFloat | AllVecs, HLSLBaseType_Float); - RegisterIntrinsics( "length", 1, AllDouble | AllVecs, HLSLBaseType_Double); - + AddIntrinsic("cross", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3); + AddIntrinsic("cross", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3); + AddIntrinsic("cross", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3); + + AddIntrinsic("reflect", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3); + AddIntrinsic("reflect", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3); + AddIntrinsic("reflect", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3); + + AddIntrinsic("refract", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float); + AddIntrinsic("refract", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half); + AddIntrinsic("refract", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double); + + RegisterIntrinsics("length", 1, AllHalf | AllVecs, HLSLBaseType_Half); + RegisterIntrinsics("length", 1, AllFloat | AllVecs, HLSLBaseType_Float); + RegisterIntrinsics("length", 1, AllDouble | AllVecs, HLSLBaseType_Double); + // MSL construct - RegisterIntrinsics( "length_squared", 1, AllHalf | AllVecs, HLSLBaseType_Half); - RegisterIntrinsics( "length_squared", 1, AllFloat | AllVecs, HLSLBaseType_Float); - RegisterIntrinsics( "length_squared", 1, AllDouble | AllVecs, HLSLBaseType_Double); + RegisterIntrinsics("length_squared", 1, AllHalf | AllVecs, HLSLBaseType_Half); + RegisterIntrinsics("length_squared", 1, AllFloat | AllVecs, HLSLBaseType_Float); + RegisterIntrinsics("length_squared", 1, AllDouble | AllVecs, HLSLBaseType_Double); - RegisterIntrinsics( "distance", 1, AllHalf | AllVecs, HLSLBaseType_Half); - RegisterIntrinsics( "distance", 1, AllFloat | AllVecs, HLSLBaseType_Float); - RegisterIntrinsics( "distance", 1, AllDouble | AllVecs, HLSLBaseType_Double); + RegisterIntrinsics("distance", 1, AllHalf | AllVecs, HLSLBaseType_Half); + RegisterIntrinsics("distance", 1, AllFloat | AllVecs, HLSLBaseType_Float); + RegisterIntrinsics("distance", 1, AllDouble | AllVecs, HLSLBaseType_Double); - RegisterIntrinsics( "distance_squared", 1, AllHalf | AllVecs, HLSLBaseType_Half); - RegisterIntrinsics( "distance_squared", 1, AllFloat | AllVecs, HLSLBaseType_Float); - RegisterIntrinsics( 
"distance_squared", 1, AllDouble | AllVecs, HLSLBaseType_Double); + RegisterIntrinsics("distance_squared", 1, AllHalf | AllVecs, HLSLBaseType_Half); + RegisterIntrinsics("distance_squared", 1, AllFloat | AllVecs, HLSLBaseType_Float); + RegisterIntrinsics("distance_squared", 1, AllDouble | AllVecs, HLSLBaseType_Double); // ps only - AddIntrinsic( "fwidth", HLSLBaseType_Float, HLSLBaseType_Float2, HLSLBaseType_Float2 ); - + AddIntrinsic("fwidth", HLSLBaseType_Float, HLSLBaseType_Float2, HLSLBaseType_Float2); + // scalar/vec ops - RegisterIntrinsics( "mul", 2, AllFloat | AllVecs | AllMats ); - + RegisterIntrinsics("mul", 2, AllFloat | AllVecs | AllMats); + // scalar mul, since * isn't working on Metal properly // m = s * m - AddIntrinsic( "mul", HLSLBaseType_Float2x2, HLSLBaseType_Float, HLSLBaseType_Float2x2 ); - AddIntrinsic( "mul", HLSLBaseType_Float3x3, HLSLBaseType_Float, HLSLBaseType_Float3x3 ); - AddIntrinsic( "mul", HLSLBaseType_Float4x4, HLSLBaseType_Float, HLSLBaseType_Float4x4 ); - AddIntrinsic( "mul", HLSLBaseType_Float2x2, HLSLBaseType_Float2x2, HLSLBaseType_Float ); - AddIntrinsic( "mul", HLSLBaseType_Float3x3, HLSLBaseType_Float3x3, HLSLBaseType_Float ); - AddIntrinsic( "mul", HLSLBaseType_Float4x4, HLSLBaseType_Float4x4, HLSLBaseType_Float ); - + AddIntrinsic("mul", HLSLBaseType_Float2x2, HLSLBaseType_Float, HLSLBaseType_Float2x2); + AddIntrinsic("mul", HLSLBaseType_Float3x3, HLSLBaseType_Float, HLSLBaseType_Float3x3); + AddIntrinsic("mul", HLSLBaseType_Float4x4, HLSLBaseType_Float, HLSLBaseType_Float4x4); + AddIntrinsic("mul", HLSLBaseType_Float2x2, HLSLBaseType_Float2x2, HLSLBaseType_Float); + AddIntrinsic("mul", HLSLBaseType_Float3x3, HLSLBaseType_Float3x3, HLSLBaseType_Float); + AddIntrinsic("mul", HLSLBaseType_Float4x4, HLSLBaseType_Float4x4, HLSLBaseType_Float); + // v = v * m - AddIntrinsic( "mul", HLSLBaseType_Float2, HLSLBaseType_Float2, HLSLBaseType_Float2x2 ); - AddIntrinsic( "mul", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3x3 ); - AddIntrinsic( "mul", HLSLBaseType_Float4, HLSLBaseType_Float4, HLSLBaseType_Float4x4 ); - AddIntrinsic( "mul", HLSLBaseType_Float2, HLSLBaseType_Float2x2, HLSLBaseType_Float2 ); - AddIntrinsic( "mul", HLSLBaseType_Float3, HLSLBaseType_Float3x3, HLSLBaseType_Float3 ); - AddIntrinsic( "mul", HLSLBaseType_Float4, HLSLBaseType_Float4x4, HLSLBaseType_Float4 ); - + AddIntrinsic("mul", HLSLBaseType_Float2, HLSLBaseType_Float2, HLSLBaseType_Float2x2); + AddIntrinsic("mul", HLSLBaseType_Float3, HLSLBaseType_Float3, HLSLBaseType_Float3x3); + AddIntrinsic("mul", HLSLBaseType_Float4, HLSLBaseType_Float4, HLSLBaseType_Float4x4); + AddIntrinsic("mul", HLSLBaseType_Float2, HLSLBaseType_Float2x2, HLSLBaseType_Float2); + AddIntrinsic("mul", HLSLBaseType_Float3, HLSLBaseType_Float3x3, HLSLBaseType_Float3); + AddIntrinsic("mul", HLSLBaseType_Float4, HLSLBaseType_Float4x4, HLSLBaseType_Float4); + // m = s * m - AddIntrinsic( "mul", HLSLBaseType_Half2x2, HLSLBaseType_Half, HLSLBaseType_Half2x2 ); - AddIntrinsic( "mul", HLSLBaseType_Half3x3, HLSLBaseType_Half, HLSLBaseType_Half3x3 ); - AddIntrinsic( "mul", HLSLBaseType_Half4x4, HLSLBaseType_Half, HLSLBaseType_Half4x4 ); - AddIntrinsic( "mul", HLSLBaseType_Half2x2, HLSLBaseType_Half2x2, HLSLBaseType_Half ); - AddIntrinsic( "mul", HLSLBaseType_Half3x3, HLSLBaseType_Half3x3, HLSLBaseType_Half ); - AddIntrinsic( "mul", HLSLBaseType_Half4x4, HLSLBaseType_Half4x4, HLSLBaseType_Half ); - + AddIntrinsic("mul", HLSLBaseType_Half2x2, HLSLBaseType_Half, HLSLBaseType_Half2x2); + AddIntrinsic("mul", 
HLSLBaseType_Half3x3, HLSLBaseType_Half, HLSLBaseType_Half3x3); + AddIntrinsic("mul", HLSLBaseType_Half4x4, HLSLBaseType_Half, HLSLBaseType_Half4x4); + AddIntrinsic("mul", HLSLBaseType_Half2x2, HLSLBaseType_Half2x2, HLSLBaseType_Half); + AddIntrinsic("mul", HLSLBaseType_Half3x3, HLSLBaseType_Half3x3, HLSLBaseType_Half); + AddIntrinsic("mul", HLSLBaseType_Half4x4, HLSLBaseType_Half4x4, HLSLBaseType_Half); + // v = v * m - AddIntrinsic( "mul", HLSLBaseType_Half2, HLSLBaseType_Half2, HLSLBaseType_Half2x2 ); - AddIntrinsic( "mul", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3x3 ); - AddIntrinsic( "mul", HLSLBaseType_Half4, HLSLBaseType_Half4, HLSLBaseType_Half4x4 ); - AddIntrinsic( "mul", HLSLBaseType_Half2, HLSLBaseType_Half2x2, HLSLBaseType_Half2 ); - AddIntrinsic( "mul", HLSLBaseType_Half3, HLSLBaseType_Half3x3, HLSLBaseType_Half3 ); - AddIntrinsic( "mul", HLSLBaseType_Half4, HLSLBaseType_Half4x4, HLSLBaseType_Half4 ); - + AddIntrinsic("mul", HLSLBaseType_Half2, HLSLBaseType_Half2, HLSLBaseType_Half2x2); + AddIntrinsic("mul", HLSLBaseType_Half3, HLSLBaseType_Half3, HLSLBaseType_Half3x3); + AddIntrinsic("mul", HLSLBaseType_Half4, HLSLBaseType_Half4, HLSLBaseType_Half4x4); + AddIntrinsic("mul", HLSLBaseType_Half2, HLSLBaseType_Half2x2, HLSLBaseType_Half2); + AddIntrinsic("mul", HLSLBaseType_Half3, HLSLBaseType_Half3x3, HLSLBaseType_Half3); + AddIntrinsic("mul", HLSLBaseType_Half4, HLSLBaseType_Half4x4, HLSLBaseType_Half4); + // m = s * m - AddIntrinsic( "mul", HLSLBaseType_Double2x2, HLSLBaseType_Double, HLSLBaseType_Double2x2 ); - AddIntrinsic( "mul", HLSLBaseType_Double3x3, HLSLBaseType_Double, HLSLBaseType_Double3x3 ); - AddIntrinsic( "mul", HLSLBaseType_Double4x4, HLSLBaseType_Double, HLSLBaseType_Double4x4 ); - AddIntrinsic( "mul", HLSLBaseType_Double2x2, HLSLBaseType_Double2x2, HLSLBaseType_Double ); - AddIntrinsic( "mul", HLSLBaseType_Double3x3, HLSLBaseType_Double3x3, HLSLBaseType_Double ); - AddIntrinsic( "mul", HLSLBaseType_Double4x4, HLSLBaseType_Double4x4, HLSLBaseType_Double ); - + AddIntrinsic("mul", HLSLBaseType_Double2x2, HLSLBaseType_Double, HLSLBaseType_Double2x2); + AddIntrinsic("mul", HLSLBaseType_Double3x3, HLSLBaseType_Double, HLSLBaseType_Double3x3); + AddIntrinsic("mul", HLSLBaseType_Double4x4, HLSLBaseType_Double, HLSLBaseType_Double4x4); + AddIntrinsic("mul", HLSLBaseType_Double2x2, HLSLBaseType_Double2x2, HLSLBaseType_Double); + AddIntrinsic("mul", HLSLBaseType_Double3x3, HLSLBaseType_Double3x3, HLSLBaseType_Double); + AddIntrinsic("mul", HLSLBaseType_Double4x4, HLSLBaseType_Double4x4, HLSLBaseType_Double); + // v = v * m - AddIntrinsic( "mul", HLSLBaseType_Double2, HLSLBaseType_Double2, HLSLBaseType_Double2x2 ); - AddIntrinsic( "mul", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3x3 ); - AddIntrinsic( "mul", HLSLBaseType_Double4, HLSLBaseType_Double4, HLSLBaseType_Double4x4 ); - AddIntrinsic( "mul", HLSLBaseType_Double2, HLSLBaseType_Double2x2, HLSLBaseType_Double2 ); - AddIntrinsic( "mul", HLSLBaseType_Double3, HLSLBaseType_Double3x3, HLSLBaseType_Double3 ); - AddIntrinsic( "mul", HLSLBaseType_Double4, HLSLBaseType_Double4x4, HLSLBaseType_Double4 ); - + AddIntrinsic("mul", HLSLBaseType_Double2, HLSLBaseType_Double2, HLSLBaseType_Double2x2); + AddIntrinsic("mul", HLSLBaseType_Double3, HLSLBaseType_Double3, HLSLBaseType_Double3x3); + AddIntrinsic("mul", HLSLBaseType_Double4, HLSLBaseType_Double4, HLSLBaseType_Double4x4); + AddIntrinsic("mul", HLSLBaseType_Double2, HLSLBaseType_Double2x2, HLSLBaseType_Double2); + AddIntrinsic("mul", 
HLSLBaseType_Double3, HLSLBaseType_Double3x3, HLSLBaseType_Double3); + AddIntrinsic("mul", HLSLBaseType_Double4, HLSLBaseType_Double4x4, HLSLBaseType_Double4); + // matrix transpose RegisterIntrinsics("transpose", 1, AllFloats | AllMats); - + // determinant needs to return scalar for all 9 mat types - AddIntrinsic("determinant", HLSLBaseType_Float, HLSLBaseType_Float2x2); - AddIntrinsic("determinant", HLSLBaseType_Float, HLSLBaseType_Float3x3); - AddIntrinsic("determinant", HLSLBaseType_Float, HLSLBaseType_Float4x4); - AddIntrinsic("determinant", HLSLBaseType_Half, HLSLBaseType_Half2x2); - AddIntrinsic("determinant", HLSLBaseType_Half, HLSLBaseType_Half3x3); - AddIntrinsic("determinant", HLSLBaseType_Half, HLSLBaseType_Half4x4); + AddIntrinsic("determinant", HLSLBaseType_Float, HLSLBaseType_Float2x2); + AddIntrinsic("determinant", HLSLBaseType_Float, HLSLBaseType_Float3x3); + AddIntrinsic("determinant", HLSLBaseType_Float, HLSLBaseType_Float4x4); + AddIntrinsic("determinant", HLSLBaseType_Half, HLSLBaseType_Half2x2); + AddIntrinsic("determinant", HLSLBaseType_Half, HLSLBaseType_Half3x3); + AddIntrinsic("determinant", HLSLBaseType_Half, HLSLBaseType_Half4x4); AddIntrinsic("determinant", HLSLBaseType_Double, HLSLBaseType_Double2x2); AddIntrinsic("determinant", HLSLBaseType_Double, HLSLBaseType_Double3x3); AddIntrinsic("determinant", HLSLBaseType_Double, HLSLBaseType_Double4x4); - + #if 0 // TODO: more conversions fp16, double, etc. // MSL can just do simple casts. These are more for getting data in/out @@ -1186,7 +1175,7 @@ bool InitIntrinsics() AddIntrinsic("asuint", HLSLBaseType_Ulong, HLSLBaseType_Uint, HLSLBaseType_Uint); AddIntrinsic("asuint", HLSLBaseType_Uint, HLSLBaseType_Float); #endif - + #if 0 // TODO: get atomics working // these work on atomic_int/uint, then bool/ulong 2.4, @@ -1247,77 +1236,74 @@ bool InitIntrinsics() #endif - // TODO: split off sampler intrinsics from math above // these need to be member functions and have default arg value support - + //------------------------ - + // TODO: need optional offset - + // Cannot use Sample with 2DMS/Array AddTextureIntrinsics("Sample", HLSLBaseType_Texture2D, HLSLBaseType_Float2); // Int2 offset AddTextureIntrinsics("Sample", HLSLBaseType_Texture3D, HLSLBaseType_Float3); // Int3 offset AddTextureIntrinsics("Sample", HLSLBaseType_Texture2DArray, HLSLBaseType_Float3); // Int2 offset - + // these don't have offset AddTextureIntrinsics("Sample", HLSLBaseType_TextureCube, HLSLBaseType_Float3); AddTextureIntrinsics("Sample", HLSLBaseType_TextureCubeArray, HLSLBaseType_Float4); - + // Depth AddDepthIntrinsic("Sample", HLSLBaseType_Float, HLSLBaseType_Depth2D, HLSLBaseType_Float2); // Int2 offset - AddDepthIntrinsic("Sample", HLSLBaseType_Float, HLSLBaseType_Depth2DArray, HLSLBaseType_Float3); // Int2 offset - AddDepthIntrinsic("Sample", HLSLBaseType_Float, HLSLBaseType_DepthCube, HLSLBaseType_Float3); // no offset - + AddDepthIntrinsic("Sample", HLSLBaseType_Float, HLSLBaseType_Depth2DArray, HLSLBaseType_Float3); // Int2 offset + AddDepthIntrinsic("Sample", HLSLBaseType_Float, HLSLBaseType_DepthCube, HLSLBaseType_Float3); // no offset + AddDepthIntrinsic("SampleCmp", HLSLBaseType_Float, HLSLBaseType_Depth2D, HLSLBaseType_Float2, HLSLBaseType_Float); AddDepthIntrinsic("SampleCmp", HLSLBaseType_Float, HLSLBaseType_Depth2DArray, HLSLBaseType_Float3, HLSLBaseType_Float); AddDepthIntrinsic("SampleCmp", HLSLBaseType_Float, HLSLBaseType_DepthCube, HLSLBaseType_Float3, HLSLBaseType_Float); - + // returns float4 w/comparisons, probably only on 
mip0 // TODO: add GatherRed? to read 4 depth values AddDepthIntrinsic("GatherCmp", HLSLBaseType_Float4, HLSLBaseType_Depth2D, HLSLBaseType_Float2, HLSLBaseType_Float); AddDepthIntrinsic("GatherCmp", HLSLBaseType_Float4, HLSLBaseType_Depth2DArray, HLSLBaseType_Float3, HLSLBaseType_Float); AddDepthIntrinsic("GatherCmp", HLSLBaseType_Float4, HLSLBaseType_DepthCube, HLSLBaseType_Float3, HLSLBaseType_Float); - + // one more dimension than Sample AddTextureIntrinsics("SampleLevel", HLSLBaseType_Texture2D, HLSLBaseType_Float2, HLSLBaseType_Float); AddTextureIntrinsics("SampleLevel", HLSLBaseType_Texture3D, HLSLBaseType_Float3, HLSLBaseType_Float); AddTextureIntrinsics("SampleLevel", HLSLBaseType_Texture2DArray, HLSLBaseType_Float3, HLSLBaseType_Float); AddTextureIntrinsics("SampleLevel", HLSLBaseType_TextureCube, HLSLBaseType_Float3, HLSLBaseType_Float); // TEXTURE_INTRINSIC_FUNCTION("SampleLevel", HLSLBaseType_TextureCubeArray, HLSLBaseType_Float4, Float); - + // bias always in w AddTextureIntrinsics("SampleBias", HLSLBaseType_Texture2D, HLSLBaseType_Float2, HLSLBaseType_Float); AddTextureIntrinsics("SampleBias", HLSLBaseType_Texture3D, HLSLBaseType_Float3, HLSLBaseType_Float); AddTextureIntrinsics("SampleBias", HLSLBaseType_Texture2DArray, HLSLBaseType_Float3, HLSLBaseType_Float); - - + // no offset on cube/cubearray AddTextureIntrinsics("SampleBias", HLSLBaseType_TextureCube, HLSLBaseType_Float3, HLSLBaseType_Float); // AddTextureIntrinsics("SampleBias", HLSLBaseType_TextureCubeArray, HLSLBaseType_Float4, Float); - // TODO: for 2D tex (int2 offset is optional, how to indicate that?) // arguments have defaultValue that can be set. - - AddTextureIntrinsics("GatherRed", HLSLBaseType_Texture2D, HLSLBaseType_Float2); - AddTextureIntrinsics("GatherGreen", HLSLBaseType_Texture2D, HLSLBaseType_Float2); - AddTextureIntrinsics("GatherBlue", HLSLBaseType_Texture2D, HLSLBaseType_Float2); - AddTextureIntrinsics("GatherAlpha", HLSLBaseType_Texture2D, HLSLBaseType_Float2); - + + AddTextureIntrinsics("GatherRed", HLSLBaseType_Texture2D, HLSLBaseType_Float2); + AddTextureIntrinsics("GatherGreen", HLSLBaseType_Texture2D, HLSLBaseType_Float2); + AddTextureIntrinsics("GatherBlue", HLSLBaseType_Texture2D, HLSLBaseType_Float2); + AddTextureIntrinsics("GatherAlpha", HLSLBaseType_Texture2D, HLSLBaseType_Float2); + // TODO: add more types cube/3d takes gradient3d in MSL // The Intrinsic ctor would need to have 5 args instead 4 // first move to member functions, then add this with 4 args // AddTextureIntrinsics( "SampleGrad", HLSLBaseType_Texture2D, HLSLBaseType_Float, HLSLBaseType_Float2, HLSLBaseType_Float2, HLSLBaseType_Float2); - + // These constructs are not declaring the lod or offset param which have default AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_Texture2D, HLSLBaseType_Int2); // TODO: needs lod AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_Texture3D, HLSLBaseType_Int3); // TODO: need lod AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_Texture2DArray, HLSLBaseType_Int2); // TODO: needs array, lod - // AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_TextureCube, HLSLBaseType_Int2); // TODO: needs face, lod - // AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_TextureCubeArray, HLSLBaseType_Int2); // TODO: needs face, lod, array + // AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_TextureCube, HLSLBaseType_Int2); // TODO: needs face, lod + // AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, 
HLSLBaseType_TextureCubeArray, HLSLBaseType_Int2); // TODO: needs face, lod, array AddTextureLoadIntrinsic("Load", HLSLBaseType_Float4, HLSLBaseType_Texture2DMS, HLSLBaseType_Int2); // TODO: needs sampleIndex - + // TODO: aren't these uint instead of int? AddTextureIntrinsics("GetDimensions", HLSLBaseType_Texture2D, HLSLBaseType_Int2); AddTextureIntrinsics("GetDimensions", HLSLBaseType_Texture3D, HLSLBaseType_Int3); @@ -1325,90 +1311,87 @@ bool InitIntrinsics() AddTextureIntrinsics("GetDimensions", HLSLBaseType_TextureCube, HLSLBaseType_Int3); AddTextureIntrinsics("GetDimensions", HLSLBaseType_TextureCubeArray, HLSLBaseType_Int3); AddTextureIntrinsics("GetDimensions", HLSLBaseType_Texture2DMS, HLSLBaseType_Int2); - + return true; }; - // The order in this array must match up with HLSLBinaryOp const int _binaryOpPriority[] = { - 2, 1, // &&, || - 8, 8, // +, - - 9, 9, // *, / - 7, 7, // <, >, - 7, 7, // <=, >=, - 6, 6, // ==, != + 2, 1, // &&, || + 8, 8, // +, - + 9, 9, // *, / + 7, 7, // <, >, + 7, 7, // <=, >=, + 6, 6, // ==, != 5, 3, 4, // &, |, ^ - }; - - +}; BaseTypeDescription baseTypeDescriptions[HLSLBaseType_Count]; -void RegisterMatrix(HLSLBaseType type, uint32_t typeOffset, NumericType numericType, int binaryOpRank, const char* typeName, uint32_t dim1, uint32_t dim2) +void RegisterMatrix(HLSLBaseType type, uint32_t typeOffset, NumericType numericType, int binaryOpRank, const char* typeName, uint32_t dim1, uint32_t dim2) { char buf[32]; snprintf(buf, sizeof(buf), "%s%dx%d", typeName, dim1, dim2); const char* name = gStringPool.AddString(buf); - + HLSLBaseType baseType = (HLSLBaseType)(type + typeOffset); - + BaseTypeDescription& desc = baseTypeDescriptions[baseType]; desc.typeName = name; desc.typeNameMetal = name; - + desc.baseType = baseType; desc.coreType = CoreType_Matrix; desc.dimensionType = DimensionType(DimensionType_Matrix2x2 + (dim2 - 2)); desc.numericType = numericType; - + desc.numDimensions = 2; desc.numComponents = dim1; desc.height = dim2; desc.binaryOpRank = binaryOpRank; } -void RegisterVector(HLSLBaseType type, uint32_t typeOffset, NumericType numericType, int binaryOpRank, const char* typeName, uint32_t dim) +void RegisterVector(HLSLBaseType type, uint32_t typeOffset, NumericType numericType, int binaryOpRank, const char* typeName, uint32_t dim) { char buf[32]; snprintf(buf, sizeof(buf), "%s%d", typeName, dim); const char* name = gStringPool.AddString(buf); - + HLSLBaseType baseType = (HLSLBaseType)(type + typeOffset); - + BaseTypeDescription& desc = baseTypeDescriptions[type + typeOffset]; desc.typeName = name; desc.typeNameMetal = name; - + // 4 types desc.baseType = baseType; desc.coreType = CoreType_Vector; desc.dimensionType = DimensionType(DimensionType_Vector2 + (dim - 2)); desc.numericType = numericType; - + desc.numDimensions = 1; desc.numComponents = dim; desc.height = 1; desc.binaryOpRank = binaryOpRank; } -void RegisterScalar(HLSLBaseType type, uint32_t typeOffset, NumericType numericType, int binaryOpRank, const char* typeName) +void RegisterScalar(HLSLBaseType type, uint32_t typeOffset, NumericType numericType, int binaryOpRank, const char* typeName) { const char* name = gStringPool.AddString(typeName); - + HLSLBaseType baseType = (HLSLBaseType)(type + typeOffset); - + BaseTypeDescription& desc = baseTypeDescriptions[baseType]; desc.typeName = name; desc.typeNameMetal = name; - + // 4 types desc.baseType = baseType; desc.coreType = CoreType_Scalar; desc.dimensionType = DimensionType_Scalar; desc.numericType = numericType; - + desc.numDimensions = 0; 
desc.numComponents = 1; desc.height = 1; @@ -1421,7 +1404,7 @@ void RegisterTexture(HLSLBaseType baseType, const char* typeName, const char* ty desc.baseType = baseType; desc.typeName = typeName; desc.typeNameMetal = typeNameMetal; - + desc.coreType = CoreType_Texture; } @@ -1431,7 +1414,7 @@ void RegisterSampler(HLSLBaseType baseType, const char* typeName, const char* ty desc.baseType = baseType; desc.typeName = typeName; desc.typeNameMetal = typeNameMetal; - + desc.coreType = CoreType_Sampler; } @@ -1441,84 +1424,77 @@ void RegisterType(HLSLBaseType baseType, CoreType coreType, const char* typeName desc.baseType = baseType; desc.typeName = typeName; desc.typeNameMetal = typeName; - + desc.coreType = coreType; } - bool InitBaseTypeDescriptions() { { const uint32_t kNumTypes = 3; - const char* typeNames[kNumTypes] = { "float", "half", "double" }; - const HLSLBaseType baseTypes[kNumTypes] = { HLSLBaseType_Float, HLSLBaseType_Half, HLSLBaseType_Double }; - const NumericType numericTypes[kNumTypes] = { NumericType_Float, NumericType_Half, NumericType_Double }; - const int binaryOpRanks[kNumTypes] = { 0, 1, 2 }; - - for (uint32_t i = 0; i < kNumTypes; ++i) - { + const char* typeNames[kNumTypes] = {"float", "half", "double"}; + const HLSLBaseType baseTypes[kNumTypes] = {HLSLBaseType_Float, HLSLBaseType_Half, HLSLBaseType_Double}; + const NumericType numericTypes[kNumTypes] = {NumericType_Float, NumericType_Half, NumericType_Double}; + const int binaryOpRanks[kNumTypes] = {0, 1, 2}; + + for (uint32_t i = 0; i < kNumTypes; ++i) { const char* typeName = typeNames[i]; HLSLBaseType baseType = baseTypes[i]; NumericType numericType = numericTypes[i]; int binaryOpRank = binaryOpRanks[i]; - + RegisterScalar(baseType, 0, numericType, binaryOpRank, typeName); RegisterVector(baseType, 1, numericType, binaryOpRank, typeName, 2); RegisterVector(baseType, 2, numericType, binaryOpRank, typeName, 3); RegisterVector(baseType, 3, numericType, binaryOpRank, typeName, 4); - + RegisterMatrix(baseType, 4, numericType, binaryOpRank, typeName, 2, 2); RegisterMatrix(baseType, 5, numericType, binaryOpRank, typeName, 3, 3); RegisterMatrix(baseType, 6, numericType, binaryOpRank, typeName, 4, 4); } } - + { const uint32_t kNumTypes = 7; const char* typeNames[kNumTypes] = { "int", "uint", "long", "ulong", "short", "ushort", - "bool" - }; + "bool"}; const HLSLBaseType baseTypes[kNumTypes] = { - HLSLBaseType_Int, HLSLBaseType_Uint, + HLSLBaseType_Int, HLSLBaseType_Uint, HLSLBaseType_Long, HLSLBaseType_Ulong, HLSLBaseType_Short, HLSLBaseType_Ushort, - HLSLBaseType_Bool - }; + HLSLBaseType_Bool}; const NumericType numericTypes[kNumTypes] = { - NumericType_Int, NumericType_Uint, - NumericType_Long, NumericType_Ulong, + NumericType_Int, NumericType_Uint, + NumericType_Long, NumericType_Ulong, NumericType_Short, NumericType_Ushort, - NumericType_Bool - }; + NumericType_Bool}; const int binaryOpRanks[kNumTypes] = { 2, 1, // Note: int seems like it should be highest 3, 2, 4, 3, - 4 - }; - - for (uint32_t i = 0; i < kNumTypes; ++i) - { + 4}; + + for (uint32_t i = 0; i < kNumTypes; ++i) { const char* typeName = typeNames[i]; HLSLBaseType baseType = baseTypes[i]; NumericType numericType = numericTypes[i]; int binaryOpRank = binaryOpRanks[i]; - + RegisterScalar(baseType, 0, numericType, binaryOpRank, typeName); RegisterVector(baseType, 1, numericType, binaryOpRank, typeName, 2); RegisterVector(baseType, 2, numericType, binaryOpRank, typeName, 3); RegisterVector(baseType, 3, numericType, binaryOpRank, typeName, 4); } } - + // TODO: add 
u/char, but HLSL2021 doesn't have support, but MSL does - + // TODO: would it be better to use "texture" base type (see "buffer") // and then have a TextureSubType off that? - + // texutres RegisterTexture(HLSLBaseType_Texture2D, "Texture2D", "texture2d"); RegisterTexture(HLSLBaseType_Texture2DArray, "Texture2DArray", "texture2d_array"); @@ -1526,24 +1502,24 @@ bool InitBaseTypeDescriptions() RegisterTexture(HLSLBaseType_TextureCube, "TextureCube", "texturecube"); RegisterTexture(HLSLBaseType_TextureCubeArray, "TextureCubeArray", "texturecube_rray"); RegisterTexture(HLSLBaseType_Texture2DMS, "Texture2DMS", "texture2d_ms"); - + RegisterTexture(HLSLBaseType_Depth2D, "Depth2D", "depth2d"); RegisterTexture(HLSLBaseType_Depth2DArray, "Depth2DArray", "depth2d_array"); RegisterTexture(HLSLBaseType_DepthCube, "DepthCube", "depthcube"); - + RegisterTexture(HLSLBaseType_RWTexture2D, "RWTexture2D", "texture2d"); - + // samplers RegisterSampler(HLSLBaseType_SamplerState, "SamplerState", "sampler"); RegisterSampler(HLSLBaseType_SamplerComparisonState, "SamplerComparisonState", "sampler"); - + RegisterType(HLSLBaseType_UserDefined, CoreType_Struct, "struct"); RegisterType(HLSLBaseType_Void, CoreType_Void, "void"); RegisterType(HLSLBaseType_Unknown, CoreType_None, "unknown"); RegisterType(HLSLBaseType_Expression, CoreType_Expression, "expression"); RegisterType(HLSLBaseType_Comment, CoreType_Comment, "comment"); RegisterType(HLSLBaseType_Buffer, CoreType_Buffer, "buffer"); - + return true; } @@ -1555,124 +1531,129 @@ static bool _initIntrinsics = InitIntrinsics(); HLSLBaseType ArithmeticOpResultType(HLSLBinaryOp binaryOp, HLSLBaseType t1, HLSLBaseType t2) { // check that both are same numeric types - + // add, sub, div are similar // mul is it's own test // most mixing of types is invalid here - - if (IsNumericTypeEqual(t1, t2)) - { + + if (IsNumericTypeEqual(t1, t2)) { bool isSameDimensions = IsDimensionEqual(t1, t2); - - if (IsScalarType(t1) && IsScalarType(t2)) - { + + if (IsScalarType(t1) && IsScalarType(t2)) { if (isSameDimensions) return t1; } - else if (IsVectorType(t1) && IsVectorType(t2)) - { + else if (IsVectorType(t1) && IsVectorType(t2)) { if (isSameDimensions) return t1; } - else if (IsMatrixType(t1) && IsMatrixType(t2)) - { + else if (IsMatrixType(t1) && IsMatrixType(t2)) { if (isSameDimensions) return t1; } - + else if ((binaryOp == HLSLBinaryOp_Add || binaryOp == HLSLBinaryOp_Sub) && - (IsScalarType(t1) || IsScalarType(t2))) - { + (IsScalarType(t1) || IsScalarType(t2))) { // allow v + 1, and 1 - v return (IsVectorType(t1) || IsMatrixType(t1)) ? t1 : t2; } - + else if ((binaryOp == HLSLBinaryOp_Mul || binaryOp == HLSLBinaryOp_Div) && - (IsScalarType(t1) || IsScalarType(t2))) - { + (IsScalarType(t1) || IsScalarType(t2))) { // v * s return (IsVectorType(t1) || IsMatrixType(t1)) ? t1 : t2; } - + // this has to check dimension across the mul - else if (binaryOp == HLSLBinaryOp_Mul) - { + else if (binaryOp == HLSLBinaryOp_Mul) { bool isSameCrossDimension = IsCrossDimensionEqual(t1, t2); - - if (IsMatrixType(t1) && IsVectorType(t2)) - { + + if (IsMatrixType(t1) && IsVectorType(t2)) { if (isSameCrossDimension) return t2; } - else if (IsVectorType(t1) && IsMatrixType(t2)) - { + else if (IsVectorType(t1) && IsMatrixType(t2)) { if (isSameCrossDimension) return t1; } } } - + return HLSLBaseType_Unknown; } - + // Priority of the ? : operator. 
const int _conditionalOpPriority = 1; const char* GetTypeNameHLSL(const HLSLType& type) { - if (type.baseType == HLSLBaseType_UserDefined) - { + if (type.baseType == HLSLBaseType_UserDefined) { return type.typeName; } - else - { + else { return baseTypeDescriptions[type.baseType].typeName; } } const char* GetTypeNameMetal(const HLSLType& type) { - if (type.baseType == HLSLBaseType_UserDefined) - { + if (type.baseType == HLSLBaseType_UserDefined) { return type.typeName; } - else - { + else { return baseTypeDescriptions[type.baseType].typeNameMetal; } } static const char* GetBinaryOpName(HLSLBinaryOp binaryOp) { - switch (binaryOp) - { - case HLSLBinaryOp_And: return "&&"; - case HLSLBinaryOp_Or: return "||"; - - case HLSLBinaryOp_Add: return "+"; - case HLSLBinaryOp_Sub: return "-"; - case HLSLBinaryOp_Mul: return "*"; - case HLSLBinaryOp_Div: return "/"; - - case HLSLBinaryOp_Less: return "<"; - case HLSLBinaryOp_Greater: return ">"; - case HLSLBinaryOp_LessEqual: return "<="; - case HLSLBinaryOp_GreaterEqual: return ">="; - case HLSLBinaryOp_Equal: return "=="; - case HLSLBinaryOp_NotEqual: return "!="; - - case HLSLBinaryOp_BitAnd: return "&"; - case HLSLBinaryOp_BitOr: return "|"; - case HLSLBinaryOp_BitXor: return "^"; - - case HLSLBinaryOp_Assign: return "="; - case HLSLBinaryOp_AddAssign: return "+="; - case HLSLBinaryOp_SubAssign: return "-="; - case HLSLBinaryOp_MulAssign: return "*="; - case HLSLBinaryOp_DivAssign: return "/="; - default: - ASSERT(false); - return "???"; + switch (binaryOp) { + case HLSLBinaryOp_And: + return "&&"; + case HLSLBinaryOp_Or: + return "||"; + + case HLSLBinaryOp_Add: + return "+"; + case HLSLBinaryOp_Sub: + return "-"; + case HLSLBinaryOp_Mul: + return "*"; + case HLSLBinaryOp_Div: + return "/"; + + case HLSLBinaryOp_Less: + return "<"; + case HLSLBinaryOp_Greater: + return ">"; + case HLSLBinaryOp_LessEqual: + return "<="; + case HLSLBinaryOp_GreaterEqual: + return ">="; + case HLSLBinaryOp_Equal: + return "=="; + case HLSLBinaryOp_NotEqual: + return "!="; + + case HLSLBinaryOp_BitAnd: + return "&"; + case HLSLBinaryOp_BitOr: + return "|"; + case HLSLBinaryOp_BitXor: + return "^"; + + case HLSLBinaryOp_Assign: + return "="; + case HLSLBinaryOp_AddAssign: + return "+="; + case HLSLBinaryOp_SubAssign: + return "-="; + case HLSLBinaryOp_MulAssign: + return "*="; + case HLSLBinaryOp_DivAssign: + return "/="; + default: + ASSERT(false); + return "???"; } } - /* * 1.) Match * 2.) Scalar dimension promotion (scalar -> vector/matrix) @@ -1680,21 +1661,19 @@ static const char* GetBinaryOpName(HLSLBinaryOp binaryOp) * 4.) Conversion + scalar dimension promotion * 5.) Truncation (vector -> scalar or lower component vector, matrix -> scalar or lower component matrix) * 6.) 
Conversion + truncation - */ -static int GetTypeCastRank(HLSLTree * tree, const HLSLType& srcType, const HLSLType& dstType) + */ +static int GetTypeCastRank(HLSLTree* tree, const HLSLType& srcType, const HLSLType& dstType) { /*if (srcType.array != dstType.array || srcType.arraySize != dstType.arraySize) { return -1; }*/ - if (srcType.array != dstType.array) - { + if (srcType.array != dstType.array) { return -1; } - if (srcType.array == true) - { + if (srcType.array == true) { ASSERT(dstType.array == true); int srcArraySize = -1; int dstArraySize = -1; @@ -1707,103 +1686,86 @@ static int GetTypeCastRank(HLSLTree * tree, const HLSLType& srcType, const HLSLT } } - if (srcType.baseType == HLSLBaseType_UserDefined && dstType.baseType == HLSLBaseType_UserDefined) - { + if (srcType.baseType == HLSLBaseType_UserDefined && dstType.baseType == HLSLBaseType_UserDefined) { return String_Equal(srcType.typeName, dstType.typeName) ? 0 : -1; } - if (srcType.baseType == dstType.baseType) - { + if (srcType.baseType == dstType.baseType) { // This only works if textures are half or float, but not hwne // there are more varied texture that can be cast. - if (IsTextureType(srcType.baseType)) - { + if (IsTextureType(srcType.baseType)) { return srcType.formatType == dstType.formatType ? 0 : -1; } - + return 0; } const BaseTypeDescription& srcDesc = baseTypeDescriptions[srcType.baseType]; const BaseTypeDescription& dstDesc = baseTypeDescriptions[dstType.baseType]; - if (srcDesc.numericType == NumericType_NaN || dstDesc.numericType == NumericType_NaN) - { + if (srcDesc.numericType == NumericType_NaN || dstDesc.numericType == NumericType_NaN) { return -1; } // Result bits: T R R R P (T = truncation, R = conversion rank, P = dimension promotion) int result = _numberTypeRank[srcDesc.numericType][dstDesc.numericType] << 1; - if (srcDesc.numDimensions == 0 && dstDesc.numDimensions > 0) - { + if (srcDesc.numDimensions == 0 && dstDesc.numDimensions > 0) { // Scalar dimension promotion result |= (1 << 0); } else if ((srcDesc.numDimensions == dstDesc.numDimensions && (srcDesc.numComponents > dstDesc.numComponents || srcDesc.height > dstDesc.height)) || - (srcDesc.numDimensions > 0 && dstDesc.numDimensions == 0)) - { + (srcDesc.numDimensions > 0 && dstDesc.numDimensions == 0)) { // Truncation result |= (1 << 4); } else if (srcDesc.numDimensions != dstDesc.numDimensions || srcDesc.numComponents != dstDesc.numComponents || - srcDesc.height != dstDesc.height) - { + srcDesc.height != dstDesc.height) { // Can't convert return -1; } - + return result; - } static bool GetFunctionCallCastRanks(HLSLTree* tree, const HLSLFunctionCall* call, const HLSLFunction* function, int* rankBuffer) { - - if (function == NULL || function->numArguments < call->numArguments) - { + if (function == NULL || function->numArguments < call->numArguments) { // Function not viable return false; } const HLSLExpression* expression = call->argument; const HLSLArgument* argument = function->argument; - - for (int i = 0; i < call->numArguments; ++i) - { + + for (int i = 0; i < call->numArguments; ++i) { int rank = GetTypeCastRank(tree, expression->expressionType, argument->type); - if (rank == -1) - { + if (rank == -1) { return false; } rankBuffer[i] = rank; - + argument = argument->nextArgument; expression = expression->nextExpression; } - for (int i = call->numArguments; i < function->numArguments; ++i) - { - if (argument->defaultValue == NULL) - { + for (int i = call->numArguments; i < function->numArguments; ++i) { + if (argument->defaultValue == NULL) { // 
Function not viable. return false; } } return true; - } -struct CompareRanks -{ - bool operator() (const int& rank1, const int& rank2) { return rank1 > rank2; } +struct CompareRanks { + bool operator()(const int& rank1, const int& rank2) { return rank1 > rank2; } }; static CompareFunctionsResult CompareFunctions(HLSLTree* tree, const HLSLFunctionCall* call, const HLSLFunction* function1, const HLSLFunction* function2) -{ - +{ int* function1Ranks = static_cast(alloca(sizeof(int) * call->numArguments)); int* function2Ranks = static_cast(alloca(sizeof(int) * call->numArguments)); @@ -1811,85 +1773,70 @@ static CompareFunctionsResult CompareFunctions(HLSLTree* tree, const HLSLFunctio const bool function2Viable = GetFunctionCallCastRanks(tree, call, function2, function2Ranks); // Both functions have to be viable to be able to compare them - if (!(function1Viable && function2Viable)) - { - if (function1Viable) - { + if (!(function1Viable && function2Viable)) { + if (function1Viable) { return Function1Better; } - else if (function2Viable) - { + else if (function2Viable) { return Function2Better; } - else - { + else { return FunctionsEqual; } } std::sort(function1Ranks, function1Ranks + call->numArguments, CompareRanks()); std::sort(function2Ranks, function2Ranks + call->numArguments, CompareRanks()); - - for (int i = 0; i < call->numArguments; ++i) - { - if (function1Ranks[i] < function2Ranks[i]) - { + + for (int i = 0; i < call->numArguments; ++i) { + if (function1Ranks[i] < function2Ranks[i]) { return Function1Better; } - else if (function2Ranks[i] < function1Ranks[i]) - { + else if (function2Ranks[i] < function1Ranks[i]) { return Function2Better; } } return FunctionsEqual; - } static bool GetBinaryOpResultType(HLSLBinaryOp binaryOp, const HLSLType& type1, const HLSLType& type2, HLSLType& result) { // only allow numeric types for binary operators if (!IsNumericType(type1.baseType) || type1.array || - !IsNumericType(type2.baseType) || type2.array) - { - return false; + !IsNumericType(type2.baseType) || type2.array) { + return false; } - if (IsBitOp(binaryOp)) - { - if (!IsIntegerType(type1.baseType)) - { + if (IsBitOp(binaryOp)) { + if (!IsIntegerType(type1.baseType)) { return false; } } - if (IsLogicOp(binaryOp) || IsCompareOp(binaryOp)) - { - int numComponents = std::max( baseTypeDescriptions[ type1.baseType ].numComponents, baseTypeDescriptions[ type2.baseType ].numComponents ); - result.baseType = HLSLBaseType( HLSLBaseType_Bool + numComponents - 1 ); + if (IsLogicOp(binaryOp) || IsCompareOp(binaryOp)) { + int numComponents = std::max(baseTypeDescriptions[type1.baseType].numComponents, baseTypeDescriptions[type2.baseType].numComponents); + result.baseType = HLSLBaseType(HLSLBaseType_Bool + numComponents - 1); } - else - { + else { // TODO: allso mulAssign, ... assert(!IsAssignOp(binaryOp)); - + result.baseType = ArithmeticOpResultType(binaryOp, type1.baseType, type2.baseType); } - result.typeName = NULL; - result.array = false; - result.arraySize = NULL; - result.flags = (type1.flags & type2.flags) & HLSLTypeFlag_Const; // Propagate constness. - - return result.baseType != HLSLBaseType_Unknown; + result.typeName = NULL; + result.array = false; + result.arraySize = NULL; + result.flags = (type1.flags & type2.flags) & HLSLTypeFlag_Const; // Propagate constness. 
+ return result.baseType != HLSLBaseType_Unknown; } -HLSLParser::HLSLParser(Allocator* allocator, const char* fileName, const char* buffer, size_t length) : - m_tokenizer(fileName, buffer, length), - m_userTypes(allocator), - m_variables(allocator), - m_functions(allocator) +HLSLParser::HLSLParser(Allocator* allocator, const char* fileName, const char* buffer, size_t length) : m_tokenizer(fileName, buffer, length), + m_userTypes(allocator), + m_variables(allocator), + m_functions(allocator) { m_numGlobals = 0; m_tree = NULL; @@ -1897,18 +1844,16 @@ HLSLParser::HLSLParser(Allocator* allocator, const char* fileName, const char* b bool HLSLParser::Accept(int token) { - if (m_tokenizer.GetToken() == token) - { - m_tokenizer.Next(); - return true; + if (m_tokenizer.GetToken() == token) { + m_tokenizer.Next(); + return true; } return false; } bool HLSLParser::Accept(const char* token) { - if (m_tokenizer.GetToken() == HLSLToken_Identifier && String_Equal( token, m_tokenizer.GetIdentifier() ) ) - { + if (m_tokenizer.GetToken() == HLSLToken_Identifier && String_Equal(token, m_tokenizer.GetIdentifier())) { m_tokenizer.Next(); return true; } @@ -1917,8 +1862,7 @@ bool HLSLParser::Accept(const char* token) bool HLSLParser::Expect(int token) { - if (!Accept(token)) - { + if (!Accept(token)) { char want[HLSLTokenizer::s_maxIdentifier]; m_tokenizer.GetTokenName(token, want); char near[HLSLTokenizer::s_maxIdentifier]; @@ -1929,11 +1873,10 @@ bool HLSLParser::Expect(int token) return true; } -bool HLSLParser::Expect(const char * token) +bool HLSLParser::Expect(const char* token) { - if (!Accept(token)) - { - const char * want = token; + if (!Accept(token)) { + const char* want = token; char near[HLSLTokenizer::s_maxIdentifier]; m_tokenizer.GetTokenName(near); m_tokenizer.Error("Syntax error: expected '%s' near '%s'", want, near); @@ -1942,12 +1885,10 @@ bool HLSLParser::Expect(const char * token) return true; } - bool HLSLParser::AcceptIdentifier(const char*& identifier) { - if (m_tokenizer.GetToken() == HLSLToken_Identifier) - { - identifier = m_tree->AddString( m_tokenizer.GetIdentifier() ); + if (m_tokenizer.GetToken() == HLSLToken_Identifier) { + identifier = m_tree->AddString(m_tokenizer.GetIdentifier()); m_tokenizer.Next(); return true; } @@ -1956,8 +1897,7 @@ bool HLSLParser::AcceptIdentifier(const char*& identifier) bool HLSLParser::ExpectIdentifier(const char*& identifier) { - if (!AcceptIdentifier(identifier)) - { + if (!AcceptIdentifier(identifier)) { char near[HLSLTokenizer::s_maxIdentifier] = {}; m_tokenizer.GetTokenName(near); m_tokenizer.Error("Syntax error: expected identifier near '%s'", near); @@ -1969,8 +1909,7 @@ bool HLSLParser::ExpectIdentifier(const char*& identifier) bool HLSLParser::AcceptFloat(float& value) { - if (m_tokenizer.GetToken() == HLSLToken_FloatLiteral) - { + if (m_tokenizer.GetToken() == HLSLToken_FloatLiteral) { value = m_tokenizer.GetFloat(); m_tokenizer.Next(); return true; @@ -1980,19 +1919,17 @@ bool HLSLParser::AcceptFloat(float& value) bool HLSLParser::AcceptHalf(float& value) { - if(m_tokenizer.GetToken() == HLSLToken_HalfLiteral) - { - value = m_tokenizer.GetFloat(); - m_tokenizer.Next(); - return true; - } - return false; + if (m_tokenizer.GetToken() == HLSLToken_HalfLiteral) { + value = m_tokenizer.GetFloat(); + m_tokenizer.Next(); + return true; + } + return false; } bool HLSLParser::AcceptInt(int& value) { - if (m_tokenizer.GetToken() == HLSLToken_IntLiteral) - { + if (m_tokenizer.GetToken() == HLSLToken_IntLiteral) { value = m_tokenizer.GetInt(); 
m_tokenizer.Next(); return true; @@ -2002,12 +1939,12 @@ bool HLSLParser::AcceptInt(int& value) bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) { - HLSLAttribute * attributes = NULL; + HLSLAttribute* attributes = NULL; ParseAttributeBlock(attributes); - int line = GetLineNumber(); + int line = GetLineNumber(); const char* fileName = GetFileName(); - + HLSLType type; //HLSLBaseType type; //const char* typeName = NULL; @@ -2015,31 +1952,26 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) // TODO: this cast likely isn't safe HLSLToken token = (HLSLToken)m_tokenizer.GetToken(); - + bool doesNotExpectSemicolon = false; // Alec add comment - if (ParseComment(statement)) - { + if (ParseComment(statement)) { doesNotExpectSemicolon = true; } - else if (Accept(HLSLToken_Struct)) - { + else if (Accept(HLSLToken_Struct)) { // Struct declaration. const char* structName = NULL; - if (!ExpectIdentifier(structName)) - { + if (!ExpectIdentifier(structName)) { return false; } - if (FindUserDefinedType(structName) != NULL) - { + if (FindUserDefinedType(structName) != NULL) { m_tokenizer.Error("struct %s already defined", structName); return false; } - if (!Expect('{')) - { + if (!Expect('{')) { return false; } @@ -2047,30 +1979,25 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) structure->name = structName; m_userTypes.PushBack(structure); - + HLSLStructField* lastField = NULL; // Add the struct to our list of user defined types. - while (!Accept('}')) - { - if (CheckForUnexpectedEndOfStream('}')) - { + while (!Accept('}')) { + if (CheckForUnexpectedEndOfStream('}')) { return false; } - + // chain fields onto struct HLSLStructField* field = NULL; - if (!ParseFieldDeclaration(field)) - { + if (!ParseFieldDeclaration(field)) { return false; } ASSERT(field != NULL); - if (lastField == NULL) - { + if (lastField == NULL) { structure->field = field; } - else - { + else { lastField->nextField = field; } lastField = field; @@ -2082,71 +2009,62 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) Accept(HLSLToken_StructuredBuffer) || Accept(HLSLToken_RWStructuredBuffer) || Accept(HLSLToken_ByteAddressBuffer) || - Accept(HLSLToken_RWByteAddressBuffer)) - { + Accept(HLSLToken_RWByteAddressBuffer)) { HLSLBuffer* buffer = m_tree->AddNode(fileName, line); - + // these can appear on t or u slots for read vs. read/write // need to track what the user specified. Load vs. Store calls. buffer->bufferType = ConvertTokenToBufferType(token); - + // Is template struct type required? - if (Expect('<')) - { + if (Expect('<')) { const char* structName = nullptr; - + // Read the templated type, should reference a struct // don't need to support fields on this. - if (!ExpectIdentifier(structName) || !Expect('>')) - { + if (!ExpectIdentifier(structName) || !Expect('>')) { return false; } - + buffer->bufferStruct = const_cast(FindUserDefinedType(structName)); - if (!buffer->bufferStruct) - { + if (!buffer->bufferStruct) { return false; } } - + // get name of buffer AcceptIdentifier(buffer->name); - + // Parse ": register(t0/u0)" - if (Accept(':')) - { - if (!Expect(HLSLToken_Register) || !Expect('(') || !ExpectIdentifier(buffer->registerName) || !Expect(')')) - { + if (Accept(':')) { + if (!Expect(HLSLToken_Register) || !Expect('(') || !ExpectIdentifier(buffer->registerName) || !Expect(')')) { return false; } // TODO: Check that we aren't re-using a register. } - + // Buffer needs to show up to reference the fields // of the struct of the templated type. 
HLSLType bufferType(HLSLBaseType_UserDefined); bufferType.typeName = buffer->bufferStruct->name; // this is for userDefined name (f.e. struct) - - DeclareVariable( buffer->name, bufferType ); - + + DeclareVariable(buffer->name, bufferType); + // TODO: add fields as variables too? - + statement = buffer; } - else if (Accept(HLSLToken_CBuffer) || Accept(HLSLToken_TBuffer)) - { + else if (Accept(HLSLToken_CBuffer) || Accept(HLSLToken_TBuffer)) { // cbuffer/tbuffer declaration. HLSLBuffer* buffer = m_tree->AddNode(fileName, line); AcceptIdentifier(buffer->name); buffer->bufferType = ConvertTokenToBufferType(token); - + // Optional register assignment. - if (Accept(':')) - { - if (!Expect(HLSLToken_Register) || !Expect('(') || !ExpectIdentifier(buffer->registerName) || !Expect(')')) - { + if (Accept(':')) { + if (!Expect(HLSLToken_Register) || !Expect('(') || !ExpectIdentifier(buffer->registerName) || !Expect(')')) { return false; } // TODO: Check that we aren't re-using a register. @@ -2155,90 +2073,76 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) // Fields are defined inside the c/tbuffer. // These represent globals to the rest of the codebase which // is simply evil. - - if (!Expect('{')) - { + + if (!Expect('{')) { return false; } HLSLDeclaration* lastField = NULL; - while (!Accept('}')) - { - if (CheckForUnexpectedEndOfStream('}')) - { + while (!Accept('}')) { + if (CheckForUnexpectedEndOfStream('}')) { return false; } - + // TODO: can't convert statement to fields - if (ParseComment(statement)) - { + if (ParseComment(statement)) { continue; } - + HLSLDeclaration* field = NULL; - if (!ParseDeclaration(field)) - { + if (!ParseDeclaration(field)) { m_tokenizer.Error("Expected variable declaration"); return false; } - + // These show up as global variables of the fields - DeclareVariable( field->name, field->type ); - + DeclareVariable(field->name, field->type); + // chain fields onto buffer field->buffer = buffer; - if (buffer->field == NULL) - { + if (buffer->field == NULL) { buffer->field = field; } - else - { + else { lastField->nextStatement = field; } lastField = field; - + if (!Expect(';')) { return false; } - } statement = buffer; } - else if (AcceptType(true, type)) - { + else if (AcceptType(true, type)) { // Global declaration (uniform or function). const char* globalName = NULL; - if (!ExpectIdentifier(globalName)) - { + if (!ExpectIdentifier(globalName)) { return false; } - if (Accept('(')) - { + if (Accept('(')) { // Function declaration. 
HLSLFunction* function = m_tree->AddNode(fileName, line); - function->name = globalName; - function->returnType.baseType = type.baseType; - function->returnType.typeName = type.typeName; - function->attributes = attributes; + function->name = globalName; + function->returnType.baseType = type.baseType; + function->returnType.typeName = type.typeName; + function->attributes = attributes; BeginScope(); - if (!ParseArgumentList(function->argument, function->numArguments, function->numOutputArguments)) - { + if (!ParseArgumentList(function->argument, function->numArguments, function->numOutputArguments)) { return false; } const HLSLFunction* declaration = FindFunction(function); // Forward declaration - if (Accept(';')) - { + if (Accept(';')) { // Add a function entry so that calls can refer to it - if (!declaration) - { - m_functions.PushBack( function ); + if (!declaration) { + m_functions.PushBack(function); statement = function; } EndScope(); @@ -2246,28 +2150,23 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) } // Optional semantic. - if (Accept(':') && !ExpectIdentifier(function->semantic)) - { + if (Accept(':') && !ExpectIdentifier(function->semantic)) { return false; } - if (declaration) - { - if (declaration->forward || declaration->statement) - { + if (declaration) { + if (declaration->forward || declaration->statement) { m_tokenizer.Error("Duplicate function definition"); return false; } const_cast(declaration)->forward = function; } - else - { - m_functions.PushBack( function ); + else { + m_functions.PushBack(function); } - if (!Expect('{') || !ParseBlock(function->statement, function->returnType)) - { + if (!Expect('{') || !ParseBlock(function->statement, function->returnType)) { return false; } @@ -2275,23 +2174,19 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) // Note, no semi-colon at the end of a function declaration. statement = function; - + return true; } - else - { + else { // Uniform declaration. HLSLDeclaration* declaration = m_tree->AddNode(fileName, line); - declaration->name = globalName; - declaration->type = type; + declaration->name = globalName; + declaration->type = type; // Handle array syntax. - if (Accept('[')) - { - if (!Accept(']')) - { - if (!ParseExpression(declaration->type.arraySize) || !Expect(']')) - { + if (Accept('[')) { + if (!Accept(']')) { + if (!ParseExpression(declaration->type.arraySize) || !Expect(']')) { return false; } } @@ -2299,31 +2194,28 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) } // Handle optional register. - if (Accept(':')) - { + if (Accept(':')) { // @@ Currently we support either a semantic or a register, but not both. if (AcceptIdentifier(declaration->semantic)) { // int k = 1; } - else if (!Expect(HLSLToken_Register) || !Expect('(') || !ExpectIdentifier(declaration->registerName) || !Expect(')')) - { + else if (!Expect(HLSLToken_Register) || !Expect('(') || !ExpectIdentifier(declaration->registerName) || !Expect(')')) { return false; } } - DeclareVariable( globalName, declaration->type ); + DeclareVariable(globalName, declaration->type); - if (!ParseDeclarationAssignment(declaration)) - { + if (!ParseDeclarationAssignment(declaration)) { return false; } // TODO: Multiple variables declared on one line. 
- + statement = declaration; } } - + /* // These three are from .fx file syntax else if (ParseTechnique(statement)) { @@ -2336,7 +2228,7 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) doesNotExpectSemicolon = true; } */ - + if (statement != NULL) { statement->attributes = attributes; } @@ -2344,28 +2236,22 @@ bool HLSLParser::ParseTopLevel(HLSLStatement*& statement) return doesNotExpectSemicolon || Expect(';'); } -bool HLSLParser::ParseStatementOrBlock(HLSLStatement*& firstStatement, const HLSLType& returnType, bool scoped/*=true*/) +bool HLSLParser::ParseStatementOrBlock(HLSLStatement*& firstStatement, const HLSLType& returnType, bool scoped /*=true*/) { - if (scoped) - { + if (scoped) { BeginScope(); } - if (Accept('{')) - { - if (!ParseBlock(firstStatement, returnType)) - { + if (Accept('{')) { + if (!ParseBlock(firstStatement, returnType)) { return false; } } - else - { - if (!ParseStatement(firstStatement, returnType)) - { + else { + if (!ParseStatement(firstStatement, returnType)) { return false; } } - if (scoped) - { + if (scoped) { EndScope(); } return true; @@ -2375,21 +2261,21 @@ bool HLSLParser::ParseComment(HLSLStatement*& statement) { if (m_tokenizer.GetToken() != HLSLToken_Comment) return false; - + const char* textName = m_tree->AddString(m_tokenizer.GetComment()); - + // This has already parsed the next comment before have had a chance to // grab the string from the previous comment, if they were sequenential comments. // So grabbing a copy of comment before this parses the next comment. if (!Accept(HLSLToken_Comment)) return false; - + const char* fileName = GetFileName(); - int line = GetLineNumber(); + int line = GetLineNumber(); HLSLComment* comment = m_tree->AddNode(fileName, line); comment->text = textName; - + // pass it back statement = comment; return true; @@ -2398,33 +2284,27 @@ bool HLSLParser::ParseComment(HLSLStatement*& statement) bool HLSLParser::ParseBlock(HLSLStatement*& firstStatement, const HLSLType& returnType) { HLSLStatement* lastStatement = NULL; - while (!Accept('}')) - { - if (CheckForUnexpectedEndOfStream('}')) - { + while (!Accept('}')) { + if (CheckForUnexpectedEndOfStream('}')) { return false; } - + HLSLStatement* statement = NULL; - - if (!ParseStatement(statement, returnType)) - { + + if (!ParseStatement(statement, returnType)) { return false; } - + // chain statements onto the list - if (statement != NULL) - { - if (firstStatement == NULL) - { + if (statement != NULL) { + if (firstStatement == NULL) { firstStatement = statement; } - else - { + else { lastStatement->nextStatement = statement; } lastStatement = statement; - + // some statement parsing can gen more than one statement, so find end while (lastStatement->nextStatement) lastStatement = lastStatement->nextStatement; @@ -2436,16 +2316,15 @@ bool HLSLParser::ParseBlock(HLSLStatement*& firstStatement, const HLSLType& retu bool HLSLParser::ParseStatement(HLSLStatement*& statement, const HLSLType& returnType) { const char* fileName = GetFileName(); - int line = GetLineNumber(); + int line = GetLineNumber(); // Empty statements. - if (Accept(';')) - { + if (Accept(';')) { return true; } - HLSLAttribute * attributes = NULL; - ParseAttributeBlock(attributes); // @@ Leak if not assigned to node? + HLSLAttribute* attributes = NULL; + ParseAttributeBlock(attributes); // @@ Leak if not assigned to node? #if 0 // @@ Work in progress. 
@@ -2518,78 +2397,64 @@ bool HLSLParser::ParseStatement(HLSLStatement*& statement, const HLSLType& retur } */ #endif - - if (ParseComment(statement)) - { + + if (ParseComment(statement)) { return true; } - + // If statement. - if (Accept(HLSLToken_If)) - { + if (Accept(HLSLToken_If)) { HLSLIfStatement* ifStatement = m_tree->AddNode(fileName, line); ifStatement->attributes = attributes; - if (!Expect('(') || !ParseExpression(ifStatement->condition) || !Expect(')')) - { + if (!Expect('(') || !ParseExpression(ifStatement->condition) || !Expect(')')) { return false; } statement = ifStatement; - if (!ParseStatementOrBlock(ifStatement->statement, returnType)) - { + if (!ParseStatementOrBlock(ifStatement->statement, returnType)) { return false; } - if (Accept(HLSLToken_Else)) - { + if (Accept(HLSLToken_Else)) { return ParseStatementOrBlock(ifStatement->elseStatement, returnType); } return true; } - + // For statement. - if (Accept(HLSLToken_For)) - { + if (Accept(HLSLToken_For)) { HLSLForStatement* forStatement = m_tree->AddNode(fileName, line); forStatement->attributes = attributes; - if (!Expect('(')) - { + if (!Expect('(')) { return false; } BeginScope(); - if (!ParseDeclaration(forStatement->initialization)) - { + if (!ParseDeclaration(forStatement->initialization)) { return false; } - if (!Expect(';')) - { + if (!Expect(';')) { return false; } ParseExpression(forStatement->condition); - if (!Expect(';')) - { + if (!Expect(';')) { return false; } ParseExpression(forStatement->increment); - if (!Expect(')')) - { + if (!Expect(')')) { return false; } statement = forStatement; - if (!ParseStatementOrBlock(forStatement->statement, returnType)) - { + if (!ParseStatementOrBlock(forStatement->statement, returnType)) { return false; } EndScope(); return true; } - if (attributes != NULL) - { + if (attributes != NULL) { // @@ Error. Unexpected attribute. We only support attributes associated to if and for statements. } // Block statement. - if (Accept('{')) - { + if (Accept('{')) { HLSLBlockStatement* blockStatement = m_tree->AddNode(fileName, line); statement = blockStatement; BeginScope(); @@ -2599,41 +2464,35 @@ bool HLSLParser::ParseStatement(HLSLStatement*& statement, const HLSLType& retur } // Discard statement. - if (Accept(HLSLToken_Discard)) - { + if (Accept(HLSLToken_Discard)) { HLSLDiscardStatement* discardStatement = m_tree->AddNode(fileName, line); statement = discardStatement; return Expect(';'); } // Break statement. - if (Accept(HLSLToken_Break)) - { + if (Accept(HLSLToken_Break)) { HLSLBreakStatement* breakStatement = m_tree->AddNode(fileName, line); statement = breakStatement; return Expect(';'); } // Continue statement. - if (Accept(HLSLToken_Continue)) - { + if (Accept(HLSLToken_Continue)) { HLSLContinueStatement* continueStatement = m_tree->AddNode(fileName, line); statement = continueStatement; return Expect(';'); } // Return statement - if (Accept(HLSLToken_Return)) - { + if (Accept(HLSLToken_Return)) { HLSLReturnStatement* returnStatement = m_tree->AddNode(fileName, line); - if (!Accept(';') && !ParseExpression(returnStatement->expression)) - { + if (!Accept(';') && !ParseExpression(returnStatement->expression)) { return false; } // Check that the return expression can be cast to the return type of the function. HLSLType voidType(HLSLBaseType_Void); - if (!CheckTypeCast(returnStatement->expression ? returnStatement->expression->expressionType : voidType, returnType)) - { + if (!CheckTypeCast(returnStatement->expression ? 
returnStatement->expression->expressionType : voidType, returnType)) { return false; } @@ -2642,14 +2501,12 @@ bool HLSLParser::ParseStatement(HLSLStatement*& statement, const HLSLType& retur } HLSLDeclaration* declaration = NULL; - HLSLExpression* expression = NULL; + HLSLExpression* expression = NULL; - if (ParseDeclaration(declaration)) - { + if (ParseDeclaration(declaration)) { statement = declaration; } - else if (ParseExpression(expression)) - { + else if (ParseExpression(expression)) { HLSLExpressionStatement* expressionStatement; expressionStatement = m_tree->AddNode(fileName, line); expressionStatement->expression = expression; @@ -2659,55 +2516,49 @@ bool HLSLParser::ParseStatement(HLSLStatement*& statement, const HLSLType& retur return Expect(';'); } - // IC: This is only used in block statements, or within control flow statements. So, it doesn't support semantics or layout modifiers. // @@ We should add suport for semantics for inline input/output declarations. bool HLSLParser::ParseDeclaration(HLSLDeclaration*& declaration) { - const char* fileName = GetFileName(); - int line = GetLineNumber(); + const char* fileName = GetFileName(); + int line = GetLineNumber(); HLSLType type; - if (!AcceptType(/*allowVoid=*/false, type)) - { + if (!AcceptType(/*allowVoid=*/false, type)) { return false; } - bool allowUnsizedArray = true; // This is needed for SSBO - - HLSLDeclaration * firstDeclaration = NULL; - HLSLDeclaration * lastDeclaration = NULL; + bool allowUnsizedArray = true; // This is needed for SSBO + + HLSLDeclaration* firstDeclaration = NULL; + HLSLDeclaration* lastDeclaration = NULL; do { const char* name; - if (!ExpectIdentifier(name)) - { + if (!ExpectIdentifier(name)) { // TODO: false means we didn't accept a declaration and we had an error! return false; } // Handle array syntax. - if (Accept('[')) - { + if (Accept('[')) { type.array = true; // Optionally allow no size to the specified for the array. - if (Accept(']') && allowUnsizedArray) - { + if (Accept(']') && allowUnsizedArray) { return true; } - if (!ParseExpression(type.arraySize) || !Expect(']')) - { + if (!ParseExpression(type.arraySize) || !Expect(']')) { return false; } } - HLSLDeclaration * parsedDeclaration = m_tree->AddNode(fileName, line); - parsedDeclaration->type = type; - parsedDeclaration->name = name; + HLSLDeclaration* parsedDeclaration = m_tree->AddNode(fileName, line); + parsedDeclaration->type = type; + parsedDeclaration->name = name; - DeclareVariable( parsedDeclaration->name, parsedDeclaration->type ); + DeclareVariable(parsedDeclaration->name, parsedDeclaration->type); // Handle option assignment of the declared variables(s). - if (!ParseDeclarationAssignment( parsedDeclaration )) { + if (!ParseDeclarationAssignment(parsedDeclaration)) { return false; } @@ -2715,7 +2566,7 @@ bool HLSLParser::ParseDeclaration(HLSLDeclaration*& declaration) if (lastDeclaration != NULL) lastDeclaration->nextDeclaration = parsedDeclaration; lastDeclaration = parsedDeclaration; - } while(Accept(',')); + } while (Accept(',')); declaration = firstDeclaration; @@ -2724,26 +2575,22 @@ bool HLSLParser::ParseDeclaration(HLSLDeclaration*& declaration) bool HLSLParser::ParseDeclarationAssignment(HLSLDeclaration* declaration) { - if (Accept('=')) - { + if (Accept('=')) { // Handle array initialization syntax. 
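That is, an "= { ... }" initializer on a declaration whose type was parsed with array brackets; ParseExpressionList is called with allowEmptyEnd set, so a trailing comma before the closing brace is tolerated. An illustrative HLSL declaration this path accepts (hypothetical, not from the shipped shaders):

    float kWeights[3] = { 0.25, 0.5, 1.0, };  // trailing comma is accepted
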
- if (declaration->type.array) - { + if (declaration->type.array) { int numValues = 0; - if (!Expect('{') || !ParseExpressionList('}', true, declaration->assignment, numValues)) - { + if (!Expect('{') || !ParseExpressionList('}', true, declaration->assignment, numValues)) { return false; } } -// else if (IsSamplerType(declaration->type.baseType)) // TODO: should be for SamplerStateBlock, not Sampler -// { -// if (!ParseSamplerState(declaration->assignment)) -// { -// return false; -// } -// } - else if (!ParseExpression(declaration->assignment)) - { + // else if (IsSamplerType(declaration->type.baseType)) // TODO: should be for SamplerStateBlock, not Sampler + // { + // if (!ParseSamplerState(declaration->assignment)) + // { + // return false; + // } + // } + else if (!ParseExpression(declaration->assignment)) { return false; } } @@ -2752,16 +2599,13 @@ bool HLSLParser::ParseDeclarationAssignment(HLSLDeclaration* declaration) bool HLSLParser::ParseFieldDeclaration(HLSLStructField*& field) { - field = m_tree->AddNode( GetFileName(), GetLineNumber() ); - if (!ExpectDeclaration(false, field->type, field->name)) - { + field = m_tree->AddNode(GetFileName(), GetLineNumber()); + if (!ExpectDeclaration(false, field->type, field->name)) { return false; } // Handle optional semantics. - if (Accept(':')) - { - if (!ExpectIdentifier(field->semantic)) - { + if (Accept(':')) { + if (!ExpectIdentifier(field->semantic)) { return false; } } @@ -2795,8 +2639,7 @@ bool HLSLParser::ParseFieldDeclaration(HLSLStructField*& field) bool HLSLParser::CheckTypeCast(const HLSLType& srcType, const HLSLType& dstType) { - if (GetTypeCastRank(m_tree, srcType, dstType) == -1) - { + if (GetTypeCastRank(m_tree, srcType, dstType) == -1) { const char* srcTypeName = GetTypeNameHLSL(srcType); const char* dstTypeName = GetTypeNameHLSL(dstType); m_tokenizer.Error("Cannot implicitly convert from '%s' to '%s'", srcTypeName, dstTypeName); @@ -2807,17 +2650,14 @@ bool HLSLParser::CheckTypeCast(const HLSLType& srcType, const HLSLType& dstType) bool HLSLParser::ParseExpression(HLSLExpression*& expression) { - if (!ParseBinaryExpression(0, expression)) - { + if (!ParseBinaryExpression(0, expression)) { return false; } HLSLBinaryOp assignOp; - if (AcceptAssign(assignOp)) - { + if (AcceptAssign(assignOp)) { HLSLExpression* expression2 = NULL; - if (!ParseExpression(expression2)) - { + if (!ParseExpression(expression2)) { return false; } HLSLBinaryExpression* binaryExpression = m_tree->AddNode(expression->fileName, expression->line); @@ -2828,8 +2668,7 @@ bool HLSLParser::ParseExpression(HLSLExpression*& expression) // However, for our usage of the types it should be sufficient. 
binaryExpression->expressionType = expression->expressionType; - if (!CheckTypeCast(expression2->expressionType, expression->expressionType)) - { + if (!CheckTypeCast(expression2->expressionType, expression->expressionType)) { const char* srcTypeName = GetTypeNameHLSL(expression2->expressionType); const char* dstTypeName = GetTypeNameHLSL(expression->expressionType); m_tokenizer.Error("Cannot implicitly convert from '%s' to '%s'", srcTypeName, dstTypeName); @@ -2845,28 +2684,56 @@ bool HLSLParser::ParseExpression(HLSLExpression*& expression) bool HLSLParser::AcceptBinaryOperator(int priority, HLSLBinaryOp& binaryOp) { int token = m_tokenizer.GetToken(); - switch (token) - { - case HLSLToken_LogicalAnd: binaryOp = HLSLBinaryOp_And; break; - case HLSLToken_LogicalOr: binaryOp = HLSLBinaryOp_Or; break; - case '+': binaryOp = HLSLBinaryOp_Add; break; - case '-': binaryOp = HLSLBinaryOp_Sub; break; - case '*': binaryOp = HLSLBinaryOp_Mul; break; - case '/': binaryOp = HLSLBinaryOp_Div; break; - case '<': binaryOp = HLSLBinaryOp_Less; break; - case '>': binaryOp = HLSLBinaryOp_Greater; break; - case HLSLToken_LessEqual: binaryOp = HLSLBinaryOp_LessEqual; break; - case HLSLToken_GreaterEqual: binaryOp = HLSLBinaryOp_GreaterEqual; break; - case HLSLToken_EqualEqual: binaryOp = HLSLBinaryOp_Equal; break; - case HLSLToken_NotEqual: binaryOp = HLSLBinaryOp_NotEqual; break; - case '&': binaryOp = HLSLBinaryOp_BitAnd; break; - case '|': binaryOp = HLSLBinaryOp_BitOr; break; - case '^': binaryOp = HLSLBinaryOp_BitXor; break; - default: - return false; + switch (token) { + case HLSLToken_LogicalAnd: + binaryOp = HLSLBinaryOp_And; + break; + case HLSLToken_LogicalOr: + binaryOp = HLSLBinaryOp_Or; + break; + case '+': + binaryOp = HLSLBinaryOp_Add; + break; + case '-': + binaryOp = HLSLBinaryOp_Sub; + break; + case '*': + binaryOp = HLSLBinaryOp_Mul; + break; + case '/': + binaryOp = HLSLBinaryOp_Div; + break; + case '<': + binaryOp = HLSLBinaryOp_Less; + break; + case '>': + binaryOp = HLSLBinaryOp_Greater; + break; + case HLSLToken_LessEqual: + binaryOp = HLSLBinaryOp_LessEqual; + break; + case HLSLToken_GreaterEqual: + binaryOp = HLSLBinaryOp_GreaterEqual; + break; + case HLSLToken_EqualEqual: + binaryOp = HLSLBinaryOp_Equal; + break; + case HLSLToken_NotEqual: + binaryOp = HLSLBinaryOp_NotEqual; + break; + case '&': + binaryOp = HLSLBinaryOp_BitAnd; + break; + case '|': + binaryOp = HLSLBinaryOp_BitOr; + break; + case '^': + binaryOp = HLSLBinaryOp_BitXor; + break; + default: + return false; } - if (_binaryOpPriority[binaryOp] > priority) - { + if (_binaryOpPriority[binaryOp] > priority) { m_tokenizer.Next(); return true; } @@ -2876,32 +2743,25 @@ bool HLSLParser::AcceptBinaryOperator(int priority, HLSLBinaryOp& binaryOp) bool HLSLParser::AcceptUnaryOperator(bool pre, HLSLUnaryOp& unaryOp) { int token = m_tokenizer.GetToken(); - if (token == HLSLToken_PlusPlus) - { + if (token == HLSLToken_PlusPlus) { unaryOp = pre ? HLSLUnaryOp_PreIncrement : HLSLUnaryOp_PostIncrement; } - else if (token == HLSLToken_MinusMinus) - { + else if (token == HLSLToken_MinusMinus) { unaryOp = pre ? 
HLSLUnaryOp_PreDecrement : HLSLUnaryOp_PostDecrement; } - else if (pre && token == '-') - { + else if (pre && token == '-') { unaryOp = HLSLUnaryOp_Negative; } - else if (pre && token == '+') - { + else if (pre && token == '+') { unaryOp = HLSLUnaryOp_Positive; } - else if (pre && token == '!') - { + else if (pre && token == '!') { unaryOp = HLSLUnaryOp_Not; } - else if (pre && token == '~') - { + else if (pre && token == '~') { unaryOp = HLSLUnaryOp_Not; } - else - { + else { return false; } m_tokenizer.Next(); @@ -2910,28 +2770,22 @@ bool HLSLParser::AcceptUnaryOperator(bool pre, HLSLUnaryOp& unaryOp) bool HLSLParser::AcceptAssign(HLSLBinaryOp& binaryOp) { - if (Accept('=')) - { + if (Accept('=')) { binaryOp = HLSLBinaryOp_Assign; } - else if (Accept(HLSLToken_PlusEqual)) - { + else if (Accept(HLSLToken_PlusEqual)) { binaryOp = HLSLBinaryOp_AddAssign; } - else if (Accept(HLSLToken_MinusEqual)) - { + else if (Accept(HLSLToken_MinusEqual)) { binaryOp = HLSLBinaryOp_SubAssign; - } - else if (Accept(HLSLToken_TimesEqual)) - { + } + else if (Accept(HLSLToken_TimesEqual)) { binaryOp = HLSLBinaryOp_MulAssign; - } - else if (Accept(HLSLToken_DivideEqual)) - { + } + else if (Accept(HLSLToken_DivideEqual)) { binaryOp = HLSLBinaryOp_DivAssign; - } - else - { + } + else { return false; } return true; @@ -2940,89 +2794,77 @@ bool HLSLParser::AcceptAssign(HLSLBinaryOp& binaryOp) bool HLSLParser::ParseBinaryExpression(int priority, HLSLExpression*& expression) { const char* fileName = GetFileName(); - int line = GetLineNumber(); + int line = GetLineNumber(); bool needsEndParen; - if (!ParseTerminalExpression(expression, needsEndParen)) - { + if (!ParseTerminalExpression(expression, needsEndParen)) { return false; } - // reset priority cause openned parenthesis - if( needsEndParen ) - priority = 0; + // reset priority cause openned parenthesis + if (needsEndParen) + priority = 0; - while (1) - { + while (1) { HLSLBinaryOp binaryOp; - if (AcceptBinaryOperator(priority, binaryOp)) - { - + if (AcceptBinaryOperator(priority, binaryOp)) { HLSLExpression* expression2 = NULL; - ASSERT( binaryOp < sizeof(_binaryOpPriority) / sizeof(int) ); - if (!ParseBinaryExpression(_binaryOpPriority[binaryOp], expression2)) - { + ASSERT(binaryOp < sizeof(_binaryOpPriority) / sizeof(int)); + if (!ParseBinaryExpression(_binaryOpPriority[binaryOp], expression2)) { return false; } HLSLBinaryExpression* binaryExpression = m_tree->AddNode(fileName, line); - binaryExpression->binaryOp = binaryOp; + binaryExpression->binaryOp = binaryOp; binaryExpression->expression1 = expression; binaryExpression->expression2 = expression2; - if (!GetBinaryOpResultType( binaryOp, expression->expressionType, expression2->expressionType, binaryExpression->expressionType )) - { - const char* typeName1 = GetTypeNameHLSL( binaryExpression->expression1->expressionType ); - const char* typeName2 = GetTypeNameHLSL( binaryExpression->expression2->expressionType ); + if (!GetBinaryOpResultType(binaryOp, expression->expressionType, expression2->expressionType, binaryExpression->expressionType)) { + const char* typeName1 = GetTypeNameHLSL(binaryExpression->expression1->expressionType); + const char* typeName2 = GetTypeNameHLSL(binaryExpression->expression2->expressionType); m_tokenizer.Error("binary '%s' : no global operator found which takes types '%s' and '%s' (or there is no acceptable conversion)", - GetBinaryOpName(binaryOp), typeName1, typeName2); + GetBinaryOpName(binaryOp), typeName1, typeName2); return false; } - + // Propagate constness. 
binaryExpression->expressionType.flags = (expression->expressionType.flags | expression2->expressionType.flags) & HLSLTypeFlag_Const; - + expression = binaryExpression; } - else if (_conditionalOpPriority > priority && Accept('?')) - { - + else if (_conditionalOpPriority > priority && Accept('?')) { HLSLConditionalExpression* conditionalExpression = m_tree->AddNode(fileName, line); conditionalExpression->condition = expression; - + HLSLExpression* expression1 = NULL; HLSLExpression* expression2 = NULL; - if (!ParseBinaryExpression(_conditionalOpPriority, expression1) || !Expect(':') || !ParseBinaryExpression(_conditionalOpPriority, expression2)) - { + if (!ParseBinaryExpression(_conditionalOpPriority, expression1) || !Expect(':') || !ParseBinaryExpression(_conditionalOpPriority, expression2)) { return false; } // Make sure both cases have compatible types. - if (GetTypeCastRank(m_tree, expression1->expressionType, expression2->expressionType) == -1) - { + if (GetTypeCastRank(m_tree, expression1->expressionType, expression2->expressionType) == -1) { const char* srcTypeName = GetTypeNameHLSL(expression2->expressionType); const char* dstTypeName = GetTypeNameHLSL(expression1->expressionType); m_tokenizer.Error("':' no possible conversion from from '%s' to '%s'", srcTypeName, dstTypeName); return false; } - conditionalExpression->trueExpression = expression1; + conditionalExpression->trueExpression = expression1; conditionalExpression->falseExpression = expression2; - conditionalExpression->expressionType = expression1->expressionType; + conditionalExpression->expressionType = expression1->expressionType; expression = conditionalExpression; } - else - { + else { break; } - if( needsEndParen ) - { - if( !Expect( ')' ) ) - return false; - needsEndParen = false; - } + if (needsEndParen) { + if (!Expect(')')) + return false; + needsEndParen = false; + } } return !needsEndParen || Expect(')'); @@ -3031,16 +2873,15 @@ bool HLSLParser::ParseBinaryExpression(int priority, HLSLExpression*& expression bool HLSLParser::ParsePartialConstructor(HLSLExpression*& expression, HLSLBaseType type, const char* typeName) { const char* fileName = GetFileName(); - int line = GetLineNumber(); + int line = GetLineNumber(); HLSLConstructorExpression* constructorExpression = m_tree->AddNode(fileName, line); constructorExpression->type.baseType = type; constructorExpression->type.typeName = typeName; int numArguments = 0; - if (!ParseExpressionList(')', false, constructorExpression->argument, numArguments)) - { + if (!ParseExpressionList(')', false, constructorExpression->argument, numArguments)) { return false; - } + } constructorExpression->expressionType = constructorExpression->type; constructorExpression->expressionType.flags = HLSLTypeFlag_Const; expression = constructorExpression; @@ -3050,53 +2891,44 @@ bool HLSLParser::ParsePartialConstructor(HLSLExpression*& expression, HLSLBaseTy bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& needsEndParen) { const char* fileName = GetFileName(); - int line = GetLineNumber(); + int line = GetLineNumber(); needsEndParen = false; HLSLUnaryOp unaryOp; - if (AcceptUnaryOperator(true, unaryOp)) - { + if (AcceptUnaryOperator(true, unaryOp)) { HLSLUnaryExpression* unaryExpression = m_tree->AddNode(fileName, line); unaryExpression->unaryOp = unaryOp; - if (!ParseTerminalExpression(unaryExpression->expression, needsEndParen)) - { + if (!ParseTerminalExpression(unaryExpression->expression, needsEndParen)) { return false; } - if (unaryOp == 
HLSLUnaryOp_BitNot) - { - if (!IsIntegerType(unaryExpression->expression->expressionType.baseType)) - { - const char * typeName = GetTypeNameHLSL(unaryExpression->expression->expressionType); + if (unaryOp == HLSLUnaryOp_BitNot) { + if (!IsIntegerType(unaryExpression->expression->expressionType.baseType)) { + const char* typeName = GetTypeNameHLSL(unaryExpression->expression->expressionType); m_tokenizer.Error("unary '~' : no global operator found which takes type '%s' (or there is no acceptable conversion)", typeName); return false; } } - if (unaryOp == HLSLUnaryOp_Not) - { + if (unaryOp == HLSLUnaryOp_Not) { unaryExpression->expressionType = HLSLType(HLSLBaseType_Bool); - + // Propagate constness. unaryExpression->expressionType.flags = unaryExpression->expression->expressionType.flags & HLSLTypeFlag_Const; } - else - { + else { unaryExpression->expressionType = unaryExpression->expression->expressionType; } expression = unaryExpression; return true; } - + // Expressions inside parenthesis or casts. - if (Accept('(')) - { + if (Accept('(')) { // Check for a casting operator. HLSLType type; - if (AcceptType(false, type)) - { + if (AcceptType(false, type)) { // This is actually a type constructor like (float2(... - if (Accept('(')) - { + if (Accept('(')) { needsEndParen = true; return ParsePartialConstructor(expression, type.baseType, type.typeName); } @@ -3106,43 +2938,38 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need castingExpression->expressionType = type; return Expect(')') && ParseExpression(castingExpression->expression); } - - if (!ParseExpression(expression) || !Expect(')')) - { + + if (!ParseExpression(expression) || !Expect(')')) { return false; } } - else - { + else { // Terminal values. float fValue = 0.0f; - int iValue = 0; - + int iValue = 0; + // literals - if (AcceptFloat(fValue)) - { + if (AcceptFloat(fValue)) { HLSLLiteralExpression* literalExpression = m_tree->AddNode(fileName, line); - literalExpression->type = HLSLBaseType_Float; + literalExpression->type = HLSLBaseType_Float; literalExpression->fValue = fValue; literalExpression->expressionType.baseType = literalExpression->type; literalExpression->expressionType.flags = HLSLTypeFlag_Const; expression = literalExpression; return true; } - if(AcceptHalf(fValue)) - { - HLSLLiteralExpression* literalExpression = m_tree->AddNode( fileName, line ); - literalExpression->type = HLSLBaseType_Half; - literalExpression->fValue = fValue; - literalExpression->expressionType.baseType = literalExpression->type; - literalExpression->expressionType.flags = HLSLTypeFlag_Const; - expression = literalExpression; - return true; - } - if (AcceptInt(iValue)) - { + if (AcceptHalf(fValue)) { + HLSLLiteralExpression* literalExpression = m_tree->AddNode(fileName, line); + literalExpression->type = HLSLBaseType_Half; + literalExpression->fValue = fValue; + literalExpression->expressionType.baseType = literalExpression->type; + literalExpression->expressionType.flags = HLSLTypeFlag_Const; + expression = literalExpression; + return true; + } + if (AcceptInt(iValue)) { HLSLLiteralExpression* literalExpression = m_tree->AddNode(fileName, line); - literalExpression->type = HLSLBaseType_Int; + literalExpression->type = HLSLBaseType_Int; literalExpression->iValue = iValue; literalExpression->expressionType.baseType = literalExpression->type; literalExpression->expressionType.flags = HLSLTypeFlag_Const; @@ -3150,22 +2977,20 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need return 
true; } // TODO: need uint, u/short, double - + // boolean - if (Accept(HLSLToken_True)) - { + if (Accept(HLSLToken_True)) { HLSLLiteralExpression* literalExpression = m_tree->AddNode(fileName, line); - literalExpression->type = HLSLBaseType_Bool; + literalExpression->type = HLSLBaseType_Bool; literalExpression->bValue = true; literalExpression->expressionType.baseType = literalExpression->type; literalExpression->expressionType.flags = HLSLTypeFlag_Const; expression = literalExpression; return true; } - if (Accept(HLSLToken_False)) - { + if (Accept(HLSLToken_False)) { HLSLLiteralExpression* literalExpression = m_tree->AddNode(fileName, line); - literalExpression->type = HLSLBaseType_Bool; + literalExpression->type = HLSLBaseType_Bool; literalExpression->bValue = false; literalExpression->expressionType.baseType = literalExpression->type; literalExpression->expressionType.flags = HLSLTypeFlag_Const; @@ -3175,47 +3000,37 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need // Type constructor. HLSLType type; - if (AcceptType(/*allowVoid=*/false, type)) - { + if (AcceptType(/*allowVoid=*/false, type)) { Expect('('); - if (!ParsePartialConstructor(expression, type.baseType, type.typeName)) - { + if (!ParsePartialConstructor(expression, type.baseType, type.typeName)) { return false; } } - else - { + else { HLSLIdentifierExpression* identifierExpression = m_tree->AddNode(fileName, line); - if (!ExpectIdentifier(identifierExpression->name)) - { + if (!ExpectIdentifier(identifierExpression->name)) { return false; } bool undeclaredIdentifier = false; - + const HLSLType* identifierType = FindVariable(identifierExpression->name, identifierExpression->global); - if (identifierType != NULL) - { + if (identifierType != NULL) { identifierExpression->expressionType = *identifierType; } - else - { - if (GetIsFunction(identifierExpression->name)) - { + else { + if (GetIsFunction(identifierExpression->name)) { // Functions are always global scope. // TODO: what about member functions? identifierExpression->global = true; } - else - { + else { undeclaredIdentifier = true; } } - if (undeclaredIdentifier) - { - if (m_allowUndeclaredIdentifiers) - { + if (undeclaredIdentifier) { + if (m_allowUndeclaredIdentifiers) { HLSLLiteralExpression* literalExpression = m_tree->AddNode(fileName, line); literalExpression->bValue = false; literalExpression->type = HLSLBaseType_Bool; @@ -3223,8 +3038,7 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need literalExpression->expressionType.flags = HLSLTypeFlag_Const; expression = literalExpression; } - else - { + else { m_tokenizer.Error("Undeclared identifier '%s'", identifierExpression->name); return false; } @@ -3236,14 +3050,12 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need } bool done = false; - while (!done) - { + while (!done) { done = true; // Post fix unary operator HLSLUnaryOp unaryOp2; - while (AcceptUnaryOperator(false, unaryOp2)) - { + while (AcceptUnaryOperator(false, unaryOp2)) { HLSLUnaryExpression* unaryExpression = m_tree->AddNode(fileName, line); unaryExpression->unaryOp = unaryOp2; unaryExpression->expression = expression; @@ -3253,204 +3065,188 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need } // Member access operator. 
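This covers both plain member or swizzle access and member function calls: for something like "v.xyz" the identifier after '.' becomes an HLSLMemberAccess whose type comes from GetMemberType, while for "tex.Sample(s, uv)" the '(' branch below builds an HLSLMemberFunctionCall and resolves it against the member functions (mostly intrinsics) via MatchFunctionCall. As the early return notes, chained member functions are not supported yet.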
- while (Accept('.')) - { + while (Accept('.')) { // member or member function const char* text = NULL; - if (!ExpectIdentifier(text)) - { + if (!ExpectIdentifier(text)) { return false; } - + //const HLSLMemberFuction* memberFunction = FindMemberFunction(text); //if (function != NULL) { // check parent type, and args to see if it's a match - + // copied from intrinsic lookup at end - if (Accept('(')) - { + if (Accept('(')) { HLSLMemberFunctionCall* functionCall = m_tree->AddNode(fileName, line); - + done = false; - + // parse the args - if (!ParseExpressionList(')', false, functionCall->argument, functionCall->numArguments)) - { + if (!ParseExpressionList(')', false, functionCall->argument, functionCall->numArguments)) { return false; } - - if (expression->nodeType != HLSLNodeType_IdentifierExpression) - { + + if (expression->nodeType != HLSLNodeType_IdentifierExpression) { m_tokenizer.Error("Expected function identifier"); return false; } - + // This is "tex" of tex.Sample(...) const HLSLIdentifierExpression* identifierExpression = static_cast(expression); - + // TODO: what if it's a chain of member functions? functionCall->memberIdentifier = identifierExpression; - + // TODO: lookup texture, buffer, struct for identiferExpression // TODO: prob need formatType to match half/float return type. - + // TODO: could lookup only float memberFunctions if spirv // which can't handle fp16 samplers. - + // This is matching to a member function (mostly intrinsics) - const HLSLFunction* function = MatchFunctionCall( functionCall, text, &identifierExpression->expressionType ); - if (function == NULL) - { + const HLSLFunction* function = MatchFunctionCall(functionCall, text, &identifierExpression->expressionType); + if (function == NULL) { return false; } - + functionCall->function = function; functionCall->expressionType = function->returnType; - + // or is it the identiferExpression? expression = functionCall; - + // for now don't allow chained member functions return true; } - - } - //else - { - // member variable - HLSLMemberAccess* memberAccess = m_tree->AddNode(fileName, line); - memberAccess->object = expression; - memberAccess->field = text; - - if (!GetMemberType(expression->expressionType, memberAccess)) - { - m_tokenizer.Error("Couldn't access '%s'", memberAccess->field); - - // this leaks memberAccess allocated above, but - // all allocated from single allocator, so just free/reset that - return false; - } - expression = memberAccess; - done = false; - } + } + //else + { + // member variable + HLSLMemberAccess* memberAccess = m_tree->AddNode(fileName, line); + memberAccess->object = expression; + memberAccess->field = text; + + if (!GetMemberType(expression->expressionType, memberAccess)) { + m_tokenizer.Error("Couldn't access '%s'", memberAccess->field); + + // this leaks memberAccess allocated above, but + // all allocated from single allocator, so just free/reset that + return false; + } + expression = memberAccess; + done = false; + } } // Handle array access. 
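Indexing then produces the element type: declared arrays and user-defined buffer types keep their base type with the array flag adjusted, and for the built-in vector and matrix types the switch below picks the element, so indexing a float4 yields a float and indexing a float4x4 yields a float4.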
- while (Accept('[')) - { + while (Accept('[')) { HLSLArrayAccess* arrayAccess = m_tree->AddNode(fileName, line); arrayAccess->array = expression; - if (!ParseExpression(arrayAccess->index) || !Expect(']')) - { + if (!ParseExpression(arrayAccess->index) || !Expect(']')) { return false; } - if (expression->expressionType.baseType == HLSLBaseType_UserDefined) - { + if (expression->expressionType.baseType == HLSLBaseType_UserDefined) { // some buffer types (!IsGlobalFields) have array notation arrayAccess->expressionType.baseType = HLSLBaseType_UserDefined; arrayAccess->expressionType.typeName = expression->expressionType.typeName; - arrayAccess->expressionType.array = true; + arrayAccess->expressionType.array = true; arrayAccess->expressionType.arraySize = NULL; - } - else if (expression->expressionType.array) - { + else if (expression->expressionType.array) { arrayAccess->expressionType = expression->expressionType; - arrayAccess->expressionType.array = false; + arrayAccess->expressionType.array = false; arrayAccess->expressionType.arraySize = NULL; } - else - { - switch (expression->expressionType.baseType) - { - case HLSLBaseType_Float2: - case HLSLBaseType_Float3: - case HLSLBaseType_Float4: - arrayAccess->expressionType.baseType = HLSLBaseType_Float; - break; - case HLSLBaseType_Float2x2: - arrayAccess->expressionType.baseType = HLSLBaseType_Float2; - break; - case HLSLBaseType_Float3x3: - arrayAccess->expressionType.baseType = HLSLBaseType_Float3; - break; - case HLSLBaseType_Float4x4: - arrayAccess->expressionType.baseType = HLSLBaseType_Float4; - break; - - case HLSLBaseType_Half2: - case HLSLBaseType_Half3: - case HLSLBaseType_Half4: - arrayAccess->expressionType.baseType = HLSLBaseType_Half; - break; - case HLSLBaseType_Half2x2: - arrayAccess->expressionType.baseType = HLSLBaseType_Half2; - break; - case HLSLBaseType_Half3x3: - arrayAccess->expressionType.baseType = HLSLBaseType_Half3; - break; - case HLSLBaseType_Half4x4: - arrayAccess->expressionType.baseType = HLSLBaseType_Half4; - break; - - case HLSLBaseType_Double2: - case HLSLBaseType_Double3: - case HLSLBaseType_Double4: - arrayAccess->expressionType.baseType = HLSLBaseType_Double; - break; - case HLSLBaseType_Double2x2: - arrayAccess->expressionType.baseType = HLSLBaseType_Double2; - break; - case HLSLBaseType_Double3x3: - arrayAccess->expressionType.baseType = HLSLBaseType_Double3; - break; - case HLSLBaseType_Double4x4: - arrayAccess->expressionType.baseType = HLSLBaseType_Double4; - break; - - - case HLSLBaseType_Int2: - case HLSLBaseType_Int3: - case HLSLBaseType_Int4: - arrayAccess->expressionType.baseType = HLSLBaseType_Int; - break; - case HLSLBaseType_Uint2: - case HLSLBaseType_Uint3: - case HLSLBaseType_Uint4: - arrayAccess->expressionType.baseType = HLSLBaseType_Uint; - break; - case HLSLBaseType_Bool2: - case HLSLBaseType_Bool3: - case HLSLBaseType_Bool4: - arrayAccess->expressionType.baseType = HLSLBaseType_Bool; - break; - case HLSLBaseType_Ushort2: - case HLSLBaseType_Ushort3: - case HLSLBaseType_Ushort4: - arrayAccess->expressionType.baseType = HLSLBaseType_Ushort; - break; - case HLSLBaseType_Short2: - case HLSLBaseType_Short3: - case HLSLBaseType_Short4: - arrayAccess->expressionType.baseType = HLSLBaseType_Short; - break; - case HLSLBaseType_Ulong2: - case HLSLBaseType_Ulong3: - case HLSLBaseType_Ulong4: - arrayAccess->expressionType.baseType = HLSLBaseType_Ulong; - break; - case HLSLBaseType_Long2: - case HLSLBaseType_Long3: - case HLSLBaseType_Long4: - arrayAccess->expressionType.baseType = 
HLSLBaseType_Long; - break; - - // TODO: u/char - default: - m_tokenizer.Error("array, matrix, vector, or indexable object type expected in index expression"); - return false; + else { + switch (expression->expressionType.baseType) { + case HLSLBaseType_Float2: + case HLSLBaseType_Float3: + case HLSLBaseType_Float4: + arrayAccess->expressionType.baseType = HLSLBaseType_Float; + break; + case HLSLBaseType_Float2x2: + arrayAccess->expressionType.baseType = HLSLBaseType_Float2; + break; + case HLSLBaseType_Float3x3: + arrayAccess->expressionType.baseType = HLSLBaseType_Float3; + break; + case HLSLBaseType_Float4x4: + arrayAccess->expressionType.baseType = HLSLBaseType_Float4; + break; + + case HLSLBaseType_Half2: + case HLSLBaseType_Half3: + case HLSLBaseType_Half4: + arrayAccess->expressionType.baseType = HLSLBaseType_Half; + break; + case HLSLBaseType_Half2x2: + arrayAccess->expressionType.baseType = HLSLBaseType_Half2; + break; + case HLSLBaseType_Half3x3: + arrayAccess->expressionType.baseType = HLSLBaseType_Half3; + break; + case HLSLBaseType_Half4x4: + arrayAccess->expressionType.baseType = HLSLBaseType_Half4; + break; + + case HLSLBaseType_Double2: + case HLSLBaseType_Double3: + case HLSLBaseType_Double4: + arrayAccess->expressionType.baseType = HLSLBaseType_Double; + break; + case HLSLBaseType_Double2x2: + arrayAccess->expressionType.baseType = HLSLBaseType_Double2; + break; + case HLSLBaseType_Double3x3: + arrayAccess->expressionType.baseType = HLSLBaseType_Double3; + break; + case HLSLBaseType_Double4x4: + arrayAccess->expressionType.baseType = HLSLBaseType_Double4; + break; + + case HLSLBaseType_Int2: + case HLSLBaseType_Int3: + case HLSLBaseType_Int4: + arrayAccess->expressionType.baseType = HLSLBaseType_Int; + break; + case HLSLBaseType_Uint2: + case HLSLBaseType_Uint3: + case HLSLBaseType_Uint4: + arrayAccess->expressionType.baseType = HLSLBaseType_Uint; + break; + case HLSLBaseType_Bool2: + case HLSLBaseType_Bool3: + case HLSLBaseType_Bool4: + arrayAccess->expressionType.baseType = HLSLBaseType_Bool; + break; + case HLSLBaseType_Ushort2: + case HLSLBaseType_Ushort3: + case HLSLBaseType_Ushort4: + arrayAccess->expressionType.baseType = HLSLBaseType_Ushort; + break; + case HLSLBaseType_Short2: + case HLSLBaseType_Short3: + case HLSLBaseType_Short4: + arrayAccess->expressionType.baseType = HLSLBaseType_Short; + break; + case HLSLBaseType_Ulong2: + case HLSLBaseType_Ulong3: + case HLSLBaseType_Ulong4: + arrayAccess->expressionType.baseType = HLSLBaseType_Ulong; + break; + case HLSLBaseType_Long2: + case HLSLBaseType_Long3: + case HLSLBaseType_Long4: + arrayAccess->expressionType.baseType = HLSLBaseType_Long; + break; + + // TODO: u/char + default: + m_tokenizer.Error("array, matrix, vector, or indexable object type expected in index expression"); + return false; } } @@ -3461,69 +3257,56 @@ bool HLSLParser::ParseTerminalExpression(HLSLExpression*& expression, bool& need // Handle function calls. Note, HLSL functions aren't like C function // pointers -- we can only directly call on an identifier, not on an // expression. 
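So a call like "normalize(v)" or "MyFunc(a, b)" parses, while attempting to call through anything that is not a bare identifier fails with the "Expected function identifier" error below.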
- if (Accept('(')) - { + if (Accept('(')) { HLSLFunctionCall* functionCall = m_tree->AddNode(fileName, line); done = false; - if (!ParseExpressionList(')', false, functionCall->argument, functionCall->numArguments)) - { + if (!ParseExpressionList(')', false, functionCall->argument, functionCall->numArguments)) { return false; } - - if (expression->nodeType != HLSLNodeType_IdentifierExpression) - { + + if (expression->nodeType != HLSLNodeType_IdentifierExpression) { m_tokenizer.Error("Expected function identifier"); return false; } - + const HLSLIdentifierExpression* identifierExpression = static_cast(expression); - const HLSLFunction* function = MatchFunctionCall( functionCall, identifierExpression->name ); - if (function == NULL) - { + const HLSLFunction* function = MatchFunctionCall(functionCall, identifierExpression->name); + if (function == NULL) { return false; } - + functionCall->function = function; functionCall->expressionType = function->returnType; expression = functionCall; } - } return true; - } bool HLSLParser::ParseExpressionList(int endToken, bool allowEmptyEnd, HLSLExpression*& firstExpression, int& numExpressions) { numExpressions = 0; HLSLExpression* lastExpression = NULL; - while (!Accept(endToken)) - { - if (CheckForUnexpectedEndOfStream(endToken)) - { + while (!Accept(endToken)) { + if (CheckForUnexpectedEndOfStream(endToken)) { return false; } - if (numExpressions > 0 && !Expect(',')) - { + if (numExpressions > 0 && !Expect(',')) { return false; } // It is acceptable for the final element in the initialization list to // have a trailing comma in some cases, like array initialization such as {1, 2, 3,} - if (allowEmptyEnd && Accept(endToken)) - { + if (allowEmptyEnd && Accept(endToken)) { break; } HLSLExpression* expression = NULL; - if (!ParseExpression(expression)) - { + if (!ParseExpression(expression)) { return false; } - if (firstExpression == NULL) - { + if (firstExpression == NULL) { firstExpression = expression; } - else - { + else { lastExpression->nextExpression = expression; } lastExpression = expression; @@ -3535,64 +3318,65 @@ bool HLSLParser::ParseExpressionList(int endToken, bool allowEmptyEnd, HLSLExpre bool HLSLParser::ParseArgumentList(HLSLArgument*& firstArgument, int& numArguments, int& numOutputArguments) { const char* fileName = GetFileName(); - int line = GetLineNumber(); - + int line = GetLineNumber(); + HLSLArgument* lastArgument = NULL; numArguments = 0; - while (!Accept(')')) - { - if (CheckForUnexpectedEndOfStream(')')) - { + while (!Accept(')')) { + if (CheckForUnexpectedEndOfStream(')')) { return false; } - if (numArguments > 0 && !Expect(',')) - { + if (numArguments > 0 && !Expect(',')) { return false; } - HLSLArgument* argument = m_tree->AddNode(fileName, line); - - // what is unifor modifier ? - if (Accept(HLSLToken_Uniform)) { argument->modifier = HLSLArgumentModifier_Uniform; } - - else if (Accept(HLSLToken_In)) { argument->modifier = HLSLArgumentModifier_In; } - else if (Accept(HLSLToken_Out)) { argument->modifier = HLSLArgumentModifier_Out; } - else if (Accept(HLSLToken_InOut)) { argument->modifier = HLSLArgumentModifier_Inout; } - else if (Accept(HLSLToken_Const)) { argument->modifier = HLSLArgumentModifier_Const; } - - if (!ExpectDeclaration(/*allowUnsizedArray=*/true, argument->type, argument->name)) - { + HLSLArgument* argument = m_tree->AddNode(fileName, line); + + // what is unifor modifier ? 
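
As an aside, ParseExpressionList above follows the usual pattern for comma-separated lists with an optional trailing comma (e.g. {1, 2, 3,}): accept the end token, otherwise require a comma between elements, and tolerate one comma right before the end. The sketch below re-implements that loop over a hypothetical token vector purely for illustration; Accept and ParseExpression are reduced to string comparisons and are not the parser's real API:

// Minimal sketch of the list-parsing pattern used by ParseExpressionList:
// stop on the end token, require ',' between elements, and allow a single
// trailing comma such as {1, 2, 3,}. The token stream here is hypothetical.
#include <cassert>
#include <string>
#include <vector>

static bool ParseList(std::vector<std::string>& tokens, char end,
                      std::vector<std::string>& out)
{
    size_t i = 0;
    auto accept = [&](const std::string& t) {
        if (i < tokens.size() && tokens[i] == t) { ++i; return true; }
        return false;
    };
    while (!accept(std::string(1, end))) {
        if (i >= tokens.size()) return false;            // unexpected end of stream
        if (!out.empty() && !accept(",")) return false;  // elements are comma-separated
        if (accept(std::string(1, end))) break;          // trailing comma before the end
        out.push_back(tokens[i++]);                      // stands in for ParseExpression
    }
    return true;
}

int main()
{
    std::vector<std::string> toks = {"1", ",", "2", ",", "3", ",", "}"};
    std::vector<std::string> out;
    assert(ParseList(toks, '}', out) && out.size() == 3);
}
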
+ if (Accept(HLSLToken_Uniform)) { + argument->modifier = HLSLArgumentModifier_Uniform; + } + + else if (Accept(HLSLToken_In)) { + argument->modifier = HLSLArgumentModifier_In; + } + else if (Accept(HLSLToken_Out)) { + argument->modifier = HLSLArgumentModifier_Out; + } + else if (Accept(HLSLToken_InOut)) { + argument->modifier = HLSLArgumentModifier_Inout; + } + else if (Accept(HLSLToken_Const)) { + argument->modifier = HLSLArgumentModifier_Const; + } + + if (!ExpectDeclaration(/*allowUnsizedArray=*/true, argument->type, argument->name)) { return false; } - DeclareVariable( argument->name, argument->type ); + DeclareVariable(argument->name, argument->type); // Optional semantic. - if (Accept(':') && !ExpectIdentifier(argument->semantic)) - { + if (Accept(':') && !ExpectIdentifier(argument->semantic)) { return false; } - if (Accept('=') && !ParseExpression(argument->defaultValue)) - { + if (Accept('=') && !ParseExpression(argument->defaultValue)) { // @@ Print error! return false; } - if (lastArgument != NULL) - { + if (lastArgument != NULL) { lastArgument->nextArgument = argument; } - else - { + else { firstArgument = argument; } lastArgument = argument; ++numArguments; - if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) - { + if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) { ++numOutputArguments; } } @@ -3810,7 +3594,7 @@ const EffectState* GetEffectState(const char* name, bool isSamplerState, bool is { const EffectState* validStates = effectStates; int count = sizeof(effectStates)/sizeof(effectStates[0]); - + if (isPipeline) { validStates = pipelineStates; @@ -3826,7 +3610,7 @@ const EffectState* GetEffectState(const char* name, bool isSamplerState, bool is // Case insensitive comparison. for (int i = 0; i < count; i++) { - if (String_EqualNoCase(name, validStates[i].name)) + if (String_EqualNoCase(name, validStates[i].name)) { return &validStates[i]; } @@ -3838,12 +3622,12 @@ const EffectState* GetEffectState(const char* name, bool isSamplerState, bool is static const EffectStateValue* GetStateValue(const char* name, const EffectState* state) { // Case insensitive comparison. - for (int i = 0; ; i++) + for (int i = 0; ; i++) { const EffectStateValue & value = state->values[i]; if (value.name == NULL) break; - if (String_EqualNoCase(name, value.name)) + if (String_EqualNoCase(name, value.name)) { return &value; } @@ -3909,7 +3693,7 @@ bool HLSLParser::ParseStateValue(const EffectState * state, HLSLStateAssignment* const bool expectsFloat = state->values == floatValues; const bool expectsBoolean = state->values == booleanValues; - if (!expectsExpression && !expectsInteger && !expectsFloat && !expectsBoolean) + if (!expectsExpression && !expectsInteger && !expectsFloat && !expectsBoolean) { if (m_tokenizer.GetToken() != HLSLToken_Identifier) { @@ -3986,7 +3770,7 @@ bool HLSLParser::ParseStateValue(const EffectState * state, HLSLStateAssignment* return false; } } - else + else { // Expect one of the allowed values. 
const EffectStateValue * stateValue = GetStateValue(m_tokenizer.GetIdentifier(), state); @@ -4042,17 +3826,17 @@ bool HLSLParser::ParseStateAssignment(HLSLStateAssignment*& stateAssignment, boo bool HLSLParser::ParseAttributeList(HLSLAttribute*& firstAttribute) { const char* fileName = GetFileName(); - int line = GetLineNumber(); - - HLSLAttribute * lastAttribute = firstAttribute; + int line = GetLineNumber(); + + HLSLAttribute* lastAttribute = firstAttribute; do { - const char * identifier = NULL; + const char* identifier = NULL; if (!ExpectIdentifier(identifier)) { return false; } - HLSLAttribute * attribute = m_tree->AddNode(fileName, line); - + HLSLAttribute* attribute = m_tree->AddNode(fileName, line); + if (String_Equal(identifier, "unroll")) attribute->attributeType = HLSLAttributeType_Unroll; else if (String_Equal(identifier, "flatten")) @@ -4061,20 +3845,18 @@ bool HLSLParser::ParseAttributeList(HLSLAttribute*& firstAttribute) attribute->attributeType = HLSLAttributeType_Branch; else if (String_Equal(identifier, "nofastmath")) attribute->attributeType = HLSLAttributeType_NoFastMath; - + // @@ parse arguments, () not required if attribute constructor has no arguments. - if (firstAttribute == NULL) - { + if (firstAttribute == NULL) { firstAttribute = attribute; } - else - { + else { lastAttribute->nextAttribute = attribute; } lastAttribute = attribute; - - } while(Accept(',')); + + } while (Accept(',')); return true; } @@ -4089,19 +3871,19 @@ bool HLSLParser::ParseAttributeList(HLSLAttribute*& firstAttribute) // [A(a)] statement; bool HLSLParser::ParseAttributeBlock(HLSLAttribute*& attribute) { - HLSLAttribute ** lastAttribute = &attribute; - while (*lastAttribute != NULL) { lastAttribute = &(*lastAttribute)->nextAttribute; } + HLSLAttribute** lastAttribute = &attribute; + while (*lastAttribute != NULL) { + lastAttribute = &(*lastAttribute)->nextAttribute; + } - if (!Accept('[')) - { + if (!Accept('[')) { return false; } // Parse list of attribute constructors. ParseAttributeList(*lastAttribute); - if (!Expect(']')) - { + if (!Expect(']')) { return false; } @@ -4151,31 +3933,24 @@ bool HLSLParser::ParseStage(HLSLStatement*& statement) } */ - - bool HLSLParser::Parse(HLSLTree* tree, const HLSLParserOptions& options) { m_tree = tree; m_options = options; - + HLSLRoot* root = m_tree->GetRoot(); HLSLStatement* lastStatement = NULL; - while (!Accept(HLSLToken_EndOfStream)) - { + while (!Accept(HLSLToken_EndOfStream)) { HLSLStatement* statement = NULL; - if (!ParseTopLevel(statement)) - { + if (!ParseTopLevel(statement)) { return false; } - if (statement != NULL) - { - if (lastStatement == NULL) - { + if (statement != NULL) { + if (lastStatement == NULL) { root->statement = statement; } - else - { + else { lastStatement->nextStatement = statement; } lastStatement = statement; @@ -4187,23 +3962,19 @@ bool HLSLParser::Parse(HLSLTree* tree, const HLSLParserOptions& options) bool HLSLParser::AcceptTypeModifier(int& flags) { - if (Accept(HLSLToken_Const)) - { + if (Accept(HLSLToken_Const)) { flags |= HLSLTypeFlag_Const; return true; } - else if (Accept(HLSLToken_Static)) - { + else if (Accept(HLSLToken_Static)) { flags |= HLSLTypeFlag_Static; return true; } - else if (Accept(HLSLToken_Uniform)) - { + else if (Accept(HLSLToken_Uniform)) { //flags |= HLSLTypeFlag_Uniform; // @@ Ignored. return true; } - else if (Accept(HLSLToken_Inline)) - { + else if (Accept(HLSLToken_Inline)) { //flags |= HLSLTypeFlag_Uniform; // @@ Ignored. In HLSL all functions are inline. 
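
As an aside, ParseAttributeBlock above finds the end of the attribute list by walking a pointer-to-pointer (HLSLAttribute** lastAttribute), the same intrusive singly linked list idiom the parser uses elsewhere with first/last pointer pairs. A self-contained sketch of that append idiom follows, with a hypothetical Node type standing in for the HLSL AST nodes:

// Sketch of the tail pointer-to-pointer append idiom used in ParseAttributeBlock.
// The Node type here is hypothetical; real nodes chain through nextAttribute,
// nextArgument, nextStatement, and so on.
#include <cassert>

struct Node {
    int value = 0;
    Node* next = nullptr;
};

// Appends 'n' at the end of the list headed by *head, even when *head is null.
static void Append(Node** head, Node* n)
{
    Node** tail = head;
    while (*tail != nullptr)
        tail = &(*tail)->next;
    *tail = n;
}

int main()
{
    Node a{1}, b{2}, c{3};
    Node* list = nullptr;
    Append(&list, &a);
    Append(&list, &b);
    Append(&list, &c);
    assert(list == &a && a.next == &b && b.next == &c && c.next == nullptr);
}
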
return true; } @@ -4224,28 +3995,23 @@ bool HLSLParser::AcceptTypeModifier(int& flags) bool HLSLParser::AcceptInterpolationModifier(int& flags) { - if (Accept("linear")) - { - flags |= HLSLTypeFlag_Linear; + if (Accept("linear")) { + flags |= HLSLTypeFlag_Linear; return true; } - else if (Accept("centroid")) - { + else if (Accept("centroid")) { flags |= HLSLTypeFlag_Centroid; return true; } - else if (Accept("nointerpolation")) - { + else if (Accept("nointerpolation")) { flags |= HLSLTypeFlag_NoInterpolation; return true; } - else if (Accept("noperspective")) - { + else if (Accept("noperspective")) { flags |= HLSLTypeFlag_NoPerspective; return true; } - else if (Accept("sample")) - { + else if (Accept("sample")) { flags |= HLSLTypeFlag_Sample; return true; } @@ -4253,263 +4019,254 @@ bool HLSLParser::AcceptInterpolationModifier(int& flags) return false; } - -bool HLSLParser::AcceptType(bool allowVoid, HLSLType& type/*, bool acceptFlags*/) +bool HLSLParser::AcceptType(bool allowVoid, HLSLType& type /*, bool acceptFlags*/) { //if (type.flags != NULL) { type.flags = 0; - while(AcceptTypeModifier(type.flags) || AcceptInterpolationModifier(type.flags)) {} + while (AcceptTypeModifier(type.flags) || AcceptInterpolationModifier(type.flags)) { + } } int token = m_tokenizer.GetToken(); - if (token == HLSLToken_Comment) - { + if (token == HLSLToken_Comment) { // TODO: should this advance the tokenizer? // m_tokenizer.Next(); - + type.baseType = HLSLBaseType_Comment; return true; } - + // Check built-in types. type.baseType = HLSLBaseType_Void; - switch (token) - { - case HLSLToken_Float: - type.baseType = HLSLBaseType_Float; - break; - case HLSLToken_Float2: - type.baseType = HLSLBaseType_Float2; - break; - case HLSLToken_Float3: - type.baseType = HLSLBaseType_Float3; - break; - case HLSLToken_Float4: - type.baseType = HLSLBaseType_Float4; - break; - - case HLSLToken_Float2x2: - type.baseType = HLSLBaseType_Float2x2; - break; - case HLSLToken_Float3x3: - type.baseType = HLSLBaseType_Float3x3; - break; - case HLSLToken_Float4x4: - type.baseType = HLSLBaseType_Float4x4; - break; - - // The parser is remapping the type here - case HLSLToken_Halfio: - type.baseType = m_options.isHalfio ? HLSLBaseType_Half : HLSLBaseType_Float; - break; - case HLSLToken_Half2io: - type.baseType = m_options.isHalfio ? HLSLBaseType_Half2 : HLSLBaseType_Float2; - break; - case HLSLToken_Half3io: - type.baseType = m_options.isHalfio ? HLSLBaseType_Half3 : HLSLBaseType_Float3; - break; - case HLSLToken_Half4io: - type.baseType = m_options.isHalfio ? HLSLBaseType_Half4 : HLSLBaseType_Float4; - break; - - // The parser is remapping the type here - case HLSLToken_Halfst: - type.baseType = m_options.isHalfst ? HLSLBaseType_Half : HLSLBaseType_Float; - break; - case HLSLToken_Half2st: - type.baseType = m_options.isHalfst ? HLSLBaseType_Half2 : HLSLBaseType_Float2; - break; - case HLSLToken_Half3st: - type.baseType = m_options.isHalfst ? HLSLBaseType_Half3 : HLSLBaseType_Float3; - break; - case HLSLToken_Half4st: - type.baseType = m_options.isHalfst ? 
HLSLBaseType_Half4 : HLSLBaseType_Float4; - break; - - case HLSLToken_Half: - type.baseType = HLSLBaseType_Half; - break; - case HLSLToken_Half2: - type.baseType = HLSLBaseType_Half2; - break; - case HLSLToken_Half3: - type.baseType = HLSLBaseType_Half3; - break; - case HLSLToken_Half4: - type.baseType = HLSLBaseType_Half4; - break; - - case HLSLToken_Half2x2: - type.baseType = HLSLBaseType_Half2x2; - break; - case HLSLToken_Half3x3: - type.baseType = HLSLBaseType_Half3x3; - break; - case HLSLToken_Half4x4: - type.baseType = HLSLBaseType_Half4x4; - break; - - case HLSLToken_Bool: - type.baseType = HLSLBaseType_Bool; - break; - case HLSLToken_Bool2: - type.baseType = HLSLBaseType_Bool2; - break; - case HLSLToken_Bool3: - type.baseType = HLSLBaseType_Bool3; - break; - case HLSLToken_Bool4: - type.baseType = HLSLBaseType_Bool4; - break; - - case HLSLToken_Int: - type.baseType = HLSLBaseType_Int; - break; - case HLSLToken_Int2: - type.baseType = HLSLBaseType_Int2; - break; - case HLSLToken_Int3: - type.baseType = HLSLBaseType_Int3; - break; - case HLSLToken_Int4: - type.baseType = HLSLBaseType_Int4; - break; - - case HLSLToken_Uint: - type.baseType = HLSLBaseType_Uint; - break; - case HLSLToken_Uint2: - type.baseType = HLSLBaseType_Uint2; - break; - case HLSLToken_Uint3: - type.baseType = HLSLBaseType_Uint3; - break; - case HLSLToken_Uint4: - type.baseType = HLSLBaseType_Uint4; - break; - - case HLSLToken_Ushort: - type.baseType = HLSLBaseType_Ushort; - break; - case HLSLToken_Ushort2: - type.baseType = HLSLBaseType_Ushort2; - break; - case HLSLToken_Ushort3: - type.baseType = HLSLBaseType_Ushort3; - break; - case HLSLToken_Ushort4: - type.baseType = HLSLBaseType_Ushort4; - break; - - case HLSLToken_Short: - type.baseType = HLSLBaseType_Short; - break; - case HLSLToken_Short2: - type.baseType = HLSLBaseType_Short2; - break; - case HLSLToken_Short3: - type.baseType = HLSLBaseType_Short3; - break; - case HLSLToken_Short4: - type.baseType = HLSLBaseType_Short4; - break; - - // Textures (TODO: could have baseType be texture, with subtype like buffer) - case HLSLToken_Texture2D: - type.baseType = HLSLBaseType_Texture2D; - break; - case HLSLToken_Texture2DArray: - type.baseType = HLSLBaseType_Texture2DArray; - break; - case HLSLToken_Texture3D: - type.baseType = HLSLBaseType_Texture3D; - break; - case HLSLToken_TextureCube: - type.baseType = HLSLBaseType_TextureCube; - break; - case HLSLToken_Texture2DMS: - type.baseType = HLSLBaseType_Texture2DMS; - break; - case HLSLToken_TextureCubeArray: - type.baseType = HLSLBaseType_TextureCubeArray; - break; - - case HLSLToken_Depth2D: - type.baseType = HLSLBaseType_Depth2D; - break; - case HLSLToken_Depth2DArray: - type.baseType = HLSLBaseType_Depth2DArray; - break; - case HLSLToken_DepthCube: - type.baseType = HLSLBaseType_DepthCube; - break; - - case HLSLToken_RWTexture2D: - type.baseType = HLSLBaseType_RWTexture2D; - break; - - // samplers - case HLSLToken_SamplerState: - type.baseType = HLSLBaseType_SamplerState; - break; - case HLSLToken_SamplerComparisonState: - type.baseType = HLSLBaseType_SamplerComparisonState; - break; - - // older constants - case HLSLToken_CBuffer: - case HLSLToken_TBuffer: - // might make these BufferGlobals? 
- type.baseType = HLSLBaseType_Buffer; - break; - - // SSBO - case HLSLToken_StructuredBuffer: - case HLSLToken_RWStructuredBuffer: - case HLSLToken_ByteAddressBuffer: - case HLSLToken_RWByteAddressBuffer: - case HLSLToken_ConstantBuffer: - type.baseType = HLSLBaseType_Buffer; - break; - } - if (type.baseType != HLSLBaseType_Void) - { + switch (token) { + case HLSLToken_Float: + type.baseType = HLSLBaseType_Float; + break; + case HLSLToken_Float2: + type.baseType = HLSLBaseType_Float2; + break; + case HLSLToken_Float3: + type.baseType = HLSLBaseType_Float3; + break; + case HLSLToken_Float4: + type.baseType = HLSLBaseType_Float4; + break; + + case HLSLToken_Float2x2: + type.baseType = HLSLBaseType_Float2x2; + break; + case HLSLToken_Float3x3: + type.baseType = HLSLBaseType_Float3x3; + break; + case HLSLToken_Float4x4: + type.baseType = HLSLBaseType_Float4x4; + break; + + // The parser is remapping the type here + case HLSLToken_Halfio: + type.baseType = m_options.isHalfio ? HLSLBaseType_Half : HLSLBaseType_Float; + break; + case HLSLToken_Half2io: + type.baseType = m_options.isHalfio ? HLSLBaseType_Half2 : HLSLBaseType_Float2; + break; + case HLSLToken_Half3io: + type.baseType = m_options.isHalfio ? HLSLBaseType_Half3 : HLSLBaseType_Float3; + break; + case HLSLToken_Half4io: + type.baseType = m_options.isHalfio ? HLSLBaseType_Half4 : HLSLBaseType_Float4; + break; + + // The parser is remapping the type here + case HLSLToken_Halfst: + type.baseType = m_options.isHalfst ? HLSLBaseType_Half : HLSLBaseType_Float; + break; + case HLSLToken_Half2st: + type.baseType = m_options.isHalfst ? HLSLBaseType_Half2 : HLSLBaseType_Float2; + break; + case HLSLToken_Half3st: + type.baseType = m_options.isHalfst ? HLSLBaseType_Half3 : HLSLBaseType_Float3; + break; + case HLSLToken_Half4st: + type.baseType = m_options.isHalfst ? 
HLSLBaseType_Half4 : HLSLBaseType_Float4; + break; + + case HLSLToken_Half: + type.baseType = HLSLBaseType_Half; + break; + case HLSLToken_Half2: + type.baseType = HLSLBaseType_Half2; + break; + case HLSLToken_Half3: + type.baseType = HLSLBaseType_Half3; + break; + case HLSLToken_Half4: + type.baseType = HLSLBaseType_Half4; + break; + + case HLSLToken_Half2x2: + type.baseType = HLSLBaseType_Half2x2; + break; + case HLSLToken_Half3x3: + type.baseType = HLSLBaseType_Half3x3; + break; + case HLSLToken_Half4x4: + type.baseType = HLSLBaseType_Half4x4; + break; + + case HLSLToken_Bool: + type.baseType = HLSLBaseType_Bool; + break; + case HLSLToken_Bool2: + type.baseType = HLSLBaseType_Bool2; + break; + case HLSLToken_Bool3: + type.baseType = HLSLBaseType_Bool3; + break; + case HLSLToken_Bool4: + type.baseType = HLSLBaseType_Bool4; + break; + + case HLSLToken_Int: + type.baseType = HLSLBaseType_Int; + break; + case HLSLToken_Int2: + type.baseType = HLSLBaseType_Int2; + break; + case HLSLToken_Int3: + type.baseType = HLSLBaseType_Int3; + break; + case HLSLToken_Int4: + type.baseType = HLSLBaseType_Int4; + break; + + case HLSLToken_Uint: + type.baseType = HLSLBaseType_Uint; + break; + case HLSLToken_Uint2: + type.baseType = HLSLBaseType_Uint2; + break; + case HLSLToken_Uint3: + type.baseType = HLSLBaseType_Uint3; + break; + case HLSLToken_Uint4: + type.baseType = HLSLBaseType_Uint4; + break; + + case HLSLToken_Ushort: + type.baseType = HLSLBaseType_Ushort; + break; + case HLSLToken_Ushort2: + type.baseType = HLSLBaseType_Ushort2; + break; + case HLSLToken_Ushort3: + type.baseType = HLSLBaseType_Ushort3; + break; + case HLSLToken_Ushort4: + type.baseType = HLSLBaseType_Ushort4; + break; + + case HLSLToken_Short: + type.baseType = HLSLBaseType_Short; + break; + case HLSLToken_Short2: + type.baseType = HLSLBaseType_Short2; + break; + case HLSLToken_Short3: + type.baseType = HLSLBaseType_Short3; + break; + case HLSLToken_Short4: + type.baseType = HLSLBaseType_Short4; + break; + + // Textures (TODO: could have baseType be texture, with subtype like buffer) + case HLSLToken_Texture2D: + type.baseType = HLSLBaseType_Texture2D; + break; + case HLSLToken_Texture2DArray: + type.baseType = HLSLBaseType_Texture2DArray; + break; + case HLSLToken_Texture3D: + type.baseType = HLSLBaseType_Texture3D; + break; + case HLSLToken_TextureCube: + type.baseType = HLSLBaseType_TextureCube; + break; + case HLSLToken_Texture2DMS: + type.baseType = HLSLBaseType_Texture2DMS; + break; + case HLSLToken_TextureCubeArray: + type.baseType = HLSLBaseType_TextureCubeArray; + break; + + case HLSLToken_Depth2D: + type.baseType = HLSLBaseType_Depth2D; + break; + case HLSLToken_Depth2DArray: + type.baseType = HLSLBaseType_Depth2DArray; + break; + case HLSLToken_DepthCube: + type.baseType = HLSLBaseType_DepthCube; + break; + + case HLSLToken_RWTexture2D: + type.baseType = HLSLBaseType_RWTexture2D; + break; + + // samplers + case HLSLToken_SamplerState: + type.baseType = HLSLBaseType_SamplerState; + break; + case HLSLToken_SamplerComparisonState: + type.baseType = HLSLBaseType_SamplerComparisonState; + break; + + // older constants + case HLSLToken_CBuffer: + case HLSLToken_TBuffer: + // might make these BufferGlobals? 
+ type.baseType = HLSLBaseType_Buffer; + break; + + // SSBO + case HLSLToken_StructuredBuffer: + case HLSLToken_RWStructuredBuffer: + case HLSLToken_ByteAddressBuffer: + case HLSLToken_RWByteAddressBuffer: + case HLSLToken_ConstantBuffer: + type.baseType = HLSLBaseType_Buffer; + break; + } + if (type.baseType != HLSLBaseType_Void) { m_tokenizer.Next(); - - if (IsTextureType(type.baseType)) - { + + if (IsTextureType(type.baseType)) { // Parse optional sampler type. - if (Accept('<')) - { + if (Accept('<')) { token = m_tokenizer.GetToken(); - + // TODO: need more format types // TODO: double, u/long, and other types - if (token >= HLSLToken_Float && token <= HLSLToken_Float4) - { + if (token >= HLSLToken_Float && token <= HLSLToken_Float4) { // TODO: code only tests if texture formatType exactly matches // when looking for Intrinsics, need to fix that before changing // this. - + type.formatType = HLSLBaseType_Float; // (HLSLBaseType)(HLSLBaseType_Float + (token - HLSLToken_Float)); } - else if (token >= HLSLToken_Half && token <= HLSLToken_Half4) - { - type.formatType = HLSLBaseType_Half; + else if (token >= HLSLToken_Half && token <= HLSLToken_Half4) { + type.formatType = HLSLBaseType_Half; // (HLSLBaseType)(HLSLBaseType_Half + (token - HLSLToken_Half)); } - else - { + else { m_tokenizer.Error("Expected half or float format type on texture."); return false; } m_tokenizer.Next(); - - if (!Expect('>')) - { + + if (!Expect('>')) { return false; } } @@ -4517,18 +4274,15 @@ bool HLSLParser::AcceptType(bool allowVoid, HLSLType& type/*, bool acceptFlags*/ return true; } - if (allowVoid && Accept(HLSLToken_Void)) - { + if (allowVoid && Accept(HLSLToken_Void)) { type.baseType = HLSLBaseType_Void; return true; } - if (token == HLSLToken_Identifier) - { - const char* identifier = m_tree->AddString( m_tokenizer.GetIdentifier() ); - if (FindUserDefinedType(identifier) != NULL) - { + if (token == HLSLToken_Identifier) { + const char* identifier = m_tree->AddString(m_tokenizer.GetIdentifier()); + if (FindUserDefinedType(identifier) != NULL) { m_tokenizer.Next(); - + type.baseType = HLSLBaseType_UserDefined; type.typeName = identifier; return true; @@ -4539,8 +4293,7 @@ bool HLSLParser::AcceptType(bool allowVoid, HLSLType& type/*, bool acceptFlags*/ bool HLSLParser::ExpectType(bool allowVoid, HLSLType& type) { - if (!AcceptType(allowVoid, type)) - { + if (!AcceptType(allowVoid, type)) { m_tokenizer.Error("Expected type"); return false; } @@ -4549,27 +4302,22 @@ bool HLSLParser::ExpectType(bool allowVoid, HLSLType& type) bool HLSLParser::AcceptDeclaration(bool allowUnsizedArray, HLSLType& type, const char*& name) { - if (!AcceptType(/*allowVoid=*/false, type)) - { + if (!AcceptType(/*allowVoid=*/false, type)) { return false; } - if (!ExpectIdentifier(name)) - { + if (!ExpectIdentifier(name)) { // TODO: false means we didn't accept a declaration and we had an error! return false; } // Handle array syntax. - if (Accept('[')) - { + if (Accept('[')) { type.array = true; // Optionally allow no size to the specified for the array. 
- if (Accept(']') && allowUnsizedArray) - { + if (Accept(']') && allowUnsizedArray) { return true; } - if (!ParseExpression(type.arraySize) || !Expect(']')) - { + if (!ParseExpression(type.arraySize) || !Expect(']')) { return false; } } @@ -4578,8 +4326,7 @@ bool HLSLParser::AcceptDeclaration(bool allowUnsizedArray, HLSLType& type, const bool HLSLParser::ExpectDeclaration(bool allowUnsizedArray, HLSLType& type, const char*& name) { - if (!AcceptDeclaration(allowUnsizedArray, type, name)) - { + if (!AcceptDeclaration(allowUnsizedArray, type, name)) { m_tokenizer.Error("Expected declaration"); return false; } @@ -4590,10 +4337,8 @@ const HLSLStruct* HLSLParser::FindUserDefinedType(const char* name) const { // Pointer comparison is sufficient for strings since they exist in the // string pool. - for (int i = 0; i < m_userTypes.GetSize(); ++i) - { - if (m_userTypes[i]->name == name) - { + for (int i = 0; i < m_userTypes.GetSize(); ++i) { + if (m_userTypes[i]->name == name) { return m_userTypes[i]; } } @@ -4602,8 +4347,7 @@ const HLSLStruct* HLSLParser::FindUserDefinedType(const char* name) const bool HLSLParser::CheckForUnexpectedEndOfStream(int endToken) { - if (Accept(HLSLToken_EndOfStream)) - { + if (Accept(HLSLToken_EndOfStream)) { char what[HLSLTokenizer::s_maxIdentifier]; m_tokenizer.GetTokenName(endToken, what); m_tokenizer.Error("Unexpected end of file while looking for '%s'", what); @@ -4619,7 +4363,7 @@ int HLSLParser::GetLineNumber() const const char* HLSLParser::GetFileName() { - return m_tree->AddString( m_tokenizer.GetFileName() ); + return m_tree->AddString(m_tokenizer.GetFileName()); } void HLSLParser::BeginScope() @@ -4632,8 +4376,7 @@ void HLSLParser::BeginScope() void HLSLParser::EndScope() { int numVariables = m_variables.GetSize() - 1; - while (m_variables[numVariables].name != NULL) - { + while (m_variables[numVariables].name != NULL) { --numVariables; ASSERT(numVariables >= 0); } @@ -4642,10 +4385,8 @@ void HLSLParser::EndScope() const HLSLType* HLSLParser::FindVariable(const char* name, bool& global) const { - for (int i = m_variables.GetSize() - 1; i >= 0; --i) - { - if (m_variables[i].name == name) - { + for (int i = m_variables.GetSize() - 1; i >= 0; --i) { + if (m_variables[i].name == name) { global = (i < m_numGlobals); return &m_variables[i].type; } @@ -4656,10 +4397,8 @@ const HLSLType* HLSLParser::FindVariable(const char* name, bool& global) const // This only search user-defined c-style functions. Intrinsics are not in this. 
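
As an aside, the name lookups here (FindUserDefinedType and FindVariable above, and the FindFunction overloads just below) compare names with ==, which is valid only because every identifier was interned through m_tree->AddString, so equal strings share a single pooled pointer. A minimal sketch of that interning contract follows; the StringPool class is hypothetical and only illustrates the behavior the pointer comparisons rely on:

// Minimal sketch of string interning: equal strings come back as the same
// pointer, so later lookups can compare with ==. The pool below is hypothetical;
// the real code interns through m_tree->AddString.
#include <cassert>
#include <cstring>
#include <string>
#include <unordered_set>

class StringPool {
public:
    const char* Add(const char* s)
    {
        return m_strings.insert(std::string(s)).first->c_str();
    }

private:
    std::unordered_set<std::string> m_strings;
};

int main()
{
    StringPool pool;
    const char* a = pool.Add("float4");
    char buf[8];
    std::strcpy(buf, "float4");
    const char* b = pool.Add(buf);  // different source buffer, same pooled pointer
    assert(a == b);                 // pointer equality stands in for strcmp
}
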
const HLSLFunction* HLSLParser::FindFunction(const char* name) const { - for (int i = 0; i < m_functions.GetSize(); ++i) - { - if (m_functions[i]->name == name) - { + for (int i = 0; i < m_functions.GetSize(); ++i) { + if (m_functions[i]->name == name) { return m_functions[i]; } } @@ -4673,8 +4412,7 @@ static bool AreTypesEqual(HLSLTree* tree, const HLSLType& lhs, const HLSLType& r static bool AreArgumentListsEqual(HLSLTree* tree, HLSLArgument* lhs, HLSLArgument* rhs) { - while (lhs && rhs) - { + while (lhs && rhs) { if (!AreTypesEqual(tree, lhs->type, rhs->type)) return false; @@ -4693,12 +4431,10 @@ static bool AreArgumentListsEqual(HLSLTree* tree, HLSLArgument* lhs, HLSLArgumen const HLSLFunction* HLSLParser::FindFunction(const HLSLFunction* fun) const { - for (int i = 0; i < m_functions.GetSize(); ++i) - { + for (int i = 0; i < m_functions.GetSize(); ++i) { if (m_functions[i]->name == fun->name && AreTypesEqual(m_tree, m_functions[i]->returnType, fun->returnType) && - AreArgumentListsEqual(m_tree, m_functions[i]->argument, fun->argument)) - { + AreArgumentListsEqual(m_tree, m_functions[i]->argument, fun->argument)) { return m_functions[i]; } } @@ -4707,8 +4443,7 @@ const HLSLFunction* HLSLParser::FindFunction(const HLSLFunction* fun) const void HLSLParser::DeclareVariable(const char* name, const HLSLType& type) { - if (m_variables.GetSize() == m_numGlobals) - { + if (m_variables.GetSize() == m_numGlobals) { ++m_numGlobals; } Variable& variable = m_variables.PushBackNew(); @@ -4719,15 +4454,13 @@ void HLSLParser::DeclareVariable(const char* name, const HLSLType& type) bool HLSLParser::GetIsFunction(const char* name) const { // check user defined functions - for (int i = 0; i < m_functions.GetSize(); ++i) - { + for (int i = 0; i < m_functions.GetSize(); ++i) { // == is ok here because we're passed the strings through the string pool. - if (m_functions[i]->name == name) - { + if (m_functions[i]->name == name) { return true; } } - + // see if it's an intrinsic const auto& it = _intrinsicRangeMap.find(name); return it != _intrinsicRangeMap.end(); @@ -4735,49 +4468,43 @@ bool HLSLParser::GetIsFunction(const char* name) const const HLSLFunction* HLSLParser::MatchFunctionCall(const HLSLFunctionCall* functionCall, const char* name, const HLSLType* memberType) { - const HLSLFunction* matchedFunction = NULL; + const HLSLFunction* matchedFunction = NULL; //int numArguments = functionCall->numArguments; - int numMatchedOverloads = 0; - bool nameMatches = false; + int numMatchedOverloads = 0; + bool nameMatches = false; // Get the user defined c functions with the specified name. // There may be more than one, and these are not ordered. 
- for (int i = 0; i < m_functions.GetSize(); ++i) - { + for (int i = 0; i < m_functions.GetSize(); ++i) { const HLSLFunction* function = m_functions[i]; - if (function->name == name) - { + if (function->name == name) { nameMatches = true; - + // if caller requests member function, then memberType must match bool isMemberFunc = function->IsMemberFunction(); - - if (memberType) - { + + if (memberType) { if (!isMemberFunc) continue; - + if (memberType->baseType != function->memberType) continue; - + if (memberType->formatType != GetScalarType(function->returnType.baseType)) continue; } - else - { + else { if (isMemberFunc) continue; } - - CompareFunctionsResult result = CompareFunctions( m_tree, functionCall, function, matchedFunction ); - if (result == Function1Better) - { + + CompareFunctionsResult result = CompareFunctions(m_tree, functionCall, function, matchedFunction); + if (result == Function1Better) { matchedFunction = function; numMatchedOverloads = 1; } - else if (result == FunctionsEqual) - { + else if (result == FunctionsEqual) { ++numMatchedOverloads; } } @@ -4785,63 +4512,53 @@ const HLSLFunction* HLSLParser::MatchFunctionCall(const HLSLFunctionCall* functi // Get the intrinsic functions with the specified name. const auto& iter = _intrinsicRangeMap.find(name); - if (iter != _intrinsicRangeMap.end()) - { + if (iter != _intrinsicRangeMap.end()) { Range range = iter->second; - for (int i = 0; i < range.count; ++i) - { + for (int i = 0; i < range.count; ++i) { uint32_t idx = range.start + i; const HLSLFunction* function = &_intrinsics[idx].function; - + // if caller requests member function, then memberType must match bool isMemberFunc = function->IsMemberFunction(); - if (memberType) - { + if (memberType) { if (!isMemberFunc) break; - + if (memberType->baseType != function->memberType) continue; - + if (memberType->formatType != GetScalarType(function->returnType.baseType)) continue; } - else - { + else { if (isMemberFunc) break; } ASSERT(String_Equal(function->name, name)); - + nameMatches = true; - - CompareFunctionsResult result = CompareFunctions( m_tree, functionCall, function, matchedFunction ); - if (result == Function1Better) - { + + CompareFunctionsResult result = CompareFunctions(m_tree, functionCall, function, matchedFunction); + if (result == Function1Better) { matchedFunction = function; numMatchedOverloads = 1; } - else if (result == FunctionsEqual) - { + else if (result == FunctionsEqual) { ++numMatchedOverloads; } } } - - if (matchedFunction != NULL && numMatchedOverloads > 1) - { + + if (matchedFunction != NULL && numMatchedOverloads > 1) { // Multiple overloads match. 
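
As an aside, MatchFunctionCall above resolves overloads by keeping the best candidate seen so far and counting how many candidates tie with it; a tie count above one produces the "overloads have similar conversions" error reported next. The sketch below strips that pattern down to an integer cost, which merely stands in for CompareFunctions and is not the real ranking:

// Stripped-down sketch of the overload-resolution pattern in MatchFunctionCall:
// keep the best candidate so far and count ties; more than one tie means the
// call is ambiguous. The integer "cost" is purely illustrative.
#include <cassert>
#include <vector>

struct Candidate {
    int cost; // lower is a better match for the call
};

// Returns the single best candidate, or nullptr when none matches or the call is ambiguous.
static const Candidate* PickOverload(const std::vector<Candidate>& candidates,
                                     bool& ambiguous)
{
    const Candidate* best = nullptr;
    int matchedOverloads = 0;
    for (const Candidate& c : candidates) {
        if (best == nullptr || c.cost < best->cost) {
            best = &c;              // strictly better: restart the tie count
            matchedOverloads = 1;
        }
        else if (c.cost == best->cost) {
            ++matchedOverloads;     // equally good: remember the tie
        }
    }
    ambiguous = (best != nullptr && matchedOverloads > 1);
    return ambiguous ? nullptr : best;
}

int main()
{
    bool ambiguous = false;
    std::vector<Candidate> unique = {{3}, {1}, {2}};
    assert(PickOverload(unique, ambiguous) != nullptr && !ambiguous);

    std::vector<Candidate> tied = {{2}, {1}, {1}};
    assert(PickOverload(tied, ambiguous) == nullptr && ambiguous);
}
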
m_tokenizer.Error("'%s' %d overloads have similar conversions", name, numMatchedOverloads); return NULL; } - else if (matchedFunction == NULL) - { - if (nameMatches) - { + else if (matchedFunction == NULL) { + if (nameMatches) { m_tokenizer.Error("'%s' no overloaded function matched all of the arguments", name); } - else - { + else { m_tokenizer.Error("Undeclared identifier '%s'", name); } } @@ -4851,29 +4568,26 @@ const HLSLFunction* HLSLParser::MatchFunctionCall(const HLSLFunctionCall* functi inline bool IsSwizzle(char c) { - return c == 'x' || c == 'y' || c == 'z' || c == 'w' || - c == 'r' || c == 'g' || c == 'b' || c == 'a'; + return c == 'x' || c == 'y' || c == 'z' || c == 'w' || + c == 'r' || c == 'g' || c == 'b' || c == 'a'; } -bool HLSLParser::GetMemberType(const HLSLType& objectType, HLSLMemberAccess * memberAccess) +bool HLSLParser::GetMemberType(const HLSLType& objectType, HLSLMemberAccess* memberAccess) { const char* fieldName = memberAccess->field; HLSLBaseType baseType = objectType.baseType; // pull field from struct - if (baseType == HLSLBaseType_UserDefined) - { - const HLSLStruct* structure = FindUserDefinedType( objectType.typeName ); + if (baseType == HLSLBaseType_UserDefined) { + const HLSLStruct* structure = FindUserDefinedType(objectType.typeName); ASSERT(structure != NULL); if (structure == NULL) return false; - + const HLSLStructField* field = structure->field; - while (field != NULL) - { - if (field->name == fieldName) - { + while (field != NULL) { + if (field->name == fieldName) { memberAccess->expressionType = field->type; return true; } @@ -4883,21 +4597,17 @@ bool HLSLParser::GetMemberType(const HLSLType& objectType, HLSLMemberAccess * me return false; } - if (baseTypeDescriptions[objectType.baseType].numericType == NumericType_NaN) - { + if (baseTypeDescriptions[objectType.baseType].numericType == NumericType_NaN) { // Currently we don't have an non-numeric types that allow member access. return false; } int swizzleLength = 0; - if (IsScalarType(baseType) || IsVectorType(baseType)) - { + if (IsScalarType(baseType) || IsVectorType(baseType)) { // Check for a swizzle on the scalar/vector types. - for (int i = 0; fieldName[i] != 0; ++i) - { - if (!IsSwizzle(fieldName[i])) - { + for (int i = 0; fieldName[i] != 0; ++i) { + if (!IsSwizzle(fieldName[i])) { m_tokenizer.Error("Invalid swizzle '%s'", fieldName); return false; } @@ -4907,102 +4617,89 @@ bool HLSLParser::GetMemberType(const HLSLType& objectType, HLSLMemberAccess * me if (swizzleLength == 0) return false; } - else if (IsMatrixType(baseType)) - { - + else if (IsMatrixType(baseType)) { // Check for a matrix element access (e.g. 
_m00 or _11) const char* n = fieldName; - while (n[0] == '_') - { + while (n[0] == '_') { ++n; int base = 1; - if (n[0] == 'm') - { + if (n[0] == 'm') { base = 0; ++n; } - if (!isdigit(n[0]) || !isdigit(n[1])) - { + if (!isdigit(n[0]) || !isdigit(n[1])) { m_tokenizer.Error("Invalid matrix digit"); return false; } int r = (n[0] - '0') - base; int c = (n[1] - '0') - base; - if (r >= baseTypeDescriptions[objectType.baseType].height) - { + if (r >= baseTypeDescriptions[objectType.baseType].height) { m_tokenizer.Error("Invalid matrix dimension %d", r); return false; } - if (c >= baseTypeDescriptions[objectType.baseType].numComponents) - { + if (c >= baseTypeDescriptions[objectType.baseType].numComponents) { m_tokenizer.Error("Invalid matrix dimension %d", c); return false; } ++swizzleLength; n += 2; - } - if (n[0] != 0) - { + if (n[0] != 0) { return false; } - } - else - { + else { return false; } - if (swizzleLength > 4) - { + if (swizzleLength > 4) { m_tokenizer.Error("Invalid swizzle '%s'", fieldName); return false; } - - switch (baseTypeDescriptions[objectType.baseType].numericType) - { - case NumericType_Float: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Float + swizzleLength - 1); - break; - case NumericType_Half: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Half + swizzleLength - 1); - break; - case NumericType_Double: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Double + swizzleLength - 1); - break; - - case NumericType_Int: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Int + swizzleLength - 1); - break; - case NumericType_Uint: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Uint + swizzleLength - 1); + + switch (baseTypeDescriptions[objectType.baseType].numericType) { + case NumericType_Float: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Float + swizzleLength - 1); + break; + case NumericType_Half: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Half + swizzleLength - 1); break; - case NumericType_Bool: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Bool + swizzleLength - 1); + case NumericType_Double: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Double + swizzleLength - 1); break; - case NumericType_Short: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Short + swizzleLength - 1); + + case NumericType_Int: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Int + swizzleLength - 1); + break; + case NumericType_Uint: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Uint + swizzleLength - 1); + break; + case NumericType_Bool: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Bool + swizzleLength - 1); + break; + case NumericType_Short: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Short + swizzleLength - 1); break; - case NumericType_Ushort: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Ushort + swizzleLength - 1); + case NumericType_Ushort: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Ushort + swizzleLength - 1); break; - case NumericType_Long: - memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Long + swizzleLength - 1); + case NumericType_Long: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Long + swizzleLength - 1); break; - case NumericType_Ulong: - 
memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Ulong + swizzleLength - 1); + case NumericType_Ulong: + memberAccess->expressionType.baseType = (HLSLBaseType)(HLSLBaseType_Ulong + swizzleLength - 1); break; - // TODO: u/char - default: - ASSERT(false); + // TODO: u/char + default: + ASSERT(false); } memberAccess->swizzle = true; - + return true; } -} +} //namespace M4 diff --git a/hlslparser/src/HLSLParser.h b/hlslparser/src/HLSLParser.h index 1a09da60..f963babf 100644 --- a/hlslparser/src/HLSLParser.h +++ b/hlslparser/src/HLSLParser.h @@ -10,35 +10,28 @@ #pragma once #include "Engine.h" - #include "HLSLTokenizer.h" #include "HLSLTree.h" -namespace M4 -{ +namespace M4 { struct EffectState; // This wouldn't be needed if could preprocess prior to calling parser. -struct HLSLParserOptions -{ +struct HLSLParserOptions { bool isHalfst = false; - + bool isHalfio = false; }; -class HLSLParser -{ - +class HLSLParser { public: - HLSLParser(Allocator* allocator, const char* fileName, const char* buffer, size_t length); void SetKeepComments(bool enable) { m_tokenizer.SetKeepComments(enable); } - + bool Parse(HLSLTree* tree, const HLSLParserOptions& options = HLSLParserOptions()); private: - bool Accept(int token); bool Expect(int token); @@ -53,14 +46,14 @@ class HLSLParser bool AcceptIdentifier(const char*& identifier); bool ExpectIdentifier(const char*& identifier); bool AcceptFloat(float& value); - bool AcceptHalf( float& value ); + bool AcceptHalf(float& value); bool AcceptInt(int& value); bool AcceptType(bool allowVoid, HLSLType& type); bool ExpectType(bool allowVoid, HLSLType& type); bool AcceptBinaryOperator(int priority, HLSLBinaryOp& binaryOp); bool AcceptUnaryOperator(bool pre, HLSLUnaryOp& unaryOp); bool AcceptAssign(HLSLBinaryOp& binaryOp); - bool AcceptTypeModifier(int & typeFlags); + bool AcceptTypeModifier(int& typeFlags); bool AcceptInterpolationModifier(int& flags); /** @@ -85,18 +78,18 @@ class HLSLParser bool ParseDeclarationAssignment(HLSLDeclaration* declaration); bool ParsePartialConstructor(HLSLExpression*& expression, HLSLBaseType type, const char* typeName); - bool ParseStateName(bool isSamplerState, bool isPipelineState, const char*& name, const EffectState *& state); + bool ParseStateName(bool isSamplerState, bool isPipelineState, const char*& name, const EffectState*& state); bool ParseColorMask(int& mask); - -// FX file -// bool ParseStateValue(const EffectState * state, HLSLStateAssignment* stateAssignment); -// bool ParseStateAssignment(HLSLStateAssignment*& stateAssignment, bool isSamplerState, bool isPipelineState); -// bool ParseSamplerState(HLSLExpression*& expression); -// bool ParseTechnique(HLSLStatement*& statement); -// bool ParsePass(HLSLPass*& pass); -// bool ParsePipeline(HLSLStatement*& pipeline); -// bool ParseStage(HLSLStatement*& stage); - + + // FX file + // bool ParseStateValue(const EffectState * state, HLSLStateAssignment* stateAssignment); + // bool ParseStateAssignment(HLSLStateAssignment*& stateAssignment, bool isSamplerState, bool isPipelineState); + // bool ParseSamplerState(HLSLExpression*& expression); + // bool ParseTechnique(HLSLStatement*& statement); + // bool ParsePass(HLSLPass*& pass); + // bool ParsePipeline(HLSLStatement*& pipeline); + // bool ParseStage(HLSLStatement*& stage); + bool ParseComment(HLSLStatement*& statement); bool ParseAttributeList(HLSLAttribute*& attribute); @@ -111,20 +104,20 @@ class HLSLParser void DeclareVariable(const char* name, const HLSLType& type); - /// Returned pointer is only valid 
until Declare or Begin/EndScope is called. + /// Returned pointer is only valid until Declare or Begin/EndScope is called. const HLSLType* FindVariable(const char* name, bool& global) const; const HLSLFunction* FindFunction(const char* name) const; const HLSLFunction* FindFunction(const HLSLFunction* fun) const; bool GetIsFunction(const char* name) const; - + /// Finds the overloaded function that matches the specified call. /// Pass memberType to match member functions. const HLSLFunction* MatchFunctionCall(const HLSLFunctionCall* functionCall, const char* name, const HLSLType* memberType = NULL); /// Gets the type of the named field on the specified object type (fieldName can also specify a swizzle. ) - bool GetMemberType(const HLSLType& objectType, HLSLMemberAccess * memberAccess); + bool GetMemberType(const HLSLType& objectType, HLSLMemberAccess* memberAccess); bool CheckTypeCast(const HLSLType& srcType, const HLSLType& dstType); @@ -132,33 +125,30 @@ class HLSLParser int GetLineNumber() const; private: - - struct Variable - { - const char* name; - HLSLType type; + struct Variable { + const char* name; + HLSLType type; }; - HLSLTokenizer m_tokenizer; - Array m_userTypes; - Array m_variables; - Array m_functions; - int m_numGlobals; - - HLSLTree* m_tree; - - bool m_allowUndeclaredIdentifiers = false; - bool m_disableSemanticValidation = false; - - HLSLParserOptions m_options; + HLSLTokenizer m_tokenizer; + Array m_userTypes; + Array m_variables; + Array m_functions; + int m_numGlobals; + + HLSLTree* m_tree; + + bool m_allowUndeclaredIdentifiers = false; + bool m_disableSemanticValidation = false; + + HLSLParserOptions m_options; }; -enum NumericType -{ +enum NumericType { NumericType_Float, NumericType_Half, NumericType_Double, // not in MSL - + NumericType_Bool, NumericType_Int, NumericType_Uint, @@ -166,13 +156,13 @@ enum NumericType NumericType_Ushort, NumericType_Ulong, NumericType_Long, - + // TODO: HLSL doesn't have byte/ubyte, MSL does // NumericType_UByte, // NumericType_Byte, - + NumericType_Count, - + NumericType_NaN, // not in count? }; @@ -226,4 +216,4 @@ HLSLBaseType GetScalarType(HLSLBaseType type); // returns 1 for scalar or 2/3/4 for vector types. int32_t GetVectorDimension(HLSLBaseType type); -} +} //namespace M4 diff --git a/hlslparser/src/HLSLTokenizer.cpp b/hlslparser/src/HLSLTokenizer.cpp index 965fb6ea..c8d89983 100644 --- a/hlslparser/src/HLSLTokenizer.cpp +++ b/hlslparser/src/HLSLTokenizer.cpp @@ -1,170 +1,177 @@ #include "HLSLTokenizer.h" -#include "Engine.h" - #include +#include #include #include #include -#include -namespace M4 -{ +#include "Engine.h" + +namespace M4 { // The order here must match the order in the Token enum. 
static const char* _reservedWords[] = -{ - "float", - "float2", - "float3", - "float4", - "float2x2", - "float3x3", - "float4x4", - - // for Nvidia/Adreno - "halfio", - "half2io", - "half3io", - "half4io", - - // for Android - "halfst", - "half2st", - "half3st", - "half4st", - - "half", - "half2", - "half3", - "half4", - "half2x2", - "half3x3", - "half4x4", - - "double", - "double2", - "double3", - "double4", - "double2x2", - "double3x3", - "double4x4", - - "bool", - "bool2", - "bool3", - "bool4", - - "int", - "int2", - "int3", - "int4", - - "uint", - "uint2", - "uint3", - "uint4", - - "short", - "short2", - "short3", - "short4", - - "ushort", - "ushort2", - "ushort3", - "ushort4", - - "long", - "long2", - "long3", - "long4", - - "ulong", - "ulong2", - "ulong3", - "ulong4", - - // TODO: u/char - - "Texture2D", - "Texture3D", - "TextureCube", - "Texture2DArray", - "TextureCubeArray", - "Texture2DMS", - - "Depth2D", - "Depth2DArray", // cascades - "DepthCube", - - "RWTexture2D", - - "SamplerState", - "SamplerComparisonState", - - "if", - "else", - "for", - "while", - "break", - "true", - "false", - "void", - "struct", - - // DX9 buffer types (tons of globals) - "cbuffer", - "tbuffer", - - // DX10 buffer templated types - "ConstantBuffer", // indexable cbuffer - "StructuredBuffer", - "RWStructuredBuffer", - "ByteAddressBuffer", - "RWByteAddressBuffer", - - "register", - "return", - "continue", - "discard", - - "const", - "static", - "inline", - - "uniform", - "in", - "out", - "inout", - - "#include", - - // these are from fx file - //"sampler_state", - //"technique", - //"pass", + { + "float", + "float2", + "float3", + "float4", + "float2x2", + "float3x3", + "float4x4", + + // for Nvidia/Adreno + "halfio", + "half2io", + "half3io", + "half4io", + + // for Android + "halfst", + "half2st", + "half3st", + "half4st", + + "half", + "half2", + "half3", + "half4", + "half2x2", + "half3x3", + "half4x4", + + "double", + "double2", + "double3", + "double4", + "double2x2", + "double3x3", + "double4x4", + + "bool", + "bool2", + "bool3", + "bool4", + + "int", + "int2", + "int3", + "int4", + + "uint", + "uint2", + "uint3", + "uint4", + + "short", + "short2", + "short3", + "short4", + + "ushort", + "ushort2", + "ushort3", + "ushort4", + + "long", + "long2", + "long3", + "long4", + + "ulong", + "ulong2", + "ulong3", + "ulong4", + + // TODO: u/char + + "Texture2D", + "Texture3D", + "TextureCube", + "Texture2DArray", + "TextureCubeArray", + "Texture2DMS", + + "Depth2D", + "Depth2DArray", // cascades + "DepthCube", + + "RWTexture2D", + + "SamplerState", + "SamplerComparisonState", + + "if", + "else", + "for", + "while", + "break", + "true", + "false", + "void", + "struct", + + // DX9 buffer types (tons of globals) + "cbuffer", + "tbuffer", + + // DX10 buffer templated types + "ConstantBuffer", // indexable cbuffer + "StructuredBuffer", + "RWStructuredBuffer", + "ByteAddressBuffer", + "RWByteAddressBuffer", + + "register", + "return", + "continue", + "discard", + + "const", + "static", + "inline", + + "uniform", + "in", + "out", + "inout", + + "#include", + + // these are from fx file + //"sampler_state", + //"technique", + //"pass", }; static bool GetIsSymbol(char c) { - switch (c) - { - case ';': - case ':': - case '(': case ')': - case '[': case ']': - case '{': case '}': - case '-': case '+': - case '*': case '/': - case '?': - case '!': - case ',': - case '=': - case '.': - case '<': case '>': - case '|': case '&': case '^': case '~': - case '@': - return true; + switch (c) { + case ';': + case ':': + case 
'(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '-': + case '+': + case '*': + case '/': + case '?': + case '!': + case ',': + case '=': + case '.': + case '<': + case '>': + case '|': + case '&': + case '^': + case '~': + case '@': + return true; } return false; } @@ -177,31 +184,28 @@ static bool GetIsNumberSeparator(char c) HLSLTokenizer::HLSLTokenizer(const char* fileName, const char* buffer, size_t length) { - m_buffer = buffer; - m_bufferEnd = buffer + length; - m_fileName = fileName; - m_lineNumber = 1; - m_tokenLineNumber = 1; - m_error = false; + m_buffer = buffer; + m_bufferEnd = buffer + length; + m_fileName = fileName; + m_lineNumber = 1; + m_tokenLineNumber = 1; + m_error = false; Next(); } void HLSLTokenizer::Next() { - while(SkipWhitespace() || SkipComment() || ScanLineDirective() || SkipPragmaDirective() || SkipInclude()) - { + while (SkipWhitespace() || SkipComment() || ScanLineDirective() || SkipPragmaDirective() || SkipInclude()) { } - if (m_error) - { + if (m_error) { m_token = HLSLToken_EndOfStream; return; } m_tokenLineNumber = m_lineNumber; - if (m_buffer >= m_bufferEnd || *m_buffer == '\0') - { + if (m_buffer >= m_bufferEnd || *m_buffer == '\0') { m_token = HLSLToken_EndOfStream; return; } @@ -209,168 +213,144 @@ void HLSLTokenizer::Next() const char* start = m_buffer; // single line comments - if (m_keepComments && (m_buffer[0] == '/' && m_buffer[1] == '/')) - { + if (m_keepComments && (m_buffer[0] == '/' && m_buffer[1] == '/')) { m_token = HLSLToken_Comment; m_buffer += 2; - + m_comment[0] = 0; - + // How to count the remaining string as tokens of the comment // typically expecting a single string, not a sequence of strings. - + // skip the newline too, but would need to increment lineNumber uint32_t commentLen = 0; - while (m_buffer < m_bufferEnd) - { - if (*(m_buffer) == '\n') - { + while (m_buffer < m_bufferEnd) { + if (*(m_buffer) == '\n') { m_buffer++; m_lineNumber++; break; } - + // store comment to temporary string if (commentLen < (s_maxComment - 1)) m_comment[commentLen++] = *m_buffer; - + m_buffer++; } - + m_comment[commentLen] = 0; - + return; } - + // +=, -=, *=, /=, ==, <=, >= - if (m_buffer[0] == '+' && m_buffer[1] == '=') - { + if (m_buffer[0] == '+' && m_buffer[1] == '=') { m_token = HLSLToken_PlusEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '-' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '-' && m_buffer[1] == '=') { m_token = HLSLToken_MinusEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '*' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '*' && m_buffer[1] == '=') { m_token = HLSLToken_TimesEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '/' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '/' && m_buffer[1] == '=') { m_token = HLSLToken_DivideEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '=' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '=' && m_buffer[1] == '=') { m_token = HLSLToken_EqualEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '!' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '!' 
&& m_buffer[1] == '=') { m_token = HLSLToken_NotEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '<' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '<' && m_buffer[1] == '=') { m_token = HLSLToken_LessEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '>' && m_buffer[1] == '=') - { + else if (m_buffer[0] == '>' && m_buffer[1] == '=') { m_token = HLSLToken_GreaterEqual; m_buffer += 2; return; } - else if (m_buffer[0] == '&' && m_buffer[1] == '&') - { + else if (m_buffer[0] == '&' && m_buffer[1] == '&') { m_token = HLSLToken_LogicalAnd; m_buffer += 2; return; } - else if (m_buffer[0] == '|' && m_buffer[1] == '|') - { + else if (m_buffer[0] == '|' && m_buffer[1] == '|') { m_token = HLSLToken_LogicalOr; m_buffer += 2; return; } // ++, -- - if ((m_buffer[0] == '-' && m_buffer[1] == '-')) - { + if ((m_buffer[0] == '-' && m_buffer[1] == '-')) { m_token = HLSLToken_MinusMinus; m_buffer += 2; return; } - if ((m_buffer[0] == '+' && m_buffer[1] == '+')) - { + if ((m_buffer[0] == '+' && m_buffer[1] == '+')) { m_token = HLSLToken_PlusPlus; m_buffer += 2; return; } - + // Check for the start of a number. - if (ScanNumber()) - { + if (ScanNumber()) { return; } - - if (GetIsSymbol(m_buffer[0])) - { + + if (GetIsSymbol(m_buffer[0])) { m_token = static_cast(m_buffer[0]); ++m_buffer; return; } // Must be an identifier or a reserved word. - while (m_buffer < m_bufferEnd && m_buffer[0] != 0 && !GetIsSymbol(m_buffer[0]) && !isspace(m_buffer[0])) - { + while (m_buffer < m_bufferEnd && m_buffer[0] != 0 && !GetIsSymbol(m_buffer[0]) && !isspace(m_buffer[0])) { ++m_buffer; } size_t length = m_buffer - start; memcpy(m_identifier, start, length); m_identifier[length] = 0; - + const int numReservedWords = sizeof(_reservedWords) / sizeof(const char*); - for (int i = 0; i < numReservedWords; ++i) - { + for (int i = 0; i < numReservedWords; ++i) { // TODO: remove O(N) search of strings, need unordered_map - if (String_Equal(_reservedWords[i], m_identifier)) - { + if (String_Equal(_reservedWords[i], m_identifier)) { m_token = 256 + i; return; } } m_token = HLSLToken_Identifier; - } bool HLSLTokenizer::SkipInclude() { bool result = false; - + static const char* keyword = "#include"; static uint32_t keywordLen = (uint32_t)strlen(keyword); - - if( strncmp( m_buffer, keyword, keywordLen ) == 0 && isspace( m_buffer[ keywordLen ] ) ) - { + + if (strncmp(m_buffer, keyword, keywordLen) == 0 && isspace(m_buffer[keywordLen])) { m_buffer += keywordLen; result = true; - while( m_buffer < m_bufferEnd ) - { - if( *( m_buffer++ ) == '\n' ) - { + while (m_buffer < m_bufferEnd) { + if (*(m_buffer++) == '\n') { ++m_lineNumber; break; } @@ -379,15 +359,12 @@ bool HLSLTokenizer::SkipInclude() return result; } - bool HLSLTokenizer::SkipWhitespace() { bool result = false; - while (m_buffer < m_bufferEnd && isspace(m_buffer[0])) - { + while (m_buffer < m_bufferEnd && isspace(m_buffer[0])) { result = true; - if (m_buffer[0] == '\n') - { + if (m_buffer[0] == '\n') { ++m_lineNumber; } ++m_buffer; @@ -398,41 +375,32 @@ bool HLSLTokenizer::SkipWhitespace() bool HLSLTokenizer::SkipComment() { bool result = false; - if (m_buffer[0] == '/') - { - if ((!m_keepComments) && m_buffer[1] == '/') - { + if (m_buffer[0] == '/') { + if ((!m_keepComments) && m_buffer[1] == '/') { // Single line comment. 
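
As an aside, identifier tokens and reserved words share one code path in the tokenizer: the identifier scan above collects a word, then a linear search over _reservedWords maps word i to token 256 + i (and GetTokenName later reverses this with _reservedWords[token - 256], which is why the table order must match the Token enum). A tiny sketch of that two-way mapping follows; the abbreviated word table and token base are illustrative only:

// Sketch of the reserved-word <-> token mapping used by the tokenizer:
// word i maps to token 256 + i, and token - 256 indexes back into the table.
// Only a few words are listed here; the real table is much longer.
#include <cassert>
#include <cstring>

static const char* kReservedWords[] = {"float", "float2", "float3", "float4"};
static const int kFirstReservedToken = 256; // single-character tokens use their ASCII value

static int TokenForIdentifier(const char* id)
{
    const int n = sizeof(kReservedWords) / sizeof(kReservedWords[0]);
    for (int i = 0; i < n; ++i) {
        if (std::strcmp(kReservedWords[i], id) == 0)
            return kFirstReservedToken + i;
    }
    return -1; // would fall back to HLSLToken_Identifier in the real tokenizer
}

int main()
{
    int token = TokenForIdentifier("float3");
    assert(token == kFirstReservedToken + 2);
    assert(std::strcmp(kReservedWords[token - kFirstReservedToken], "float3") == 0);
}
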
result = true; m_buffer += 2; - while (m_buffer < m_bufferEnd) - { - if (*(m_buffer++) == '\n') - { + while (m_buffer < m_bufferEnd) { + if (*(m_buffer++) == '\n') { ++m_lineNumber; break; } } } - else if (m_buffer[1] == '*') - { + else if (m_buffer[1] == '*') { // Multi-line comment. result = true; m_buffer += 2; - while (m_buffer < m_bufferEnd) - { - if (m_buffer[0] == '\n') - { + while (m_buffer < m_bufferEnd) { + if (m_buffer[0] == '\n') { ++m_lineNumber; } - if (m_buffer[0] == '*' && m_buffer[1] == '/') - { + if (m_buffer[0] == '*' && m_buffer[1] == '/') { break; } ++m_buffer; } - if (m_buffer < m_bufferEnd) - { + if (m_buffer < m_bufferEnd) { m_buffer += 2; } } @@ -442,46 +410,39 @@ bool HLSLTokenizer::SkipComment() bool HLSLTokenizer::SkipPragmaDirective() { - bool result = false; - + bool result = false; + static const char* keyword = "#include"; static uint32_t keywordLen = (uint32_t)strlen(keyword); - if( strncmp( m_buffer, keyword, keywordLen ) == 0 && isspace( m_buffer[ keywordLen ] ) ) - { + if (strncmp(m_buffer, keyword, keywordLen) == 0 && isspace(m_buffer[keywordLen])) { m_buffer += keywordLen; result = true; - while( m_buffer < m_bufferEnd ) - { - if( *( m_buffer++ ) == '\n' ) - { + while (m_buffer < m_bufferEnd) { + if (*(m_buffer++) == '\n') { ++m_lineNumber; break; } } } - return result; + return result; } bool HLSLTokenizer::ScanNumber() { - // Don't treat the + or - as part of the number. - if (m_buffer[0] == '+' || m_buffer[0] == '-') - { + if (m_buffer[0] == '+' || m_buffer[0] == '-') { return false; } // Parse hex literals. - if (m_bufferEnd - m_buffer > 2 && m_buffer[0] == '0' && m_buffer[1] == 'x') - { - char* hEnd = NULL; - int iValue = (int)String_ToIntHex(m_buffer+2, &hEnd); - if (GetIsNumberSeparator(hEnd[0])) - { + if (m_bufferEnd - m_buffer > 2 && m_buffer[0] == '0' && m_buffer[1] == 'x') { + char* hEnd = NULL; + int iValue = (int)String_ToIntHex(m_buffer + 2, &hEnd); + if (GetIsNumberSeparator(hEnd[0])) { m_buffer = hEnd; - m_token = HLSLToken_IntLiteral; // TODO: handle uint, etc. + m_token = HLSLToken_IntLiteral; // TODO: handle uint, etc. m_iValue = iValue; return true; } @@ -490,37 +451,33 @@ bool HLSLTokenizer::ScanNumber() char* fEnd = NULL; double fValue = String_ToDouble(m_buffer, &fEnd); - if (fEnd == m_buffer) - { + if (fEnd == m_buffer) { return false; } - char* iEnd = NULL; - int iValue = String_ToInt(m_buffer, &iEnd); + char* iEnd = NULL; + int iValue = String_ToInt(m_buffer, &iEnd); // TODO: handle lf, etc. Double not really worth adding, since it's // so hobbled. - + // If the character after the number is an f then the f is treated as part // of the number (to handle 1.0f syntax). bool isHalf = false; - if( ( fEnd[ 0 ] == 'f' || fEnd[ 0 ] == 'h' ) && fEnd < m_bufferEnd ) - { - isHalf = fEnd[ 0 ] == 'h'; + if ((fEnd[0] == 'f' || fEnd[0] == 'h') && fEnd < m_bufferEnd) { + isHalf = fEnd[0] == 'h'; ++fEnd; - } + } - if( fEnd > iEnd && GetIsNumberSeparator( fEnd[ 0 ] ) ) - { - m_buffer = fEnd; - m_token = ( isHalf || fEnd[ 0 ] == 'h' ) ? HLSLToken_HalfLiteral : HLSLToken_FloatLiteral; + if (fEnd > iEnd && GetIsNumberSeparator(fEnd[0])) { + m_buffer = fEnd; + m_token = (isHalf || fEnd[0] == 'h') ? 
HLSLToken_HalfLiteral : HLSLToken_FloatLiteral; m_fValue = static_cast(fValue); return true; } - else if (iEnd > m_buffer && GetIsNumberSeparator(iEnd[0])) - { + else if (iEnd > m_buffer && GetIsNumberSeparator(iEnd[0])) { m_buffer = iEnd; - m_token = HLSLToken_IntLiteral; // TODO: uint/short/ushort + m_token = HLSLToken_IntLiteral; // TODO: uint/short/ushort m_iValue = iValue; return true; } @@ -532,15 +489,12 @@ bool HLSLTokenizer::ScanLineDirective() { static const char* keyword = "#line"; static uint32_t keywordLen = (uint32_t)strlen(keyword); - - if (strncmp(m_buffer, keyword, keywordLen) == 0 && isspace(m_buffer[keywordLen])) - { + + if (strncmp(m_buffer, keyword, keywordLen) == 0 && isspace(m_buffer[keywordLen])) { m_buffer += keywordLen; - - while (m_buffer < m_bufferEnd && isspace(m_buffer[0])) - { - if (m_buffer[0] == '\n') - { + + while (m_buffer < m_bufferEnd && isspace(m_buffer[0])) { + if (m_buffer[0] == '\n') { Error("Syntax error: expected line number after #line"); return false; } @@ -550,43 +504,36 @@ bool HLSLTokenizer::ScanLineDirective() char* iEnd = NULL; int lineNumber = String_ToInt(m_buffer, &iEnd); - if (!isspace(*iEnd)) - { + if (!isspace(*iEnd)) { Error("Syntax error: expected line number after #line"); return false; } m_buffer = iEnd; - while (m_buffer < m_bufferEnd && isspace(m_buffer[0])) - { + while (m_buffer < m_bufferEnd && isspace(m_buffer[0])) { char c = m_buffer[0]; ++m_buffer; - if (c == '\n') - { + if (c == '\n') { m_lineNumber = lineNumber; return true; } } - if (m_buffer >= m_bufferEnd) - { + if (m_buffer >= m_bufferEnd) { m_lineNumber = lineNumber; return true; } - if (m_buffer[0] != '"') - { + if (m_buffer[0] != '"') { Error("Syntax error: expected '\"' after line number near #line"); return false; } - + ++m_buffer; - + int i = 0; - while (i + 1 < s_maxIdentifier && m_buffer < m_bufferEnd && m_buffer[0] != '"') - { - if (m_buffer[0] == '\n') - { + while (i + 1 < s_maxIdentifier && m_buffer < m_bufferEnd && m_buffer[0] != '"') { + if (m_buffer[0] == '\n') { Error("Syntax error: expected '\"' before end of line near #line"); return false; } @@ -595,28 +542,24 @@ bool HLSLTokenizer::ScanLineDirective() ++m_buffer; ++i; } - + m_lineDirectiveFileName[i] = 0; - - if (m_buffer >= m_bufferEnd) - { + + if (m_buffer >= m_bufferEnd) { Error("Syntax error: expected '\"' before end of file near #line"); return false; } - if (i + 1 >= s_maxIdentifier) - { + if (i + 1 >= s_maxIdentifier) { Error("Syntax error: file name too long near #line"); return false; } // Skip the closing quote ++m_buffer; - - while (m_buffer < m_bufferEnd && m_buffer[0] != '\n') - { - if (!isspace(m_buffer[0])) - { + + while (m_buffer < m_bufferEnd && m_buffer[0] != '\n') { + if (!isspace(m_buffer[0])) { Error("Syntax error: unexpected input after file name near #line"); return false; } @@ -630,11 +573,9 @@ bool HLSLTokenizer::ScanLineDirective() m_fileName = m_lineDirectiveFileName; return true; - } return false; - } int HLSLTokenizer::GetToken() const @@ -677,12 +618,11 @@ void HLSLTokenizer::Error(const char* format, ...) // It's not always convenient to stop executing when an error occurs, // so just track once we've hit an error and stop reporting them until // we successfully bail out of execution. - if (m_error) - { + if (m_error) { return; } m_error = true; - + va_list args; va_start(args, format); Log_ErrorArgList(format, args, m_fileName, m_lineNumber); @@ -694,27 +634,23 @@ void HLSLTokenizer::Error(const char* format, ...) 
// Gcc/clang convention (must be absolute filename for clickthrough) // Visual Studio can pick up on this formatting too //Log_Error("%s:%d: %s: %s\n", m_fileName, m_lineNumber, isError ? "error" : "warning", buffer); -} +} void HLSLTokenizer::GetTokenName(char buffer[s_maxIdentifier]) const { - if (m_token == HLSLToken_FloatLiteral || m_token == HLSLToken_HalfLiteral ) - { + if (m_token == HLSLToken_FloatLiteral || m_token == HLSLToken_HalfLiteral) { snprintf(buffer, s_maxIdentifier, "%f", m_fValue); - + String_StripTrailingFloatZeroes(buffer); } - else if (m_token == HLSLToken_IntLiteral) - { + else if (m_token == HLSLToken_IntLiteral) { snprintf(buffer, s_maxIdentifier, "%d", m_iValue); } // TODO: short/ushort/uint - else if (m_token == HLSLToken_Identifier) - { + else if (m_token == HLSLToken_Identifier) { String_Copy(buffer, m_identifier, s_maxIdentifier); } - else - { + else { GetTokenName(m_token, buffer); } } @@ -722,89 +658,84 @@ void HLSLTokenizer::GetTokenName(char buffer[s_maxIdentifier]) const void HLSLTokenizer::GetTokenName(int token, char buffer[s_maxIdentifier]) { // ascii - if (token < 256) - { + if (token < 256) { buffer[0] = (char)token; buffer[1] = 0; } - else if (token < HLSLToken_LessEqual) - { + else if (token < HLSLToken_LessEqual) { strcpy(buffer, _reservedWords[token - 256]); } - else - { - switch (token) - { - case HLSLToken_PlusPlus: - strcpy(buffer, "++"); - break; - case HLSLToken_MinusMinus: - strcpy(buffer, "--"); - break; - - case HLSLToken_PlusEqual: - strcpy(buffer, "+="); - break; - case HLSLToken_MinusEqual: - strcpy(buffer, "-="); - break; - case HLSLToken_TimesEqual: - strcpy(buffer, "*="); - break; - case HLSLToken_DivideEqual: - strcpy(buffer, "/="); - break; - - // DONE: Missing several token types - case HLSLToken_LessEqual: - strcpy(buffer, "<="); - break; - case HLSLToken_GreaterEqual: - strcpy(buffer, ">="); - break; - case HLSLToken_EqualEqual: - strcpy(buffer, "=="); - break; - case HLSLToken_NotEqual: - strcpy(buffer, "!="); - break; - - case HLSLToken_LogicalAnd: - strcpy(buffer, "&&"); - break; - case HLSLToken_LogicalOr: - strcpy(buffer, "||"); - break; - - // literals - case HLSLToken_HalfLiteral: - strcpy( buffer, "half" ); - break; - case HLSLToken_FloatLiteral: - strcpy(buffer, "float"); - break; - case HLSLToken_IntLiteral: - strcpy(buffer, "int"); - break; - // TODO: need uint, short, ushort - - case HLSLToken_Identifier: - strcpy(buffer, "identifier"); - break; - case HLSLToken_EndOfStream: - strcpy(buffer, ""); - break; - - case HLSLToken_Comment: - strcpy(buffer, "comment"); - break; - - default: - strcpy(buffer, "unknown"); - break; + else { + switch (token) { + case HLSLToken_PlusPlus: + strcpy(buffer, "++"); + break; + case HLSLToken_MinusMinus: + strcpy(buffer, "--"); + break; + + case HLSLToken_PlusEqual: + strcpy(buffer, "+="); + break; + case HLSLToken_MinusEqual: + strcpy(buffer, "-="); + break; + case HLSLToken_TimesEqual: + strcpy(buffer, "*="); + break; + case HLSLToken_DivideEqual: + strcpy(buffer, "/="); + break; + + // DONE: Missing several token types + case HLSLToken_LessEqual: + strcpy(buffer, "<="); + break; + case HLSLToken_GreaterEqual: + strcpy(buffer, ">="); + break; + case HLSLToken_EqualEqual: + strcpy(buffer, "=="); + break; + case HLSLToken_NotEqual: + strcpy(buffer, "!="); + break; + + case HLSLToken_LogicalAnd: + strcpy(buffer, "&&"); + break; + case HLSLToken_LogicalOr: + strcpy(buffer, "||"); + break; + + // literals + case HLSLToken_HalfLiteral: + strcpy(buffer, "half"); + break; + case 
HLSLToken_FloatLiteral: + strcpy(buffer, "float"); + break; + case HLSLToken_IntLiteral: + strcpy(buffer, "int"); + break; + // TODO: need uint, short, ushort + + case HLSLToken_Identifier: + strcpy(buffer, "identifier"); + break; + case HLSLToken_EndOfStream: + strcpy(buffer, ""); + break; + + case HLSLToken_Comment: + strcpy(buffer, "comment"); + break; + + default: + strcpy(buffer, "unknown"); + break; } } - } -} +} //namespace M4 diff --git a/hlslparser/src/HLSLTokenizer.h b/hlslparser/src/HLSLTokenizer.h index 166309a4..b48d451a 100644 --- a/hlslparser/src/HLSLTokenizer.h +++ b/hlslparser/src/HLSLTokenizer.h @@ -2,44 +2,42 @@ #include "Engine.h" -namespace M4 -{ +namespace M4 { /** In addition to the values in this enum, all of the ASCII characters are valid tokens. */ -enum HLSLToken -{ +enum HLSLToken { // The order here must match the order in the _reservedWords - + // Built-in types. - HLSLToken_Float = 256, + HLSLToken_Float = 256, HLSLToken_Float2, HLSLToken_Float3, HLSLToken_Float4, - HLSLToken_Float2x2, + HLSLToken_Float2x2, HLSLToken_Float3x3, HLSLToken_Float4x4, - + // for Nvidia/Adreno HLSLToken_Halfio, HLSLToken_Half2io, HLSLToken_Half3io, HLSLToken_Half4io, - + // for Android w/o fp16 storage HLSLToken_Halfst, HLSLToken_Half2st, HLSLToken_Half3st, HLSLToken_Half4st, - + HLSLToken_Half, HLSLToken_Half2, HLSLToken_Half3, HLSLToken_Half4, - HLSLToken_Half2x2, + HLSLToken_Half2x2, HLSLToken_Half3x3, HLSLToken_Half4x4, - + HLSLToken_Double, HLSLToken_Double2, HLSLToken_Double3, @@ -47,17 +45,17 @@ enum HLSLToken HLSLToken_Double2x2, HLSLToken_Double3x3, HLSLToken_Double4x4, - + HLSLToken_Bool, - HLSLToken_Bool2, - HLSLToken_Bool3, - HLSLToken_Bool4, - + HLSLToken_Bool2, + HLSLToken_Bool3, + HLSLToken_Bool4, + HLSLToken_Int, HLSLToken_Int2, HLSLToken_Int3, HLSLToken_Int4, - + HLSLToken_Uint, HLSLToken_Uint2, HLSLToken_Uint3, @@ -67,22 +65,22 @@ enum HLSLToken HLSLToken_Short2, HLSLToken_Short3, HLSLToken_Short4, - + HLSLToken_Ushort, HLSLToken_Ushort2, HLSLToken_Ushort3, HLSLToken_Ushort4, - + HLSLToken_Long, HLSLToken_Long2, HLSLToken_Long3, HLSLToken_Long4, - + HLSLToken_Ulong, HLSLToken_Ulong2, HLSLToken_Ulong3, HLSLToken_Ulong4, - + // TODO: u/char HLSLToken_Texture2D, HLSLToken_Texture3D, @@ -90,17 +88,17 @@ enum HLSLToken HLSLToken_Texture2DArray, HLSLToken_TextureCubeArray, HLSLToken_Texture2DMS, - + HLSLToken_Depth2D, HLSLToken_Depth2DArray, HLSLToken_DepthCube, // TODO: other depth types - + HLSLToken_RWTexture2D, - + HLSLToken_SamplerState, HLSLToken_SamplerComparisonState, - + // Reserved words. HLSLToken_If, HLSLToken_Else, @@ -111,11 +109,11 @@ enum HLSLToken HLSLToken_False, HLSLToken_Void, HLSLToken_Struct, - + // dx9 HLSLToken_CBuffer, HLSLToken_TBuffer, - + // dx10 templated types (TODO: hook to parser and generator) HLSLToken_ConstantBuffer, HLSLToken_StructuredBuffer, @@ -125,12 +123,12 @@ enum HLSLToken HLSLToken_ByteAddressBuffer, HLSLToken_RWByteAddressBuffer, // RWTexture, and other types - + HLSLToken_Register, HLSLToken_Return, HLSLToken_Continue, HLSLToken_Discard, - + HLSLToken_Const, HLSLToken_Static, HLSLToken_Inline, @@ -150,10 +148,10 @@ enum HLSLToken HLSLToken_Include, // HLSLToken_Pragma // HLSLToken_Line - + //=================== // End of strings that have to match in _reservedWords in .cpp - + // Multi-character symbols. 
HLSLToken_LessEqual, HLSLToken_GreaterEqual, @@ -165,29 +163,26 @@ enum HLSLToken HLSLToken_MinusEqual, HLSLToken_TimesEqual, HLSLToken_DivideEqual, - HLSLToken_LogicalAnd, // && - HLSLToken_LogicalOr, // || - + HLSLToken_LogicalAnd, // && + HLSLToken_LogicalOr, // || + // Other token types. HLSLToken_FloatLiteral, - HLSLToken_HalfLiteral, + HLSLToken_HalfLiteral, HLSLToken_IntLiteral, - + HLSLToken_Identifier, - HLSLToken_Comment, // Alec added this - + HLSLToken_Comment, // Alec added this + HLSLToken_EndOfStream, }; -class HLSLTokenizer -{ - +class HLSLTokenizer { public: - /// Maximum string length of an identifier. constexpr static int s_maxIdentifier = 255 + 1; constexpr static int s_maxComment = 4096; - + /// The file name is only used for error reporting. HLSLTokenizer(const char* fileName, const char* buffer, size_t length); @@ -199,7 +194,7 @@ class HLSLTokenizer /// Returns the number of the current token. float GetFloat() const; - int GetInt() const; + int GetInt() const; /// Returns the identifier for the current token. const char* GetIdentifier() const; @@ -225,35 +220,31 @@ class HLSLTokenizer /// Tokenizer will default to strip double-slash comments, but this tries to preserve them if true void SetKeepComments(bool enable) { m_keepComments = enable; } - -private: +private: bool SkipWhitespace(); bool SkipComment(); - bool SkipPragmaDirective(); + bool SkipPragmaDirective(); bool SkipInclude(); - + bool ScanNumber(); bool ScanLineDirective(); private: - - const char* m_fileName = nullptr; - const char* m_buffer = nullptr; - const char* m_bufferEnd = nullptr; - int m_lineNumber = 0; - bool m_error = false; - - int m_token = 0; - float m_fValue = 0.0f; - int m_iValue = 0; - char m_identifier[s_maxIdentifier] = {}; - char m_comment[s_maxComment] = {}; - char m_lineDirectiveFileName[s_maxIdentifier] = {}; - int m_tokenLineNumber = 0; - bool m_keepComments = false; - + const char* m_fileName = nullptr; + const char* m_buffer = nullptr; + const char* m_bufferEnd = nullptr; + int m_lineNumber = 0; + bool m_error = false; + + int m_token = 0; + float m_fValue = 0.0f; + int m_iValue = 0; + char m_identifier[s_maxIdentifier] = {}; + char m_comment[s_maxComment] = {}; + char m_lineDirectiveFileName[s_maxIdentifier] = {}; + int m_tokenLineNumber = 0; + bool m_keepComments = false; }; -} - +} //namespace M4 diff --git a/hlslparser/src/HLSLTree.cpp b/hlslparser/src/HLSLTree.cpp index 6a0392aa..4332b6b5 100644 --- a/hlslparser/src/HLSLTree.cpp +++ b/hlslparser/src/HLSLTree.cpp @@ -2,12 +2,11 @@ #include "Engine.h" -namespace M4 -{ +namespace M4 { // TODO: split helper calls out to new .h, so can include that // over to HLSLParser.cpp -extern bool IsSamplerType(const HLSLType & type); +extern bool IsSamplerType(const HLSLType &type); extern bool IsScalarType(HLSLBaseType type); extern bool IsIntegerType(HLSLBaseType type); @@ -15,26 +14,22 @@ extern bool IsFloatingType(HLSLBaseType type); extern int32_t GetVectorDimension(HLSLBaseType type); - - -HLSLTree::HLSLTree(Allocator* allocator) : - m_allocator(allocator), m_stringPool(allocator) +HLSLTree::HLSLTree(Allocator *allocator) : m_allocator(allocator), m_stringPool(allocator) { - m_firstPage = m_allocator->New(); - m_firstPage->next = NULL; + m_firstPage = m_allocator->New(); + m_firstPage->next = NULL; - m_currentPage = m_firstPage; + m_currentPage = m_firstPage; m_currentPageOffset = 0; - m_root = AddNode(NULL, 1); + m_root = AddNode(NULL, 1); } HLSLTree::~HLSLTree() { - NodePage* page = m_firstPage; - while (page != NULL) - { - 
NodePage* next = page->next; + NodePage *page = m_firstPage; + while (page != NULL) { + NodePage *next = page->next; m_allocator->Delete(page); page = next; } @@ -42,59 +37,55 @@ HLSLTree::~HLSLTree() void HLSLTree::AllocatePage() { - NodePage* newPage = m_allocator->New(); - newPage->next = NULL; - m_currentPage->next = newPage; - m_currentPageOffset = 0; - m_currentPage = newPage; + NodePage *newPage = m_allocator->New(); + newPage->next = NULL; + m_currentPage->next = newPage; + m_currentPageOffset = 0; + m_currentPage = newPage; } -const char* HLSLTree::AddString(const char* string) -{ +const char *HLSLTree::AddString(const char *string) +{ return m_stringPool.AddString(string); } -const char* HLSLTree::AddStringFormat(const char* format, ...) +const char *HLSLTree::AddStringFormat(const char *format, ...) { va_list args; va_start(args, format); - const char * string = m_stringPool.AddStringFormatList(format, args); + const char *string = m_stringPool.AddStringFormatList(format, args); va_end(args); return string; } -bool HLSLTree::GetContainsString(const char* string) const +bool HLSLTree::GetContainsString(const char *string) const { return m_stringPool.GetContainsString(string); } -HLSLRoot* HLSLTree::GetRoot() const +HLSLRoot *HLSLTree::GetRoot() const { return m_root; } -void* HLSLTree::AllocateMemory(size_t size) +void *HLSLTree::AllocateMemory(size_t size) { - if (m_currentPageOffset + size > s_nodePageSize) - { + if (m_currentPageOffset + size > s_nodePageSize) { AllocatePage(); } - void* buffer = m_currentPage->buffer + m_currentPageOffset; + void *buffer = m_currentPage->buffer + m_currentPageOffset; m_currentPageOffset += size; return buffer; } // @@ This doesn't do any parameter matching. Simply returns the first function with that name. 
-HLSLFunction * HLSLTree::FindFunction(const char * name) +HLSLFunction *HLSLTree::FindFunction(const char *name) { - HLSLStatement * statement = m_root->statement; - while (statement != NULL) - { - if (statement->nodeType == HLSLNodeType_Function) - { - HLSLFunction * function = (HLSLFunction *)statement; - if (String_Equal(name, function->name)) - { + HLSLStatement *statement = m_root->statement; + while (statement != NULL) { + if (statement->nodeType == HLSLNodeType_Function) { + HLSLFunction *function = (HLSLFunction *)statement; + if (String_Equal(name, function->name)) { return function; } } @@ -105,56 +96,47 @@ HLSLFunction * HLSLTree::FindFunction(const char * name) return NULL; } -HLSLDeclaration * HLSLTree::FindGlobalDeclaration(const char * name, HLSLBuffer ** buffer_out/*=NULL*/) +HLSLDeclaration *HLSLTree::FindGlobalDeclaration(const char *name, HLSLBuffer **buffer_out /*=NULL*/) { - HLSLStatement * statement = m_root->statement; - while (statement != NULL) - { - if (statement->nodeType == HLSLNodeType_Declaration) - { - HLSLDeclaration * declaration = (HLSLDeclaration *)statement; - if (String_Equal(name, declaration->name)) - { + HLSLStatement *statement = m_root->statement; + while (statement != NULL) { + if (statement->nodeType == HLSLNodeType_Declaration) { + HLSLDeclaration *declaration = (HLSLDeclaration *)statement; + if (String_Equal(name, declaration->name)) { if (buffer_out) *buffer_out = NULL; return declaration; } } - else if (statement->nodeType == HLSLNodeType_Buffer) - { - HLSLBuffer* buffer = (HLSLBuffer*)statement; - + else if (statement->nodeType == HLSLNodeType_Buffer) { + HLSLBuffer *buffer = (HLSLBuffer *)statement; + // This searches the fields to find the buffer, // since cbuffer/tbuffer represent globals. - if (buffer->IsGlobalFields()) - { - HLSLDeclaration* field = buffer->field; - while (field != NULL) - { + if (buffer->IsGlobalFields()) { + HLSLDeclaration *field = buffer->field; + while (field != NULL) { ASSERT(field->nodeType == HLSLNodeType_Declaration); - if (String_Equal(name, field->name)) - { + if (String_Equal(name, field->name)) { if (buffer_out) *buffer_out = buffer; return field; } - field = (HLSLDeclaration*)field->nextStatement; + field = (HLSLDeclaration *)field->nextStatement; } } - else - { - if (String_Equal(name, buffer->name)) - { + else { + if (String_Equal(name, buffer->name)) { if (buffer_out) *buffer_out = buffer; return NULL; } - + /* This isn't same type... - + // Note: should pass buffers, but buffer/texture // and cbuffer fields can be global to entire shader. 
- + // find struct first const HLSLStruct* bufferStruct = buffer->bufferStruct; - + // new search those for the fields HLSLStructField* field = bufferStruct->field; while (field != NULL) @@ -178,16 +160,13 @@ HLSLDeclaration * HLSLTree::FindGlobalDeclaration(const char * name, HLSLBuffer return NULL; } -HLSLStruct * HLSLTree::FindGlobalStruct(const char * name) +HLSLStruct *HLSLTree::FindGlobalStruct(const char *name) { - HLSLStatement * statement = m_root->statement; - while (statement != NULL) - { - if (statement->nodeType == HLSLNodeType_Struct) - { - HLSLStruct * declaration = (HLSLStruct *)statement; - if (String_Equal(name, declaration->name)) - { + HLSLStatement *statement = m_root->statement; + while (statement != NULL) { + if (statement->nodeType == HLSLNodeType_Struct) { + HLSLStruct *declaration = (HLSLStruct *)statement; + if (String_Equal(name, declaration->name)) { return declaration; } } @@ -261,16 +240,13 @@ HLSLPipeline * HLSLTree::FindPipeline(const char * name) } */ -HLSLBuffer * HLSLTree::FindBuffer(const char * name) +HLSLBuffer *HLSLTree::FindBuffer(const char *name) { - HLSLStatement * statement = m_root->statement; - while (statement != NULL) - { - if (statement->nodeType == HLSLNodeType_Buffer) - { - HLSLBuffer * buffer = (HLSLBuffer *)statement; - if (String_Equal(name, buffer->name)) - { + HLSLStatement *statement = m_root->statement; + while (statement != NULL) { + if (statement->nodeType == HLSLNodeType_Buffer) { + HLSLBuffer *buffer = (HLSLBuffer *)statement; + if (String_Equal(name, buffer->name)) { return buffer; } } @@ -281,15 +257,12 @@ HLSLBuffer * HLSLTree::FindBuffer(const char * name) return NULL; } - - -bool HLSLTree::GetExpressionValue(HLSLExpression * expression, int & value) +bool HLSLTree::GetExpressionValue(HLSLExpression *expression, int &value) { - ASSERT (expression != NULL); + ASSERT(expression != NULL); // Expression must be constant. 
- if ((expression->expressionType.flags & HLSLTypeFlag_Const) == 0) - { + if ((expression->expressionType.flags & HLSLTypeFlag_Const) == 0) { return false; } @@ -298,30 +271,25 @@ bool HLSLTree::GetExpressionValue(HLSLExpression * expression, int & value) if (expression->expressionType.baseType != HLSLBaseType_Long && expression->expressionType.baseType != HLSLBaseType_Short && expression->expressionType.baseType != HLSLBaseType_Int && - - expression->expressionType.baseType != HLSLBaseType_Bool) - { + + expression->expressionType.baseType != HLSLBaseType_Bool) { return false; } - if (expression->expressionType.array) - { + if (expression->expressionType.array) { return false; } - if (expression->nodeType == HLSLNodeType_BinaryExpression) - { - HLSLBinaryExpression * binaryExpression = (HLSLBinaryExpression *)expression; + if (expression->nodeType == HLSLNodeType_BinaryExpression) { + HLSLBinaryExpression *binaryExpression = (HLSLBinaryExpression *)expression; int value1, value2; if (!GetExpressionValue(binaryExpression->expression1, value1) || - !GetExpressionValue(binaryExpression->expression2, value2)) - { + !GetExpressionValue(binaryExpression->expression2, value2)) { return false; } - switch(binaryExpression->binaryOp) - { + switch (binaryExpression->binaryOp) { case HLSLBinaryOp_And: value = value1 && value2; return true; @@ -376,17 +344,14 @@ bool HLSLTree::GetExpressionValue(HLSLExpression * expression, int & value) return false; } } - else if (expression->nodeType == HLSLNodeType_UnaryExpression) - { - HLSLUnaryExpression * unaryExpression = (HLSLUnaryExpression *)expression; + else if (expression->nodeType == HLSLNodeType_UnaryExpression) { + HLSLUnaryExpression *unaryExpression = (HLSLUnaryExpression *)expression; - if (!GetExpressionValue(unaryExpression->expression, value)) - { + if (!GetExpressionValue(unaryExpression->expression, value)) { return false; } - switch(unaryExpression->unaryOp) - { + switch (unaryExpression->unaryOp) { case HLSLUnaryOp_Negative: value = -value; return true; @@ -407,38 +372,34 @@ bool HLSLTree::GetExpressionValue(HLSLExpression * expression, int & value) return false; } } - else if (expression->nodeType == HLSLNodeType_IdentifierExpression) - { - HLSLIdentifierExpression * identifier = (HLSLIdentifierExpression *)expression; + else if (expression->nodeType == HLSLNodeType_IdentifierExpression) { + HLSLIdentifierExpression *identifier = (HLSLIdentifierExpression *)expression; - HLSLDeclaration * declaration = FindGlobalDeclaration(identifier->name); - if (declaration == NULL) - { + HLSLDeclaration *declaration = FindGlobalDeclaration(identifier->name); + if (declaration == NULL) { return false; } - if ((declaration->type.flags & HLSLTypeFlag_Const) == 0) - { + if ((declaration->type.flags & HLSLTypeFlag_Const) == 0) { return false; } return GetExpressionValue(declaration->assignment, value); } - else if (expression->nodeType == HLSLNodeType_LiteralExpression) - { - HLSLLiteralExpression * literal = (HLSLLiteralExpression *)expression; - + else if (expression->nodeType == HLSLNodeType_LiteralExpression) { + HLSLLiteralExpression *literal = (HLSLLiteralExpression *)expression; + if (literal->expressionType.baseType == HLSLBaseType_Int) value = literal->iValue; else if (literal->expressionType.baseType == HLSLBaseType_Long) value = literal->iValue; // precision loss to Int else if (literal->expressionType.baseType == HLSLBaseType_Short) value = literal->iValue; - + else if (literal->expressionType.baseType == HLSLBaseType_Bool) value = 
(int)literal->bValue; else return false; - + return true; } @@ -446,26 +407,25 @@ bool HLSLTree::GetExpressionValue(HLSLExpression * expression, int & value) } // TODO: Nothing calling this? -bool HLSLTree::NeedsFunction(const char* name) +bool HLSLTree::NeedsFunction(const char *name) { // Early out if (!GetContainsString(name)) return false; - struct NeedsFunctionVisitor: HLSLTreeVisitor - { - const char* name; + struct NeedsFunctionVisitor : HLSLTreeVisitor { + const char *name; bool result; virtual ~NeedsFunctionVisitor() {} - - virtual void VisitTopLevelStatement(HLSLStatement * node) + + virtual void VisitTopLevelStatement(HLSLStatement *node) { if (!node->hidden) HLSLTreeVisitor::VisitTopLevelStatement(node); } - virtual void VisitFunctionCall(HLSLFunctionCall * node) + virtual void VisitFunctionCall(HLSLFunctionCall *node) { result = result || String_Equal(name, node->function->name); @@ -483,29 +443,26 @@ bool HLSLTree::NeedsFunction(const char* name) } // Returns dimension, 0 if invalid. -int HLSLTree::GetExpressionValue(HLSLExpression * expression, float values[4]) +int HLSLTree::GetExpressionValue(HLSLExpression *expression, float values[4]) { - ASSERT (expression != NULL); + ASSERT(expression != NULL); // Expression must be constant. - if ((expression->expressionType.flags & HLSLTypeFlag_Const) == 0) - { + if ((expression->expressionType.flags & HLSLTypeFlag_Const) == 0) { return 0; } HLSLBaseType type = expression->expressionType.baseType; - - if (IsIntegerType(type)) - { - if (IsScalarType(type)) - { + + if (IsIntegerType(type)) { + if (IsScalarType(type)) { int intValue; if (GetExpressionValue(expression, intValue)) { - for (int i = 0; i < 4; i++) values[i] = (float)intValue; // @@ Warn if conversion is not exact. + for (int i = 0; i < 4; i++) values[i] = (float)intValue; // @@ Warn if conversion is not exact. return 1; } } - + return 0; } // this skips other int types not handled above @@ -513,48 +470,40 @@ int HLSLTree::GetExpressionValue(HLSLExpression * expression, float values[4]) return 0; // @@ Not supported yet, but we may need it? - if (expression->expressionType.array) - { + if (expression->expressionType.array) { return false; } int dim = GetVectorDimension(type); - if (expression->nodeType == HLSLNodeType_BinaryExpression) - { - HLSLBinaryExpression * binaryExpression = (HLSLBinaryExpression *)expression; - + if (expression->nodeType == HLSLNodeType_BinaryExpression) { + HLSLBinaryExpression *binaryExpression = (HLSLBinaryExpression *)expression; + float values1[4], values2[4]; int dim1 = GetExpressionValue(binaryExpression->expression1, values1); int dim2 = GetExpressionValue(binaryExpression->expression2, values2); - if (dim1 == 0 || dim2 == 0) - { + if (dim1 == 0 || dim2 == 0) { return 0; } - if (dim1 != dim2) - { + if (dim1 != dim2) { // Broadcast scalar to vector size. 
- if (dim1 == 1) - { + if (dim1 == 1) { for (int i = 1; i < dim2; i++) values1[i] = values1[0]; dim1 = dim2; } - else if (dim2 == 1) - { + else if (dim2 == 1) { for (int i = 1; i < dim1; i++) values2[i] = values2[0]; dim2 = dim1; } - else - { + else { return 0; } } ASSERT(dim == dim1); - switch(binaryExpression->binaryOp) - { + switch (binaryExpression->binaryOp) { case HLSLBinaryOp_Add: for (int i = 0; i < dim; i++) values[i] = values1[i] + values2[i]; return dim; @@ -571,19 +520,16 @@ int HLSLTree::GetExpressionValue(HLSLExpression * expression, float values[4]) return 0; } } - else if (expression->nodeType == HLSLNodeType_UnaryExpression) - { - HLSLUnaryExpression * unaryExpression = (HLSLUnaryExpression *)expression; - + else if (expression->nodeType == HLSLNodeType_UnaryExpression) { + HLSLUnaryExpression *unaryExpression = (HLSLUnaryExpression *)expression; + int dim1 = GetExpressionValue(unaryExpression->expression, values); - if (dim1 == 0) - { + if (dim1 == 0) { return 0; } ASSERT(dim == dim1); - switch(unaryExpression->unaryOp) - { + switch (unaryExpression->unaryOp) { case HLSLUnaryOp_Negative: for (int i = 0; i < dim; i++) values[i] = -values[i]; return dim; @@ -594,14 +540,12 @@ int HLSLTree::GetExpressionValue(HLSLExpression * expression, float values[4]) return 0; } } - else if (expression->nodeType == HLSLNodeType_ConstructorExpression) - { - HLSLConstructorExpression * constructor = (HLSLConstructorExpression *)expression; + else if (expression->nodeType == HLSLNodeType_ConstructorExpression) { + HLSLConstructorExpression *constructor = (HLSLConstructorExpression *)expression; int idx = 0; - HLSLExpression * arg = constructor->argument; - while (arg != NULL) - { + HLSLExpression *arg = constructor->argument; + while (arg != NULL) { float tmp[4]; int n = GetExpressionValue(arg, tmp); for (int i = 0; i < n; i++) values[idx + i] = tmp[i]; @@ -613,25 +557,21 @@ int HLSLTree::GetExpressionValue(HLSLExpression * expression, float values[4]) return dim; } - else if (expression->nodeType == HLSLNodeType_IdentifierExpression) - { - HLSLIdentifierExpression * identifier = (HLSLIdentifierExpression *)expression; + else if (expression->nodeType == HLSLNodeType_IdentifierExpression) { + HLSLIdentifierExpression *identifier = (HLSLIdentifierExpression *)expression; - HLSLDeclaration * declaration = FindGlobalDeclaration(identifier->name); - if (declaration == NULL) - { + HLSLDeclaration *declaration = FindGlobalDeclaration(identifier->name); + if (declaration == NULL) { return 0; } - if ((declaration->type.flags & HLSLTypeFlag_Const) == 0) - { + if ((declaration->type.flags & HLSLTypeFlag_Const) == 0) { return 0; } return GetExpressionValue(declaration->assignment, values); } - else if (expression->nodeType == HLSLNodeType_LiteralExpression) - { - HLSLLiteralExpression * literal = (HLSLLiteralExpression *)expression; + else if (expression->nodeType == HLSLNodeType_LiteralExpression) { + HLSLLiteralExpression *literal = (HLSLLiteralExpression *)expression; if (literal->expressionType.baseType == HLSLBaseType_Float) values[0] = literal->fValue; @@ -639,44 +579,41 @@ int HLSLTree::GetExpressionValue(HLSLExpression * expression, float values[4]) values[0] = literal->fValue; else if (literal->expressionType.baseType == HLSLBaseType_Double) values[0] = literal->fValue; // TODO: need more precision - + else if (literal->expressionType.baseType == HLSLBaseType_Bool) values[0] = literal->bValue; - + // TODO: add uint types, fix precision of short/long/double/half // signed ints else if 
(literal->expressionType.baseType == HLSLBaseType_Int) - values[0] = (float)literal->iValue; // @@ Warn if conversion is not exact. + values[0] = (float)literal->iValue; // @@ Warn if conversion is not exact. else if (literal->expressionType.baseType == HLSLBaseType_Short) values[0] = (float)literal->iValue; else if (literal->expressionType.baseType == HLSLBaseType_Long) values[0] = (float)literal->iValue; else return 0; - + return 1; } return 0; } - - - -void HLSLTreeVisitor::VisitType(HLSLType & type) +void HLSLTreeVisitor::VisitType(HLSLType &type) { } -void HLSLTreeVisitor::VisitRoot(HLSLRoot * root) +void HLSLTreeVisitor::VisitRoot(HLSLRoot *root) { - HLSLStatement * statement = root->statement; + HLSLStatement *statement = root->statement; while (statement != NULL) { VisitTopLevelStatement(statement); statement = statement->nextStatement; } } -void HLSLTreeVisitor::VisitTopLevelStatement(HLSLStatement * node) +void HLSLTreeVisitor::VisitTopLevelStatement(HLSLStatement *node) { if (node->nodeType == HLSLNodeType_Declaration) { VisitDeclaration((HLSLDeclaration *)node); @@ -691,23 +628,23 @@ void HLSLTreeVisitor::VisitTopLevelStatement(HLSLStatement * node) VisitFunction((HLSLFunction *)node); } else if (node->nodeType == HLSLNodeType_Comment) { - VisitComment((HLSLComment*)node); + VisitComment((HLSLComment *)node); } - + // FX file stuff -// else if (node->nodeType == HLSLNodeType_Technique) { -// VisitTechnique((HLSLTechnique *)node); -// } -// else if (node->nodeType == HLSLNodeType_Pipeline) { -// VisitPipeline((HLSLPipeline *)node); -// } - + // else if (node->nodeType == HLSLNodeType_Technique) { + // VisitTechnique((HLSLTechnique *)node); + // } + // else if (node->nodeType == HLSLNodeType_Pipeline) { + // VisitPipeline((HLSLPipeline *)node); + // } + else { ASSERT(false); } } -void HLSLTreeVisitor::VisitStatements(HLSLStatement * statement) +void HLSLTreeVisitor::VisitStatements(HLSLStatement *statement) { while (statement != NULL) { VisitStatement(statement); @@ -715,7 +652,7 @@ void HLSLTreeVisitor::VisitStatements(HLSLStatement * statement) } } -void HLSLTreeVisitor::VisitStatement(HLSLStatement * node) +void HLSLTreeVisitor::VisitStatement(HLSLStatement *node) { // Function statements if (node->nodeType == HLSLNodeType_Declaration) { @@ -753,7 +690,7 @@ void HLSLTreeVisitor::VisitStatement(HLSLStatement * node) } } -void HLSLTreeVisitor::VisitDeclaration(HLSLDeclaration * node) +void HLSLTreeVisitor::VisitDeclaration(HLSLDeclaration *node) { VisitType(node->type); /*do { @@ -768,31 +705,29 @@ void HLSLTreeVisitor::VisitDeclaration(HLSLDeclaration * node) } } -void HLSLTreeVisitor::VisitStruct(HLSLStruct * node) +void HLSLTreeVisitor::VisitStruct(HLSLStruct *node) { - HLSLStructField * field = node->field; + HLSLStructField *field = node->field; while (field != NULL) { VisitStructField(field); field = field->nextField; } } -void HLSLTreeVisitor::VisitStructField(HLSLStructField * node) +void HLSLTreeVisitor::VisitStructField(HLSLStructField *node) { // This can use a constant in an array field that must be resolved - if (node->type.array) - { + if (node->type.array) { VisitExpression(node->type.arraySize); } - + VisitType(node->type); } -void HLSLTreeVisitor::VisitBuffer(HLSLBuffer * node) +void HLSLTreeVisitor::VisitBuffer(HLSLBuffer *node) { - if (node->IsGlobalFields()) - { - HLSLDeclaration* field = node->field; + if (node->IsGlobalFields()) { + HLSLDeclaration *field = node->field; while (field != NULL) { ASSERT(field->nodeType == HLSLNodeType_Declaration); 
VisitDeclaration(field); @@ -800,8 +735,7 @@ void HLSLTreeVisitor::VisitBuffer(HLSLBuffer * node) field = (HLSLDeclaration *)field->nextStatement; } } - else - { + else { VisitStruct(node->bufferStruct); } } @@ -811,11 +745,11 @@ void HLSLTreeVisitor::VisitBuffer(HLSLBuffer * node) VisitType(node->type); }*/ -void HLSLTreeVisitor::VisitFunction(HLSLFunction * node) +void HLSLTreeVisitor::VisitFunction(HLSLFunction *node) { VisitType(node->returnType); - HLSLArgument* argument = node->argument; + HLSLArgument *argument = node->argument; while (argument != NULL) { VisitArgument(argument); argument = argument->nextArgument; @@ -824,7 +758,7 @@ void HLSLTreeVisitor::VisitFunction(HLSLFunction * node) VisitStatements(node->statement); } -void HLSLTreeVisitor::VisitArgument(HLSLArgument * node) +void HLSLTreeVisitor::VisitArgument(HLSLArgument *node) { VisitType(node->type); if (node->defaultValue != NULL) { @@ -832,12 +766,12 @@ void HLSLTreeVisitor::VisitArgument(HLSLArgument * node) } } -void HLSLTreeVisitor::VisitExpressionStatement(HLSLExpressionStatement * node) +void HLSLTreeVisitor::VisitExpressionStatement(HLSLExpressionStatement *node) { VisitExpression(node->expression); } -void HLSLTreeVisitor::VisitExpression(HLSLExpression * node) +void HLSLTreeVisitor::VisitExpression(HLSLExpression *node) { VisitType(node->expressionType); @@ -872,29 +806,29 @@ void HLSLTreeVisitor::VisitExpression(HLSLExpression * node) VisitFunctionCall((HLSLFunctionCall *)node); } else if (node->nodeType == HLSLNodeType_MemberFunctionCall) { - HLSLMemberFunctionCall* memberFunctionCall = (HLSLMemberFunctionCall *)node; - VisitIdentifierExpression((HLSLIdentifierExpression*)memberFunctionCall->memberIdentifier); // const_cast + HLSLMemberFunctionCall *memberFunctionCall = (HLSLMemberFunctionCall *)node; + VisitIdentifierExpression((HLSLIdentifierExpression *)memberFunctionCall->memberIdentifier); // const_cast VisitFunctionCall(memberFunctionCall); } // Acoget-TODO: This was missing. Did adding it break anything? 
-// else if (node->nodeType == HLSLNodeType_SamplerState) { -// VisitSamplerState((HLSLSamplerState *)node); -// } + // else if (node->nodeType == HLSLNodeType_SamplerState) { + // VisitSamplerState((HLSLSamplerState *)node); + // } else { ASSERT(false); } } -void HLSLTreeVisitor::VisitReturnStatement(HLSLReturnStatement * node) +void HLSLTreeVisitor::VisitReturnStatement(HLSLReturnStatement *node) { VisitExpression(node->expression); } -void HLSLTreeVisitor::VisitDiscardStatement(HLSLDiscardStatement * node) {} -void HLSLTreeVisitor::VisitBreakStatement(HLSLBreakStatement * node) {} -void HLSLTreeVisitor::VisitContinueStatement(HLSLContinueStatement * node) {} +void HLSLTreeVisitor::VisitDiscardStatement(HLSLDiscardStatement *node) {} +void HLSLTreeVisitor::VisitBreakStatement(HLSLBreakStatement *node) {} +void HLSLTreeVisitor::VisitContinueStatement(HLSLContinueStatement *node) {} -void HLSLTreeVisitor::VisitIfStatement(HLSLIfStatement * node) +void HLSLTreeVisitor::VisitIfStatement(HLSLIfStatement *node) { VisitExpression(node->condition); VisitStatements(node->statement); @@ -903,7 +837,7 @@ void HLSLTreeVisitor::VisitIfStatement(HLSLIfStatement * node) } } -void HLSLTreeVisitor::VisitForStatement(HLSLForStatement * node) +void HLSLTreeVisitor::VisitForStatement(HLSLForStatement *node) { if (node->initialization) { VisitDeclaration(node->initialization); @@ -917,61 +851,61 @@ void HLSLTreeVisitor::VisitForStatement(HLSLForStatement * node) VisitStatements(node->statement); } -void HLSLTreeVisitor::VisitBlockStatement(HLSLBlockStatement * node) +void HLSLTreeVisitor::VisitBlockStatement(HLSLBlockStatement *node) { VisitStatements(node->statement); } -void HLSLTreeVisitor::VisitUnaryExpression(HLSLUnaryExpression * node) +void HLSLTreeVisitor::VisitUnaryExpression(HLSLUnaryExpression *node) { VisitExpression(node->expression); } -void HLSLTreeVisitor::VisitBinaryExpression(HLSLBinaryExpression * node) +void HLSLTreeVisitor::VisitBinaryExpression(HLSLBinaryExpression *node) { VisitExpression(node->expression1); VisitExpression(node->expression2); } -void HLSLTreeVisitor::VisitConditionalExpression(HLSLConditionalExpression * node) +void HLSLTreeVisitor::VisitConditionalExpression(HLSLConditionalExpression *node) { VisitExpression(node->condition); VisitExpression(node->falseExpression); VisitExpression(node->trueExpression); } -void HLSLTreeVisitor::VisitCastingExpression(HLSLCastingExpression * node) +void HLSLTreeVisitor::VisitCastingExpression(HLSLCastingExpression *node) { VisitType(node->type); VisitExpression(node->expression); } -void HLSLTreeVisitor::VisitLiteralExpression(HLSLLiteralExpression * node) {} -void HLSLTreeVisitor::VisitIdentifierExpression(HLSLIdentifierExpression * node) {} +void HLSLTreeVisitor::VisitLiteralExpression(HLSLLiteralExpression *node) {} +void HLSLTreeVisitor::VisitIdentifierExpression(HLSLIdentifierExpression *node) {} -void HLSLTreeVisitor::VisitConstructorExpression(HLSLConstructorExpression * node) +void HLSLTreeVisitor::VisitConstructorExpression(HLSLConstructorExpression *node) { - HLSLExpression * argument = node->argument; + HLSLExpression *argument = node->argument; while (argument != NULL) { VisitExpression(argument); argument = argument->nextExpression; } } -void HLSLTreeVisitor::VisitMemberAccess(HLSLMemberAccess * node) +void HLSLTreeVisitor::VisitMemberAccess(HLSLMemberAccess *node) { VisitExpression(node->object); } -void HLSLTreeVisitor::VisitArrayAccess(HLSLArrayAccess * node) +void HLSLTreeVisitor::VisitArrayAccess(HLSLArrayAccess 
*node) { VisitExpression(node->array); VisitExpression(node->index); } -void HLSLTreeVisitor::VisitFunctionCall(HLSLFunctionCall * node) +void HLSLTreeVisitor::VisitFunctionCall(HLSLFunctionCall *node) { - HLSLExpression * argument = node->argument; + HLSLExpression *argument = node->argument; while (argument != NULL) { VisitExpression(argument); argument = argument->nextExpression; @@ -1015,14 +949,13 @@ void HLSLTreeVisitor::VisitPipeline(HLSLPipeline * node) } */ -void HLSLTreeVisitor::VisitComment(HLSLComment * node) +void HLSLTreeVisitor::VisitComment(HLSLComment *node) { - } -void HLSLTreeVisitor::VisitFunctions(HLSLRoot * root) +void HLSLTreeVisitor::VisitFunctions(HLSLRoot *root) { - HLSLStatement * statement = root->statement; + HLSLStatement *statement = root->statement; while (statement != NULL) { if (statement->nodeType == HLSLNodeType_Function) { VisitFunction((HLSLFunction *)statement); @@ -1032,9 +965,9 @@ void HLSLTreeVisitor::VisitFunctions(HLSLRoot * root) } } -void HLSLTreeVisitor::VisitParameters(HLSLRoot * root) +void HLSLTreeVisitor::VisitParameters(HLSLRoot *root) { - HLSLStatement * statement = root->statement; + HLSLStatement *statement = root->statement; while (statement != NULL) { if (statement->nodeType == HLSLNodeType_Declaration) { VisitDeclaration((HLSLDeclaration *)statement); @@ -1044,54 +977,49 @@ void HLSLTreeVisitor::VisitParameters(HLSLRoot * root) } } - -class ResetHiddenFlagVisitor : public HLSLTreeVisitor -{ +class ResetHiddenFlagVisitor : public HLSLTreeVisitor { public: virtual ~ResetHiddenFlagVisitor() {} - - virtual void VisitTopLevelStatement(HLSLStatement * statement) override + + virtual void VisitTopLevelStatement(HLSLStatement *statement) override { statement->hidden = true; - if (statement->nodeType == HLSLNodeType_Buffer) - { - VisitBuffer((HLSLBuffer*)statement); + if (statement->nodeType == HLSLNodeType_Buffer) { + VisitBuffer((HLSLBuffer *)statement); } } // Hide buffer fields. - virtual void VisitDeclaration(HLSLDeclaration * node) override + virtual void VisitDeclaration(HLSLDeclaration *node) override { - // node->hidden = true; + // node->hidden = true; } - virtual void VisitComment(HLSLComment * node) override + virtual void VisitComment(HLSLComment *node) override { node->hidden = true; } - - virtual void VisitArgument(HLSLArgument * node) override + + virtual void VisitArgument(HLSLArgument *node) override { - node->hidden = false; // Arguments are visible by default. + node->hidden = false; // Arguments are visible by default. 
} }; -class MarkVisibleStatementsVisitor : public HLSLTreeVisitor -{ +class MarkVisibleStatementsVisitor : public HLSLTreeVisitor { public: - - HLSLTree * tree; - MarkVisibleStatementsVisitor(HLSLTree * tree) : tree(tree) {} + HLSLTree *tree; + MarkVisibleStatementsVisitor(HLSLTree *tree) : tree(tree) {} virtual ~MarkVisibleStatementsVisitor() {} - - virtual void VisitComment(HLSLComment * node) override + + virtual void VisitComment(HLSLComment *node) override { node->hidden = false; } - virtual void VisitFunction(HLSLFunction * node) override + virtual void VisitFunction(HLSLFunction *node) override { node->hidden = false; HLSLTreeVisitor::VisitFunction(node); @@ -1100,118 +1028,101 @@ class MarkVisibleStatementsVisitor : public HLSLTreeVisitor VisitFunction(node->forward); } - virtual void VisitFunctionCall(HLSLFunctionCall * node) override + virtual void VisitFunctionCall(HLSLFunctionCall *node) override { HLSLTreeVisitor::VisitFunctionCall(node); - if (node->function->hidden) - { - VisitFunction(const_cast(node->function)); + if (node->function->hidden) { + VisitFunction(const_cast(node->function)); } } - virtual void VisitIdentifierExpression(HLSLIdentifierExpression * node) override + virtual void VisitIdentifierExpression(HLSLIdentifierExpression *node) override { HLSLTreeVisitor::VisitIdentifierExpression(node); - - if (node->global) - { - HLSLBuffer* buffer = NULL; - HLSLDeclaration * declaration = tree->FindGlobalDeclaration(node->name, &buffer); - if (declaration != NULL && declaration->hidden) - { + if (node->global) { + HLSLBuffer *buffer = NULL; + HLSLDeclaration *declaration = tree->FindGlobalDeclaration(node->name, &buffer); + + if (declaration != NULL && declaration->hidden) { declaration->hidden = false; VisitDeclaration(declaration); } - if (buffer != NULL && buffer->hidden) - { + if (buffer != NULL && buffer->hidden) { buffer->hidden = false; } } } - - virtual void VisitType(HLSLType & type) override + + virtual void VisitType(HLSLType &type) override { -// if (type.array) -// { -// // Alec added this to try to handle structs with array constants, but -// // it causes other issues. VisitStructField calls VisitType. -// -// // handle sized or unsized array, since sized may use constant -// // VisitExpression(type.arraySize); -// int bp = 0; -// bp = bp; -// } -// else - if (type.baseType == HLSLBaseType_UserDefined) - { - HLSLStruct * globalStruct = tree->FindGlobalStruct(type.typeName); - if (globalStruct != NULL) - { + // if (type.array) + // { + // // Alec added this to try to handle structs with array constants, but + // // it causes other issues. VisitStructField calls VisitType. + // + // // handle sized or unsized array, since sized may use constant + // // VisitExpression(type.arraySize); + // int bp = 0; + // bp = bp; + // } + // else + if (type.baseType == HLSLBaseType_UserDefined) { + HLSLStruct *globalStruct = tree->FindGlobalStruct(type.typeName); + if (globalStruct != NULL) { globalStruct->hidden = false; VisitStruct(globalStruct); } } } - }; - -void PruneTree(HLSLTree* tree, const char* entryName0, const char* entryName1/*=NULL*/) +void PruneTree(HLSLTree *tree, const char *entryName0, const char *entryName1 /*=NULL*/) { - HLSLRoot* root = tree->GetRoot(); + HLSLRoot *root = tree->GetRoot(); // Reset all flags. ResetHiddenFlagVisitor reset; reset.VisitRoot(root); // Mark all the statements necessary for these entrypoints. 
- HLSLFunction* entry = tree->FindFunction(entryName0); - if (entry != NULL) - { + HLSLFunction *entry = tree->FindFunction(entryName0); + if (entry != NULL) { MarkVisibleStatementsVisitor mark(tree); mark.VisitFunction(entry); } - if (entryName1 != NULL) - { + if (entryName1 != NULL) { entry = tree->FindFunction(entryName1); - if (entry != NULL) - { + if (entry != NULL) { MarkVisibleStatementsVisitor mark(tree); mark.VisitFunction(entry); } } // Mark buffers visible, if any of their fields is visible. - HLSLStatement * statement = root->statement; - while (statement != NULL) - { - if (statement->nodeType == HLSLNodeType_Buffer) - { - HLSLBuffer* buffer = (HLSLBuffer*)statement; + HLSLStatement *statement = root->statement; + while (statement != NULL) { + if (statement->nodeType == HLSLNodeType_Buffer) { + HLSLBuffer *buffer = (HLSLBuffer *)statement; - if (buffer->IsGlobalFields()) - { + if (buffer->IsGlobalFields()) { // mark buffer visible if any of its fields are used - HLSLDeclaration* field = buffer->field; - while (field != NULL) - { + HLSLDeclaration *field = buffer->field; + while (field != NULL) { ASSERT(field->nodeType == HLSLNodeType_Declaration); - if (!field->hidden) - { + if (!field->hidden) { buffer->hidden = false; break; } - field = (HLSLDeclaration*)field->nextStatement; + field = (HLSLDeclaration *)field->nextStatement; } } - else - { + else { // TODO: these load from a struct so may just need // to somehow mark this if present. - + /* all struct fields are hidden = false, so this doesn't work // mark buffer visible if any struct fields are used HLSLStructField* field = buffer->bufferStruct->field; @@ -1233,84 +1144,73 @@ void PruneTree(HLSLTree* tree, const char* entryName0, const char* entryName1/*= } } - -void SortTree(HLSLTree * tree) +void SortTree(HLSLTree *tree) { // Stable sort so that statements are in this order: // const scalars for arrays, structs, declarations, functions, techniques. - // but their relative order is preserved. + // but their relative order is preserved. 
- HLSLRoot* root = tree->GetRoot(); + HLSLRoot *root = tree->GetRoot(); + + HLSLStatement *constScalarDeclarations = NULL; + HLSLStatement *lastConstScalarDeclaration = NULL; - HLSLStatement* constScalarDeclarations = NULL; - HLSLStatement* lastConstScalarDeclaration = NULL; - - HLSLStatement* structs = NULL; - HLSLStatement* lastStruct = NULL; - - HLSLStatement* constDeclarations = NULL; - HLSLStatement* lastConstDeclaration = NULL; - - HLSLStatement* declarations = NULL; - HLSLStatement* lastDeclaration = NULL; - - HLSLStatement* functions = NULL; - HLSLStatement* lastFunction = NULL; - - HLSLStatement* other = NULL; - HLSLStatement* lastOther = NULL; - - -#define AppendToList(statement, list, listLast) \ - if (list == NULL) list = statement; \ + HLSLStatement *structs = NULL; + HLSLStatement *lastStruct = NULL; + + HLSLStatement *constDeclarations = NULL; + HLSLStatement *lastConstDeclaration = NULL; + + HLSLStatement *declarations = NULL; + HLSLStatement *lastDeclaration = NULL; + + HLSLStatement *functions = NULL; + HLSLStatement *lastFunction = NULL; + + HLSLStatement *other = NULL; + HLSLStatement *lastOther = NULL; + +#define AppendToList(statement, list, listLast) \ + if (list == NULL) list = statement; \ if (listLast != NULL) listLast->nextStatement = statement; \ listLast = statement; - - HLSLStatement* statement = root->statement; + + HLSLStatement *statement = root->statement; while (statement != NULL) { - HLSLStatement* nextStatement = statement->nextStatement; + HLSLStatement *nextStatement = statement->nextStatement; statement->nextStatement = NULL; if (statement->nodeType == HLSLNodeType_Struct) { AppendToList(statement, structs, lastStruct); } else if (statement->nodeType == HLSLNodeType_Declaration || - statement->nodeType == HLSLNodeType_Buffer) - { + statement->nodeType == HLSLNodeType_Buffer) { // There are cases where a struct uses a const array size, // so those need to be ordered prior to the struct. - if (statement->nodeType == HLSLNodeType_Declaration) - { - HLSLDeclaration* decl = (HLSLDeclaration *)statement; - - if (decl->type.flags & HLSLTypeFlag_Const) - { + if (statement->nodeType == HLSLNodeType_Declaration) { + HLSLDeclaration *decl = (HLSLDeclaration *)statement; + + if (decl->type.flags & HLSLTypeFlag_Const) { // this is a global scalar, so best to order first - if (IsScalarType(decl->type.baseType)) - { + if (IsScalarType(decl->type.baseType)) { AppendToList(statement, constScalarDeclarations, lastConstScalarDeclaration); } - else - { + else { AppendToList(statement, constDeclarations, lastConstDeclaration); } } - else - { + else { AppendToList(statement, declarations, lastDeclaration); } } - else if (statement->nodeType == HLSLNodeType_Buffer) - { + else if (statement->nodeType == HLSLNodeType_Buffer) { AppendToList(statement, declarations, lastDeclaration); } } - else if (statement->nodeType == HLSLNodeType_Function) - { + else if (statement->nodeType == HLSLNodeType_Function) { AppendToList(statement, functions, lastFunction); } - else - { + else { AppendToList(statement, other, lastOther); } @@ -1318,48 +1218,54 @@ void SortTree(HLSLTree * tree) } // Chain all the statements in the order that we want. 
- HLSLStatement * firstStatement = constScalarDeclarations; - HLSLStatement * lastStatement = lastConstScalarDeclaration; + HLSLStatement *firstStatement = constScalarDeclarations; + HLSLStatement *lastStatement = lastConstScalarDeclaration; if (structs != NULL) { - if (firstStatement == NULL) firstStatement = structs; - else lastStatement->nextStatement = structs; + if (firstStatement == NULL) + firstStatement = structs; + else + lastStatement->nextStatement = structs; lastStatement = lastStruct; } - + if (constDeclarations != NULL) { - if (firstStatement == NULL) firstStatement = constDeclarations; - else lastStatement->nextStatement = constDeclarations; + if (firstStatement == NULL) + firstStatement = constDeclarations; + else + lastStatement->nextStatement = constDeclarations; lastStatement = lastConstDeclaration; } if (declarations != NULL) { - if (firstStatement == NULL) firstStatement = declarations; - else lastStatement->nextStatement = declarations; + if (firstStatement == NULL) + firstStatement = declarations; + else + lastStatement->nextStatement = declarations; lastStatement = lastDeclaration; } if (functions != NULL) { - if (firstStatement == NULL) firstStatement = functions; - else lastStatement->nextStatement = functions; + if (firstStatement == NULL) + firstStatement = functions; + else + lastStatement->nextStatement = functions; lastStatement = lastFunction; } if (other != NULL) { - if (firstStatement == NULL) firstStatement = other; - else lastStatement->nextStatement = other; + if (firstStatement == NULL) + firstStatement = other; + else + lastStatement->nextStatement = other; lastStatement = lastOther; } root->statement = firstStatement; } - - - - // First and last can be the same. -void AddStatements(HLSLRoot * root, HLSLStatement * before, HLSLStatement * first, HLSLStatement * last) +void AddStatements(HLSLRoot *root, HLSLStatement *before, HLSLStatement *first, HLSLStatement *last) { if (before == NULL) { last->nextStatement = root->statement; @@ -1371,12 +1277,11 @@ void AddStatements(HLSLRoot * root, HLSLStatement * before, HLSLStatement * firs } } -void AddSingleStatement(HLSLRoot * root, HLSLStatement * before, HLSLStatement * statement) +void AddSingleStatement(HLSLRoot *root, HLSLStatement *before, HLSLStatement *statement) { AddStatements(root, before, statement, statement); } - /* *X file releated // @@ This is very game-specific. Should be moved to pipeline_parser or somewhere else. void GroupParameters(HLSLTree * tree) @@ -1400,7 +1305,7 @@ void GroupParameters(HLSLTree * tree) HLSLDeclaration * lastPerPassSampler = NULL; HLSLStatement * statementBeforeBuffers = NULL; - + HLSLStatement* previousStatement = NULL; HLSLStatement* statement = root->statement; while (statement != NULL) @@ -1408,7 +1313,7 @@ void GroupParameters(HLSLTree * tree) HLSLStatement* nextStatement = statement->nextStatement; if (statement->nodeType == HLSLNodeType_Struct) // Do not remove this, or it will mess the else clause below. - { + { statementBeforeBuffers = statement; } else if (statement->nodeType == HLSLNodeType_Declaration) @@ -1529,7 +1434,7 @@ void GroupParameters(HLSLTree * tree) perItemBuffer->name = tree->AddString("per_item"); perItemBuffer->registerName = tree->AddString("b0"); perItemBuffer->field = firstPerItemDeclaration; - + // Set declaration buffer pointers. 
HLSLDeclaration * field = perItemBuffer->field; while (field != NULL) @@ -1558,64 +1463,58 @@ void GroupParameters(HLSLTree * tree) field->buffer = perPassBuffer; field = (HLSLDeclaration *)field->nextStatement; } - + // Add buffer to statements. AddSingleStatement(root, statementBeforeBuffers, perPassBuffer); } } */ -class FindArgumentVisitor : public HLSLTreeVisitor -{ +class FindArgumentVisitor : public HLSLTreeVisitor { public: bool found; - const char * name; + const char *name; virtual ~FindArgumentVisitor() {} - - FindArgumentVisitor() - { - found = false; - name = NULL; - } - - bool FindArgument(const char * _name, HLSLFunction * function) + + FindArgumentVisitor() + { + found = false; + name = NULL; + } + + bool FindArgument(const char *_name, HLSLFunction *function) { found = false; name = _name; VisitStatements(function->statement); return found; } - - virtual void VisitStatements(HLSLStatement * statement) override + + virtual void VisitStatements(HLSLStatement *statement) override { - while (statement != NULL && !found) - { + while (statement != NULL && !found) { VisitStatement(statement); statement = statement->nextStatement; } } - virtual void VisitIdentifierExpression(HLSLIdentifierExpression * node) override + virtual void VisitIdentifierExpression(HLSLIdentifierExpression *node) override { - if (node->name == name) - { + if (node->name == name) { found = true; } } }; - -void HideUnusedArguments(HLSLFunction * function) +void HideUnusedArguments(HLSLFunction *function) { FindArgumentVisitor visitor; - + // For each argument. - HLSLArgument * arg = function->argument; - while (arg != NULL) - { - if (!visitor.FindArgument(arg->name, function)) - { + HLSLArgument *arg = function->argument; + while (arg != NULL) { + if (!visitor.FindArgument(arg->name, function)) { arg->hidden = true; } @@ -1623,30 +1522,31 @@ void HideUnusedArguments(HLSLFunction * function) } } -bool NeedsFlattening(HLSLExpression * expr, int level = 0) { +bool NeedsFlattening(HLSLExpression *expr, int level = 0) +{ if (expr == NULL) { return false; } if (expr->nodeType == HLSLNodeType_UnaryExpression) { - HLSLUnaryExpression * unaryExpr = (HLSLUnaryExpression *)expr; - return NeedsFlattening(unaryExpr->expression, level+1) || NeedsFlattening(expr->nextExpression, level); + HLSLUnaryExpression *unaryExpr = (HLSLUnaryExpression *)expr; + return NeedsFlattening(unaryExpr->expression, level + 1) || NeedsFlattening(expr->nextExpression, level); } else if (expr->nodeType == HLSLNodeType_BinaryExpression) { - HLSLBinaryExpression * binaryExpr = (HLSLBinaryExpression *)expr; + HLSLBinaryExpression *binaryExpr = (HLSLBinaryExpression *)expr; if (IsAssignOp(binaryExpr->binaryOp)) { - return NeedsFlattening(binaryExpr->expression2, level+1) || NeedsFlattening(expr->nextExpression, level); + return NeedsFlattening(binaryExpr->expression2, level + 1) || NeedsFlattening(expr->nextExpression, level); } else { - return NeedsFlattening(binaryExpr->expression1, level+1) || NeedsFlattening(binaryExpr->expression2, level+1) || NeedsFlattening(expr->nextExpression, level); + return NeedsFlattening(binaryExpr->expression1, level + 1) || NeedsFlattening(binaryExpr->expression2, level + 1) || NeedsFlattening(expr->nextExpression, level); } } else if (expr->nodeType == HLSLNodeType_ConditionalExpression) { - HLSLConditionalExpression * conditionalExpr = (HLSLConditionalExpression *)expr; - return NeedsFlattening(conditionalExpr->condition, level+1) || NeedsFlattening(conditionalExpr->trueExpression, level+1) || 
NeedsFlattening(conditionalExpr->falseExpression, level+1) || NeedsFlattening(expr->nextExpression, level); + HLSLConditionalExpression *conditionalExpr = (HLSLConditionalExpression *)expr; + return NeedsFlattening(conditionalExpr->condition, level + 1) || NeedsFlattening(conditionalExpr->trueExpression, level + 1) || NeedsFlattening(conditionalExpr->falseExpression, level + 1) || NeedsFlattening(expr->nextExpression, level); } else if (expr->nodeType == HLSLNodeType_CastingExpression) { - HLSLCastingExpression * castingExpr = (HLSLCastingExpression *)expr; - return NeedsFlattening(castingExpr->expression, level+1) || NeedsFlattening(expr->nextExpression, level); + HLSLCastingExpression *castingExpr = (HLSLCastingExpression *)expr; + return NeedsFlattening(castingExpr->expression, level + 1) || NeedsFlattening(expr->nextExpression, level); } else if (expr->nodeType == HLSLNodeType_LiteralExpression) { return NeedsFlattening(expr->nextExpression, level); @@ -1655,24 +1555,24 @@ bool NeedsFlattening(HLSLExpression * expr, int level = 0) { return NeedsFlattening(expr->nextExpression, level); } else if (expr->nodeType == HLSLNodeType_ConstructorExpression) { - HLSLConstructorExpression * constructorExpr = (HLSLConstructorExpression *)expr; - return NeedsFlattening(constructorExpr->argument, level+1) || NeedsFlattening(expr->nextExpression, level); + HLSLConstructorExpression *constructorExpr = (HLSLConstructorExpression *)expr; + return NeedsFlattening(constructorExpr->argument, level + 1) || NeedsFlattening(expr->nextExpression, level); } else if (expr->nodeType == HLSLNodeType_MemberAccess) { - return NeedsFlattening(expr->nextExpression, level+1); + return NeedsFlattening(expr->nextExpression, level + 1); } else if (expr->nodeType == HLSLNodeType_ArrayAccess) { - HLSLArrayAccess * arrayAccess = (HLSLArrayAccess *)expr; - return NeedsFlattening(arrayAccess->array, level+1) || NeedsFlattening(arrayAccess->index, level+1) || NeedsFlattening(expr->nextExpression, level); + HLSLArrayAccess *arrayAccess = (HLSLArrayAccess *)expr; + return NeedsFlattening(arrayAccess->array, level + 1) || NeedsFlattening(arrayAccess->index, level + 1) || NeedsFlattening(expr->nextExpression, level); } else if (expr->nodeType == HLSLNodeType_FunctionCall) { - HLSLFunctionCall * functionCall = (HLSLFunctionCall *)expr; + HLSLFunctionCall *functionCall = (HLSLFunctionCall *)expr; if (functionCall->function->numOutputArguments > 0) { if (level > 0) { return true; } } - return NeedsFlattening(functionCall->argument, level+1) || NeedsFlattening(expr->nextExpression, level); + return NeedsFlattening(functionCall->argument, level + 1) || NeedsFlattening(expr->nextExpression, level); } else { //assert(false); @@ -1680,11 +1580,11 @@ bool NeedsFlattening(HLSLExpression * expr, int level = 0) { } } - struct StatementList { - HLSLStatement * head = NULL; - HLSLStatement * tail = NULL; - void append(HLSLStatement * st) { + HLSLStatement *head = NULL; + HLSLStatement *tail = NULL; + void append(HLSLStatement *st) + { if (head == NULL) { tail = head = st; } @@ -1693,292 +1593,285 @@ struct StatementList { } }; +class ExpressionFlattener : public HLSLTreeVisitor { +public: + HLSLTree *m_tree; + int tmp_index; + HLSLStatement **statement_pointer; + HLSLFunction *current_function; - class ExpressionFlattener : public HLSLTreeVisitor + ExpressionFlattener() { - public: - HLSLTree * m_tree; - int tmp_index; - HLSLStatement ** statement_pointer; - HLSLFunction * current_function; - - ExpressionFlattener() - { - m_tree = NULL; - 
tmp_index = 0; - statement_pointer = NULL; - current_function = NULL; - } - virtual ~ExpressionFlattener() {} - - void FlattenExpressions(HLSLTree * tree) - { - m_tree = tree; - VisitRoot(tree->GetRoot()); + m_tree = NULL; + tmp_index = 0; + statement_pointer = NULL; + current_function = NULL; + } + virtual ~ExpressionFlattener() {} + + void FlattenExpressions(HLSLTree *tree) + { + m_tree = tree; + VisitRoot(tree->GetRoot()); + } + + // Visit all statements updating the statement_pointer so that we can insert and replace statements. @@ Add this to the default visitor? + virtual void VisitFunction(HLSLFunction *node) override + { + current_function = node; + statement_pointer = &node->statement; + VisitStatements(node->statement); + statement_pointer = NULL; + current_function = NULL; + } + + virtual void VisitComment(HLSLComment *node) override + { + // TODO: do nothing? + } + + virtual void VisitIfStatement(HLSLIfStatement *node) override + { + if (NeedsFlattening(node->condition, 1)) { + assert(false); // @@ Add statements before if statement. } - // Visit all statements updating the statement_pointer so that we can insert and replace statements. @@ Add this to the default visitor? - virtual void VisitFunction(HLSLFunction * node) override - { - current_function = node; - statement_pointer = &node->statement; - VisitStatements(node->statement); - statement_pointer = NULL; - current_function = NULL; + statement_pointer = &node->statement; + VisitStatements(node->statement); + if (node->elseStatement) { + statement_pointer = &node->elseStatement; + VisitStatements(node->elseStatement); } + } - virtual void VisitComment(HLSLComment * node) override - { - // TODO: do nothing? + virtual void VisitForStatement(HLSLForStatement *node) override + { + if (NeedsFlattening(node->initialization->assignment, 1)) { + assert(false); // @@ Add statements before for statement. } - - virtual void VisitIfStatement(HLSLIfStatement * node) override - { - if (NeedsFlattening(node->condition, 1)) { - assert(false); // @@ Add statements before if statement. - } - - statement_pointer = &node->statement; - VisitStatements(node->statement); - if (node->elseStatement) { - statement_pointer = &node->elseStatement; - VisitStatements(node->elseStatement); - } + if (NeedsFlattening(node->condition, 1) || NeedsFlattening(node->increment, 1)) { + assert(false); // @@ These are tricky to implement. Need to handle all loop exits. } - - virtual void VisitForStatement(HLSLForStatement * node) override - { - if (NeedsFlattening(node->initialization->assignment, 1)) { - assert(false); // @@ Add statements before for statement. - } - if (NeedsFlattening(node->condition, 1) || NeedsFlattening(node->increment, 1)) { - assert(false); // @@ These are tricky to implement. Need to handle all loop exits. 
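(Aside, not part of the diff: the statement_pointer member above always addresses the link that owns the statement currently being visited, which is what lets VisitExpressionStatement / VisitDeclaration / VisitReturnStatement below splice freshly generated temporaries in ahead of that statement. A minimal sketch of the splice, reusing the same names those methods use:)

    // Sketch only: splicing a generated StatementList in place of the statement
    // that *statement_pointer currently links to (same pattern as the methods below).
    *statement_pointer = statements.head;                  // predecessor now links to the generated head
    statements.tail->nextStatement = node->nextStatement;  // generated tail links to the old successor
    statement_pointer = &statements.tail->nextStatement;   // traversal resumes after the spliced range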
- } - statement_pointer = &node->statement; - VisitStatements(node->statement); + statement_pointer = &node->statement; + VisitStatements(node->statement); + } + + virtual void VisitBlockStatement(HLSLBlockStatement *node) override + { + statement_pointer = &node->statement; + VisitStatements(node->statement); + } + + virtual void VisitStatements(HLSLStatement *statement) override + { + while (statement != NULL) { + VisitStatement(statement); + statement_pointer = &statement->nextStatement; + statement = statement->nextStatement; } - - virtual void VisitBlockStatement(HLSLBlockStatement * node) override - { - statement_pointer = &node->statement; - VisitStatements(node->statement); + } + + // This is usually a function call or assignment. + virtual void VisitExpressionStatement(HLSLExpressionStatement *node) override + { + if (NeedsFlattening(node->expression, 0)) { + StatementList statements; + Flatten(node->expression, statements, false); + + // Link beginning of statement list. + *statement_pointer = statements.head; + + // Link end of statement list. + HLSLStatement *tail = statements.tail; + tail->nextStatement = node->nextStatement; + + // Update statement pointer. + statement_pointer = &tail->nextStatement; + + // @@ Delete node? } - - virtual void VisitStatements(HLSLStatement * statement) override - { - while (statement != NULL) { - VisitStatement(statement); - statement_pointer = &statement->nextStatement; - statement = statement->nextStatement; - } + } + + virtual void VisitDeclaration(HLSLDeclaration *node) override + { + // Skip global declarations. + if (statement_pointer == NULL) return; + + if (NeedsFlattening(node->assignment, 1)) { + StatementList statements; + HLSLIdentifierExpression *ident = Flatten(node->assignment, statements, true); + + // @@ Delete node->assignment? + + node->assignment = ident; + statements.append(node); + + // Link beginning of statement list. + *statement_pointer = statements.head; + + // Link end of statement list. + HLSLStatement *tail = statements.tail; + tail->nextStatement = node->nextStatement; + + // Update statement pointer. + statement_pointer = &tail->nextStatement; } + } - // This is usually a function call or assignment. - virtual void VisitExpressionStatement(HLSLExpressionStatement * node) override - { - if (NeedsFlattening(node->expression, 0)) - { - StatementList statements; - Flatten(node->expression, statements, false); - - // Link beginning of statement list. - *statement_pointer = statements.head; - - // Link end of statement list. - HLSLStatement * tail = statements.tail; - tail->nextStatement = node->nextStatement; - - // Update statement pointer. - statement_pointer = &tail->nextStatement; - - // @@ Delete node? - } + virtual void VisitReturnStatement(HLSLReturnStatement *node) override + { + if (NeedsFlattening(node->expression, 1)) { + StatementList statements; + HLSLIdentifierExpression *ident = Flatten(node->expression, statements, true); + + // @@ Delete node->expression? + + node->expression = ident; + statements.append(node); + + // Link beginning of statement list. + *statement_pointer = statements.head; + + // Link end of statement list. + HLSLStatement *tail = statements.tail; + tail->nextStatement = node->nextStatement; + + // Update statement pointer. + statement_pointer = &tail->nextStatement; } + } - virtual void VisitDeclaration(HLSLDeclaration * node) override - { - // Skip global declarations. 
- if (statement_pointer == NULL) return; - - if (NeedsFlattening(node->assignment, 1)) - { - StatementList statements; - HLSLIdentifierExpression * ident = Flatten(node->assignment, statements, true); - - // @@ Delete node->assignment? - - node->assignment = ident; - statements.append(node); - - // Link beginning of statement list. - *statement_pointer = statements.head; - - // Link end of statement list. - HLSLStatement * tail = statements.tail; - tail->nextStatement = node->nextStatement; - - // Update statement pointer. - statement_pointer = &tail->nextStatement; - } + HLSLDeclaration *BuildTemporaryDeclaration(HLSLExpression *expr) + { + assert(expr->expressionType.baseType != HLSLBaseType_Void); + + HLSLDeclaration *declaration = m_tree->AddNode(expr->fileName, expr->line); + declaration->name = m_tree->AddStringFormat("tmp%d", tmp_index++); + declaration->type = expr->expressionType; + declaration->assignment = expr; + + //HLSLIdentifierExpression * ident = (HLSLIdentifierExpression *)expr; + + return declaration; + } + + HLSLExpressionStatement *BuildExpressionStatement(HLSLExpression *expr) + { + HLSLExpressionStatement *statement = m_tree->AddNode(expr->fileName, expr->line); + statement->expression = expr; + return statement; + } + + HLSLIdentifierExpression *AddExpressionStatement(HLSLExpression *expr, StatementList &statements, bool wantIdent) + { + if (wantIdent) { + HLSLDeclaration *declaration = BuildTemporaryDeclaration(expr); + statements.append(declaration); + + HLSLIdentifierExpression *ident = m_tree->AddNode(expr->fileName, expr->line); + ident->name = declaration->name; + ident->expressionType = declaration->type; + return ident; + } + else { + HLSLExpressionStatement *statement = BuildExpressionStatement(expr); + statements.append(statement); + return NULL; } + } - virtual void VisitReturnStatement(HLSLReturnStatement * node) override - { - if (NeedsFlattening(node->expression, 1)) - { - StatementList statements; - HLSLIdentifierExpression * ident = Flatten(node->expression, statements, true); - - // @@ Delete node->expression? - - node->expression = ident; - statements.append(node); - - // Link beginning of statement list. - *statement_pointer = statements.head; - - // Link end of statement list. - HLSLStatement * tail = statements.tail; - tail->nextStatement = node->nextStatement; - - // Update statement pointer. 
- statement_pointer = &tail->nextStatement; - } + HLSLIdentifierExpression *Flatten(HLSLExpression *expr, StatementList &statements, bool wantIdent = true) + { + if (!NeedsFlattening(expr, wantIdent)) { + return AddExpressionStatement(expr, statements, wantIdent); } - - HLSLDeclaration * BuildTemporaryDeclaration(HLSLExpression * expr) - { - assert(expr->expressionType.baseType != HLSLBaseType_Void); - - HLSLDeclaration * declaration = m_tree->AddNode(expr->fileName, expr->line); - declaration->name = m_tree->AddStringFormat("tmp%d", tmp_index++); - declaration->type = expr->expressionType; - declaration->assignment = expr; - - //HLSLIdentifierExpression * ident = (HLSLIdentifierExpression *)expr; - - return declaration; - } - - HLSLExpressionStatement * BuildExpressionStatement(HLSLExpression * expr) - { - HLSLExpressionStatement * statement = m_tree->AddNode(expr->fileName, expr->line); - statement->expression = expr; - return statement; + if (expr->nodeType == HLSLNodeType_UnaryExpression) { + assert(expr->nextExpression == NULL); + + HLSLUnaryExpression *unaryExpr = (HLSLUnaryExpression *)expr; + + HLSLIdentifierExpression *tmp = Flatten(unaryExpr->expression, statements, true); + + HLSLUnaryExpression *newUnaryExpr = m_tree->AddNode(unaryExpr->fileName, unaryExpr->line); + newUnaryExpr->unaryOp = unaryExpr->unaryOp; + newUnaryExpr->expression = tmp; + newUnaryExpr->expressionType = unaryExpr->expressionType; + + return AddExpressionStatement(newUnaryExpr, statements, wantIdent); } + else if (expr->nodeType == HLSLNodeType_BinaryExpression) { + assert(expr->nextExpression == NULL); - HLSLIdentifierExpression * AddExpressionStatement(HLSLExpression * expr, StatementList & statements, bool wantIdent) - { - if (wantIdent) { - HLSLDeclaration * declaration = BuildTemporaryDeclaration(expr); - statements.append(declaration); - - HLSLIdentifierExpression * ident = m_tree->AddNode(expr->fileName, expr->line); - ident->name = declaration->name; - ident->expressionType = declaration->type; - return ident; + HLSLBinaryExpression *binaryExpr = (HLSLBinaryExpression *)expr; + + if (IsAssignOp(binaryExpr->binaryOp)) { + // Flatten right hand side only. 
+ HLSLIdentifierExpression *tmp2 = Flatten(binaryExpr->expression2, statements, true); + + HLSLBinaryExpression *newBinaryExpr = m_tree->AddNode(binaryExpr->fileName, binaryExpr->line); + newBinaryExpr->binaryOp = binaryExpr->binaryOp; + newBinaryExpr->expression1 = binaryExpr->expression1; + newBinaryExpr->expression2 = tmp2; + newBinaryExpr->expressionType = binaryExpr->expressionType; + + return AddExpressionStatement(newBinaryExpr, statements, wantIdent); } else { - HLSLExpressionStatement * statement = BuildExpressionStatement(expr); - statements.append(statement); - return NULL; + HLSLIdentifierExpression *tmp1 = Flatten(binaryExpr->expression1, statements, true); + HLSLIdentifierExpression *tmp2 = Flatten(binaryExpr->expression2, statements, true); + + HLSLBinaryExpression *newBinaryExpr = m_tree->AddNode(binaryExpr->fileName, binaryExpr->line); + newBinaryExpr->binaryOp = binaryExpr->binaryOp; + newBinaryExpr->expression1 = tmp1; + newBinaryExpr->expression2 = tmp2; + newBinaryExpr->expressionType = binaryExpr->expressionType; + + return AddExpressionStatement(newBinaryExpr, statements, wantIdent); } } - - HLSLIdentifierExpression * Flatten(HLSLExpression * expr, StatementList & statements, bool wantIdent = true) - { - if (!NeedsFlattening(expr, wantIdent)) { - return AddExpressionStatement(expr, statements, wantIdent); - } - - if (expr->nodeType == HLSLNodeType_UnaryExpression) { - assert(expr->nextExpression == NULL); - - HLSLUnaryExpression * unaryExpr = (HLSLUnaryExpression *)expr; - - HLSLIdentifierExpression * tmp = Flatten(unaryExpr->expression, statements, true); - - HLSLUnaryExpression * newUnaryExpr = m_tree->AddNode(unaryExpr->fileName, unaryExpr->line); - newUnaryExpr->unaryOp = unaryExpr->unaryOp; - newUnaryExpr->expression = tmp; - newUnaryExpr->expressionType = unaryExpr->expressionType; - - return AddExpressionStatement(newUnaryExpr, statements, wantIdent); - } - else if (expr->nodeType == HLSLNodeType_BinaryExpression) { - assert(expr->nextExpression == NULL); - - HLSLBinaryExpression * binaryExpr = (HLSLBinaryExpression *)expr; - - if (IsAssignOp(binaryExpr->binaryOp)) { - // Flatten right hand side only. 
- HLSLIdentifierExpression * tmp2 = Flatten(binaryExpr->expression2, statements, true); - - HLSLBinaryExpression * newBinaryExpr = m_tree->AddNode(binaryExpr->fileName, binaryExpr->line); - newBinaryExpr->binaryOp = binaryExpr->binaryOp; - newBinaryExpr->expression1 = binaryExpr->expression1; - newBinaryExpr->expression2 = tmp2; - newBinaryExpr->expressionType = binaryExpr->expressionType; - - return AddExpressionStatement(newBinaryExpr, statements, wantIdent); - } - else { - HLSLIdentifierExpression * tmp1 = Flatten(binaryExpr->expression1, statements, true); - HLSLIdentifierExpression * tmp2 = Flatten(binaryExpr->expression2, statements, true); - - HLSLBinaryExpression * newBinaryExpr = m_tree->AddNode(binaryExpr->fileName, binaryExpr->line); - newBinaryExpr->binaryOp = binaryExpr->binaryOp; - newBinaryExpr->expression1 = tmp1; - newBinaryExpr->expression2 = tmp2; - newBinaryExpr->expressionType = binaryExpr->expressionType; - - return AddExpressionStatement(newBinaryExpr, statements, wantIdent); - } - } - else if (expr->nodeType == HLSLNodeType_ConditionalExpression) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_CastingExpression) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_LiteralExpression) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_IdentifierExpression) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_ConstructorExpression) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_MemberAccess) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_ArrayAccess) { - assert(false); - } - else if (expr->nodeType == HLSLNodeType_FunctionCall) { - HLSLFunctionCall * functionCall = (HLSLFunctionCall *)expr; - - // @@ Output function as is? - // @@ We have to flatten function arguments! This is tricky, need to handle input/output arguments. - assert(!NeedsFlattening(functionCall->argument)); - - return AddExpressionStatement(expr, statements, wantIdent); - } - else { - assert(false); - } - return NULL; + else if (expr->nodeType == HLSLNodeType_ConditionalExpression) { + assert(false); } - }; + else if (expr->nodeType == HLSLNodeType_CastingExpression) { + assert(false); + } + else if (expr->nodeType == HLSLNodeType_LiteralExpression) { + assert(false); + } + else if (expr->nodeType == HLSLNodeType_IdentifierExpression) { + assert(false); + } + else if (expr->nodeType == HLSLNodeType_ConstructorExpression) { + assert(false); + } + else if (expr->nodeType == HLSLNodeType_MemberAccess) { + assert(false); + } + else if (expr->nodeType == HLSLNodeType_ArrayAccess) { + assert(false); + } + else if (expr->nodeType == HLSLNodeType_FunctionCall) { + HLSLFunctionCall *functionCall = (HLSLFunctionCall *)expr; + + // @@ Output function as is? + // @@ We have to flatten function arguments! This is tricky, need to handle input/output arguments. 
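(For orientation, not part of the diff: NeedsFlattening and Flatten together hoist nested calls that have output arguments into tmpN declarations emitted ahead of the original statement. These tree transforms are normally chained the way MSLGenerator::Prepass does further down; a rough usage sketch against the declarations in HLSLTree.h, assuming `tree` is an already-parsed M4::HLSLTree* and "MainPS" is a hypothetical entry-point name:)

    // Sketch only: typical post-parse transform order, per the externs in HLSLTree.h
    // and the call sequence in MSLGenerator::Prepass.
    M4::PruneTree(tree, "MainPS");                          // hide statements the entry point never reaches
    M4::SortTree(tree);                                     // order: const scalars, structs, consts, declarations, functions
    M4::HideUnusedArguments(tree->FindFunction("MainPS"));  // mark arguments the body never references as hidden
    M4::FlattenExpressions(tree);                           // hoist nested out-argument calls into tmpN locals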
+ assert(!NeedsFlattening(functionCall->argument)); + + return AddExpressionStatement(expr, statements, wantIdent); + } + else { + assert(false); + } + return NULL; + } +}; - -void FlattenExpressions(HLSLTree* tree) { +void FlattenExpressions(HLSLTree *tree) +{ ExpressionFlattener flattener; flattener.FlattenExpressions(tree); } -} // M4 - +} //namespace M4 diff --git a/hlslparser/src/HLSLTree.h b/hlslparser/src/HLSLTree.h index 3ad0bd63..39956bac 100644 --- a/hlslparser/src/HLSLTree.h +++ b/hlslparser/src/HLSLTree.h @@ -1,41 +1,38 @@ #pragma once -#include "Engine.h" - #include -namespace M4 -{ +#include "Engine.h" -enum HLSLTarget -{ +namespace M4 { + +enum HLSLTarget { HLSLTarget_VertexShader, HLSLTarget_PixelShader, - + HLSLTarget_ComputeShader, - + // none of these are portable to Metal/Android, they have own triangulation // HLSLTarget_GeometryShader, // HLSLTarget_HullShader, // HLSLTarget_ControlShader, - + // This is compute prior to frag (combined vertex + geo state) // HLSLTarget_MeshShader, }; -enum HLSLNodeType -{ +enum HLSLNodeType { HLSLNodeType_Root, - + HLSLNodeType_Declaration, HLSLNodeType_Struct, HLSLNodeType_StructField, HLSLNodeType_Buffer, HLSLNodeType_BufferField, // TODO: or just ref structField - + HLSLNodeType_Function, HLSLNodeType_Argument, - + HLSLNodeType_ExpressionStatement, HLSLNodeType_Expression, HLSLNodeType_ReturnStatement, @@ -56,7 +53,7 @@ enum HLSLNodeType HLSLNodeType_ArrayAccess, HLSLNodeType_FunctionCall, HLSLNodeType_MemberFunctionCall, - + /* FX file stuff HLSLNodeType_StateAssignment, HLSLNodeType_SamplerState, @@ -65,33 +62,32 @@ enum HLSLNodeType HLSLNodeType_Pipeline, HLSLNodeType_Stage, */ - + HLSLNodeType_Attribute, HLSLNodeType_Comment }; -enum HLSLBaseType -{ +enum HLSLBaseType { HLSLBaseType_Unknown, HLSLBaseType_Void, - + // float HLSLBaseType_Float, HLSLBaseType_Float2, HLSLBaseType_Float3, HLSLBaseType_Float4, - HLSLBaseType_Float2x2, + HLSLBaseType_Float2x2, HLSLBaseType_Float3x3, HLSLBaseType_Float4x4, - + HLSLBaseType_Half, HLSLBaseType_Half2, HLSLBaseType_Half3, HLSLBaseType_Half4, - HLSLBaseType_Half2x2, + HLSLBaseType_Half2x2, HLSLBaseType_Half3x3, HLSLBaseType_Half4x4, - + HLSLBaseType_Double, HLSLBaseType_Double2, HLSLBaseType_Double3, @@ -99,43 +95,43 @@ enum HLSLBaseType HLSLBaseType_Double2x2, HLSLBaseType_Double3x3, HLSLBaseType_Double4x4, - + // integer HLSLBaseType_Bool, HLSLBaseType_Bool2, - HLSLBaseType_Bool3, - HLSLBaseType_Bool4, - + HLSLBaseType_Bool3, + HLSLBaseType_Bool4, + HLSLBaseType_Int, HLSLBaseType_Int2, HLSLBaseType_Int3, HLSLBaseType_Int4, - + HLSLBaseType_Uint, HLSLBaseType_Uint2, HLSLBaseType_Uint3, HLSLBaseType_Uint4, - + HLSLBaseType_Short, HLSLBaseType_Short2, HLSLBaseType_Short3, HLSLBaseType_Short4, - + HLSLBaseType_Ushort, HLSLBaseType_Ushort2, HLSLBaseType_Ushort3, HLSLBaseType_Ushort4, - + HLSLBaseType_Long, HLSLBaseType_Long2, HLSLBaseType_Long3, HLSLBaseType_Long4, - + HLSLBaseType_Ulong, HLSLBaseType_Ulong2, HLSLBaseType_Ulong3, HLSLBaseType_Ulong4, - + // Seems like these should be subtype of HLSLTexture, but // many of the intrinsics require a specific type of texture. // MSL has many more types, included depth vs. regular textures. @@ -145,45 +141,44 @@ enum HLSLBaseType HLSLBaseType_Texture2DArray, HLSLBaseType_TextureCubeArray, HLSLBaseType_Texture2DMS, - + HLSLBaseType_Depth2D, HLSLBaseType_Depth2DArray, HLSLBaseType_DepthCube, // TODO: add more depth types as needed (pair with SamplerComparisonState) - + HLSLBaseType_RWTexture2D, - + // Only 2 sampler types. 
- type is for defining state inside them HLSLBaseType_SamplerState, HLSLBaseType_SamplerComparisonState, - - HLSLBaseType_UserDefined, // struct - HLSLBaseType_Expression, // type argument for defined() sizeof() and typeof(). + + HLSLBaseType_UserDefined, // struct + HLSLBaseType_Expression, // type argument for defined() sizeof() and typeof(). //HLSLBaseType_Auto, // this wasn't hooked up - HLSLBaseType_Comment, // single line comments optionally transferred to output - + HLSLBaseType_Comment, // single line comments optionally transferred to output + // Buffer subtypes below HLSLBaseType_Buffer, - + HLSLBaseType_Count, - + // counts //HLSLBaseType_FirstNumeric = HLSLBaseType_Float, //HLSLBaseType_LastNumeric = HLSLBaseType_Ulong4, - + //HLSLBaseType_FirstInteger = HLSLBaseType_Bool, //HLSLBaseType_LastInteger = HLSLBaseType_LastNumeric, - + HLSLBaseType_NumericCount = HLSLBaseType_Ulong4 - HLSLBaseType_Float + 1 }; - + // This a subtype to HLSLBaseType_Buffer -enum HLSLBufferType -{ +enum HLSLBufferType { // DX9 HLSLBufferType_CBuffer, HLSLBufferType_TBuffer, - + // DX10 templated types HLSLBufferType_ConstantBuffer, // indexable HLSLBufferType_StructuredBuffer, @@ -192,18 +187,17 @@ enum HLSLBufferType HLSLBufferType_RWByteAddressBuffer }; -enum HLSLBinaryOp -{ +enum HLSLBinaryOp { // bit ops HLSLBinaryOp_And, HLSLBinaryOp_Or, - + // math ops HLSLBinaryOp_Add, HLSLBinaryOp_Sub, HLSLBinaryOp_Mul, HLSLBinaryOp_Div, - + // comparison ops HLSLBinaryOp_Less, HLSLBinaryOp_Greater, @@ -211,12 +205,12 @@ enum HLSLBinaryOp HLSLBinaryOp_GreaterEqual, HLSLBinaryOp_Equal, HLSLBinaryOp_NotEqual, - + // bit ops HLSLBinaryOp_BitAnd, HLSLBinaryOp_BitOr, HLSLBinaryOp_BitXor, - + // assign ops HLSLBinaryOp_Assign, HLSLBinaryOp_AddAssign, @@ -225,60 +219,58 @@ enum HLSLBinaryOp HLSLBinaryOp_DivAssign, }; -inline bool IsCompareOp( HLSLBinaryOp op ) +inline bool IsCompareOp(HLSLBinaryOp op) { - return op == HLSLBinaryOp_Less || - op == HLSLBinaryOp_Greater || - op == HLSLBinaryOp_LessEqual || - op == HLSLBinaryOp_GreaterEqual || - op == HLSLBinaryOp_Equal || - op == HLSLBinaryOp_NotEqual; + return op == HLSLBinaryOp_Less || + op == HLSLBinaryOp_Greater || + op == HLSLBinaryOp_LessEqual || + op == HLSLBinaryOp_GreaterEqual || + op == HLSLBinaryOp_Equal || + op == HLSLBinaryOp_NotEqual; } -inline bool IsArithmeticOp( HLSLBinaryOp op ) +inline bool IsArithmeticOp(HLSLBinaryOp op) { return op == HLSLBinaryOp_Add || - op == HLSLBinaryOp_Sub || - op == HLSLBinaryOp_Mul || - op == HLSLBinaryOp_Div; + op == HLSLBinaryOp_Sub || + op == HLSLBinaryOp_Mul || + op == HLSLBinaryOp_Div; } -inline bool IsLogicOp( HLSLBinaryOp op ) +inline bool IsLogicOp(HLSLBinaryOp op) { return op == HLSLBinaryOp_And || - op == HLSLBinaryOp_Or; + op == HLSLBinaryOp_Or; } -inline bool IsAssignOp( HLSLBinaryOp op ) +inline bool IsAssignOp(HLSLBinaryOp op) { return op == HLSLBinaryOp_Assign || - op == HLSLBinaryOp_AddAssign || - op == HLSLBinaryOp_SubAssign || - op == HLSLBinaryOp_MulAssign || - op == HLSLBinaryOp_DivAssign; + op == HLSLBinaryOp_AddAssign || + op == HLSLBinaryOp_SubAssign || + op == HLSLBinaryOp_MulAssign || + op == HLSLBinaryOp_DivAssign; } -inline bool IsBitOp( HLSLBinaryOp op ) +inline bool IsBitOp(HLSLBinaryOp op) { return op == HLSLBinaryOp_BitAnd || - op == HLSLBinaryOp_BitOr || - op == HLSLBinaryOp_BitXor; + op == HLSLBinaryOp_BitOr || + op == HLSLBinaryOp_BitXor; } - -enum HLSLUnaryOp -{ - HLSLUnaryOp_Negative, // -x - HLSLUnaryOp_Positive, // +x - HLSLUnaryOp_Not, // !x - HLSLUnaryOp_PreIncrement, // ++x - 
HLSLUnaryOp_PreDecrement, // --x - HLSLUnaryOp_PostIncrement, // x++ - HLSLUnaryOp_PostDecrement, // x++ - HLSLUnaryOp_BitNot, // ~x + +enum HLSLUnaryOp { + HLSLUnaryOp_Negative, // -x + HLSLUnaryOp_Positive, // +x + HLSLUnaryOp_Not, // !x + HLSLUnaryOp_PreIncrement, // ++x + HLSLUnaryOp_PreDecrement, // --x + HLSLUnaryOp_PostIncrement, // x++ + HLSLUnaryOp_PostDecrement, // x++ + HLSLUnaryOp_BitNot, // ~x }; -enum HLSLArgumentModifier -{ +enum HLSLArgumentModifier { HLSLArgumentModifier_None, HLSLArgumentModifier_In, HLSLArgumentModifier_Out, @@ -287,8 +279,7 @@ enum HLSLArgumentModifier HLSLArgumentModifier_Const, }; -enum HLSLTypeFlags -{ +enum HLSLTypeFlags { HLSLTypeFlag_None = 0, HLSLTypeFlag_Const = 0x01, HLSLTypeFlag_Static = 0x02, @@ -312,23 +303,21 @@ enum HLSLTypeFlags HLSLTypeFlag_NoPromote = 0x200000, }; -enum HLSLAttributeType -{ +enum HLSLAttributeType { HLSLAttributeType_Unknown, - + // TODO: a lot more attributes, these are loop attributes // f.e. specialization constant and numthreads for HLSL HLSLAttributeType_Unroll, HLSLAttributeType_Branch, HLSLAttributeType_Flatten, HLSLAttributeType_NoFastMath, - + }; -enum HLSLAddressSpace -{ +enum HLSLAddressSpace { HLSLAddressSpace_Undefined, - + // These only apply to MSL HLSLAddressSpace_Constant, HLSLAddressSpace_Device, @@ -338,7 +327,6 @@ enum HLSLAddressSpace // TODO: ThreadgroupImageblock }; - struct HLSLNode; struct HLSLRoot; struct HLSLStatement; @@ -359,312 +347,278 @@ struct HLSLFunctionCall; struct HLSLArrayAccess; struct HLSLAttribute; -struct HLSLType -{ +struct HLSLType { explicit HLSLType(HLSLBaseType _baseType = HLSLBaseType_Unknown) - { - baseType = _baseType; + { + baseType = _baseType; } bool TestFlags(int flags_) const { return (flags & flags_) == flags_; } - - HLSLBaseType baseType = HLSLBaseType_Unknown; - HLSLBaseType formatType = HLSLBaseType_Float; // Half or Float (only applies to templated params like buffer/texture) - const char* typeName = NULL; // For user defined types. - bool array = false; - HLSLExpression* arraySize = NULL; // can ref constant like NUM_LIGHTS - int flags = 0; - HLSLAddressSpace addressSpace = HLSLAddressSpace_Undefined; // MSL mostly + + HLSLBaseType baseType = HLSLBaseType_Unknown; + HLSLBaseType formatType = HLSLBaseType_Float; // Half or Float (only applies to templated params like buffer/texture) + const char* typeName = NULL; // For user defined types. + bool array = false; + HLSLExpression* arraySize = NULL; // can ref constant like NUM_LIGHTS + int flags = 0; + HLSLAddressSpace addressSpace = HLSLAddressSpace_Undefined; // MSL mostly }; // Only Statment, Argument, StructField can be marked hidden. // But many elements like Buffer derive from Statement. /// Base class for all nodes in the HLSL AST -struct HLSLNode -{ - HLSLNodeType nodeType; // set to s_type - const char* fileName = NULL; - int line = 0; +struct HLSLNode { + HLSLNodeType nodeType; // set to s_type + const char* fileName = NULL; + int line = 0; }; -struct HLSLRoot : public HLSLNode -{ +struct HLSLRoot : public HLSLNode { static const HLSLNodeType s_type = HLSLNodeType_Root; - HLSLStatement* statement = NULL; // First statement. + HLSLStatement* statement = NULL; // First statement. }; -struct HLSLStatement : public HLSLNode -{ - HLSLStatement* nextStatement = NULL; // Next statement in the block. - HLSLAttribute* attributes = NULL; - +struct HLSLStatement : public HLSLNode { + HLSLStatement* nextStatement = NULL; // Next statement in the block. 
+ HLSLAttribute* attributes = NULL; + // This allows tree pruning. Marked true after traversing use in - mutable bool hidden = false; - + mutable bool hidden = false; + // This is marked as false at start, and multi endpoint traversal marks // when a global is already written, and next write is skipped. - mutable bool written = false; + mutable bool written = false; }; // [unroll] -struct HLSLAttribute : public HLSLNode -{ +struct HLSLAttribute : public HLSLNode { static const HLSLNodeType s_type = HLSLNodeType_Attribute; - HLSLAttributeType attributeType = HLSLAttributeType_Unknown; - HLSLExpression* argument = NULL; - HLSLAttribute* nextAttribute = NULL; + HLSLAttributeType attributeType = HLSLAttributeType_Unknown; + HLSLExpression* argument = NULL; + HLSLAttribute* nextAttribute = NULL; }; -struct HLSLDeclaration : public HLSLStatement -{ +struct HLSLDeclaration : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_Declaration; - const char* name = NULL; - HLSLType type; - const char* registerName = NULL; // @@ Store register index? - const char* semantic = NULL; - HLSLDeclaration* nextDeclaration = NULL; // If multiple variables declared on a line. - HLSLExpression* assignment = NULL; - - HLSLBuffer* buffer = NULL; // reference cbuffer for decl + const char* name = NULL; + HLSLType type; + const char* registerName = NULL; // @@ Store register index? + const char* semantic = NULL; + HLSLDeclaration* nextDeclaration = NULL; // If multiple variables declared on a line. + HLSLExpression* assignment = NULL; + + HLSLBuffer* buffer = NULL; // reference cbuffer for decl }; -struct HLSLStruct : public HLSLStatement -{ +struct HLSLStruct : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_Struct; - const char* name = NULL; - HLSLStructField* field = NULL; // First field in the structure. + const char* name = NULL; + HLSLStructField* field = NULL; // First field in the structure. }; -struct HLSLStructField : public HLSLNode -{ +struct HLSLStructField : public HLSLNode { static const HLSLNodeType s_type = HLSLNodeType_StructField; - const char* name = NULL; - HLSLType type; - const char* semantic = NULL; - const char* sv_semantic = NULL; - HLSLStructField* nextField = NULL; // Next field in the structure. - bool hidden = false; + const char* name = NULL; + HLSLType type; + const char* semantic = NULL; + const char* sv_semantic = NULL; + HLSLStructField* nextField = NULL; // Next field in the structure. + bool hidden = false; }; /// Buffer declaration. -struct HLSLBuffer : public HLSLStatement -{ +struct HLSLBuffer : public HLSLStatement { // These spill a ton of globals throughout shader bool IsGlobalFields() const { - return bufferType == HLSLBufferType_CBuffer || - bufferType == HLSLBufferType_TBuffer; + return bufferType == HLSLBufferType_CBuffer || + bufferType == HLSLBufferType_TBuffer; } - + // DX changes registers for read-only vs. read-write buffers (SRV vs. UAV) // so constant/cbuffer use b, structured/byte use t (like textures), // and read-write use u. 
MSL only has u and bool IsReadOnly() const { - return bufferType == HLSLBufferType_CBuffer || - bufferType == HLSLBufferType_TBuffer || - bufferType == HLSLBufferType_ConstantBuffer || - bufferType == HLSLBufferType_StructuredBuffer || - bufferType == HLSLBufferType_ByteAddressBuffer; + return bufferType == HLSLBufferType_CBuffer || + bufferType == HLSLBufferType_TBuffer || + bufferType == HLSLBufferType_ConstantBuffer || + bufferType == HLSLBufferType_StructuredBuffer || + bufferType == HLSLBufferType_ByteAddressBuffer; } - + static const HLSLNodeType s_type = HLSLNodeType_Buffer; - const char* name = NULL; - const char* registerName = NULL; - HLSLDeclaration* field = NULL; - HLSLBufferType bufferType = HLSLBufferType_CBuffer; - HLSLStruct* bufferStruct = NULL; + const char* name = NULL; + const char* registerName = NULL; + HLSLDeclaration* field = NULL; + HLSLBufferType bufferType = HLSLBufferType_CBuffer; + HLSLStruct* bufferStruct = NULL; }; - /// Function declaration -struct HLSLFunction : public HLSLStatement -{ +struct HLSLFunction : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_Function; - const char* name = NULL; - HLSLType returnType; - HLSLBaseType memberType = HLSLBaseType_Unknown; // for sampler members, must also look at GetScalarType(returnType) - const char* semantic = NULL; - const char* sv_semantic = NULL; - int numArguments = 0; - int numOutputArguments = 0; // Includes out and inout arguments. - HLSLArgument* argument = NULL; - HLSLStatement* statement = NULL; - HLSLFunction* forward = NULL; // Which HLSLFunction this one forward-declares - + const char* name = NULL; + HLSLType returnType; + HLSLBaseType memberType = HLSLBaseType_Unknown; // for sampler members, must also look at GetScalarType(returnType) + const char* semantic = NULL; + const char* sv_semantic = NULL; + int numArguments = 0; + int numOutputArguments = 0; // Includes out and inout arguments. + HLSLArgument* argument = NULL; + HLSLStatement* statement = NULL; + HLSLFunction* forward = NULL; // Which HLSLFunction this one forward-declares + bool IsMemberFunction() const { return memberType != HLSLBaseType_Unknown; } }; /// Declaration of an argument to a function. -struct HLSLArgument : public HLSLNode -{ +struct HLSLArgument : public HLSLNode { static const HLSLNodeType s_type = HLSLNodeType_Argument; - const char* name = NULL; - HLSLArgumentModifier modifier = HLSLArgumentModifier_None; - HLSLType type; - const char* semantic = NULL; - const char* sv_semantic = NULL; - HLSLExpression* defaultValue = NULL; - HLSLArgument* nextArgument = NULL; - bool hidden = false; + const char* name = NULL; + HLSLArgumentModifier modifier = HLSLArgumentModifier_None; + HLSLType type; + const char* semantic = NULL; + const char* sv_semantic = NULL; + HLSLExpression* defaultValue = NULL; + HLSLArgument* nextArgument = NULL; + bool hidden = false; }; /// A expression which forms a complete statement. 
-struct HLSLExpressionStatement : public HLSLStatement -{ +struct HLSLExpressionStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_ExpressionStatement; - HLSLExpression* expression = NULL; + HLSLExpression* expression = NULL; }; -struct HLSLReturnStatement : public HLSLStatement -{ +struct HLSLReturnStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_ReturnStatement; - HLSLExpression* expression = NULL; + HLSLExpression* expression = NULL; }; -struct HLSLDiscardStatement : public HLSLStatement -{ +struct HLSLDiscardStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_DiscardStatement; }; -struct HLSLBreakStatement : public HLSLStatement -{ +struct HLSLBreakStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_BreakStatement; }; -struct HLSLContinueStatement : public HLSLStatement -{ +struct HLSLContinueStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_ContinueStatement; }; -struct HLSLIfStatement : public HLSLStatement -{ +struct HLSLIfStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_IfStatement; - HLSLExpression* condition = NULL; - HLSLStatement* statement = NULL; - HLSLStatement* elseStatement = NULL; - bool isStatic = false; + HLSLExpression* condition = NULL; + HLSLStatement* statement = NULL; + HLSLStatement* elseStatement = NULL; + bool isStatic = false; }; -struct HLSLForStatement : public HLSLStatement -{ +struct HLSLForStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_ForStatement; - HLSLDeclaration* initialization = NULL; - HLSLExpression* condition = NULL; - HLSLExpression* increment = NULL; - HLSLStatement* statement = NULL; + HLSLDeclaration* initialization = NULL; + HLSLExpression* condition = NULL; + HLSLExpression* increment = NULL; + HLSLStatement* statement = NULL; }; -struct HLSLBlockStatement : public HLSLStatement -{ +struct HLSLBlockStatement : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_BlockStatement; - HLSLStatement* statement = NULL; + HLSLStatement* statement = NULL; }; - /// Base type for all types of expressions. -struct HLSLExpression : public HLSLNode -{ +struct HLSLExpression : public HLSLNode { static const HLSLNodeType s_type = HLSLNodeType_Expression; - HLSLType expressionType; - HLSLExpression* nextExpression = NULL; // Used when the expression is part of a list, like in a function call. + HLSLType expressionType; + HLSLExpression* nextExpression = NULL; // Used when the expression is part of a list, like in a function call. }; // -a -struct HLSLUnaryExpression : public HLSLExpression -{ +struct HLSLUnaryExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_UnaryExpression; - HLSLUnaryOp unaryOp = {}; - HLSLExpression* expression = NULL; + HLSLUnaryOp unaryOp = {}; + HLSLExpression* expression = NULL; }; /// a + b -struct HLSLBinaryExpression : public HLSLExpression -{ +struct HLSLBinaryExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_BinaryExpression; - HLSLBinaryOp binaryOp = {}; - HLSLExpression* expression1 = NULL; - HLSLExpression* expression2 = NULL; + HLSLBinaryOp binaryOp = {}; + HLSLExpression* expression1 = NULL; + HLSLExpression* expression2 = NULL; }; /// ? 
: construct -struct HLSLConditionalExpression : public HLSLExpression -{ +struct HLSLConditionalExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_ConditionalExpression; - HLSLExpression* condition = NULL; - HLSLExpression* trueExpression = NULL; - HLSLExpression* falseExpression = NULL; + HLSLExpression* condition = NULL; + HLSLExpression* trueExpression = NULL; + HLSLExpression* falseExpression = NULL; }; /// v = (half4)v2 -struct HLSLCastingExpression : public HLSLExpression -{ +struct HLSLCastingExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_CastingExpression; - HLSLType type; - HLSLExpression* expression = NULL; + HLSLType type; + HLSLExpression* expression = NULL; }; /// Float, integer, boolean, etc. literal constant. -struct HLSLLiteralExpression : public HLSLExpression -{ +struct HLSLLiteralExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_LiteralExpression; - HLSLBaseType type = HLSLBaseType_Unknown; // Note, not all types can be literals. - union - { - bool bValue; - float fValue; - int32_t iValue; + HLSLBaseType type = HLSLBaseType_Unknown; // Note, not all types can be literals. + union { + bool bValue; + float fValue; + int32_t iValue; }; }; /// An identifier, typically a variable name or structure field name. -struct HLSLIdentifierExpression : public HLSLExpression -{ +struct HLSLIdentifierExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_IdentifierExpression; - const char* name = NULL; - bool global = false; // This is a global variable. + const char* name = NULL; + bool global = false; // This is a global variable. }; /// float2(1, 2) -struct HLSLConstructorExpression : public HLSLExpression -{ +struct HLSLConstructorExpression : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_ConstructorExpression; - HLSLType type; - HLSLExpression* argument = NULL; + HLSLType type; + HLSLExpression* argument = NULL; }; /// object.member input.member or input[10].member -struct HLSLMemberAccess : public HLSLExpression -{ +struct HLSLMemberAccess : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_MemberAccess; - HLSLExpression* object = NULL; - const char* field = NULL; - bool swizzle = false; + HLSLExpression* object = NULL; + const char* field = NULL; + bool swizzle = false; }; /// array[index] -struct HLSLArrayAccess : public HLSLExpression -{ +struct HLSLArrayAccess : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_ArrayAccess; - HLSLExpression* array = NULL; - HLSLExpression* index = NULL; + HLSLExpression* array = NULL; + HLSLExpression* index = NULL; }; /// c-style foo(arg1, arg2) - args can have defaults that are parsed -struct HLSLFunctionCall : public HLSLExpression -{ +struct HLSLFunctionCall : public HLSLExpression { static const HLSLNodeType s_type = HLSLNodeType_FunctionCall; const HLSLFunction* function = NULL; - HLSLExpression* argument = NULL; - int numArguments = 0; + HLSLExpression* argument = NULL; + int numArguments = 0; }; // TODO: finish adding this for texture and buffer ops /// c++ style member.foo(arg1, arg2) -struct HLSLMemberFunctionCall : public HLSLFunctionCall -{ +struct HLSLMemberFunctionCall : public HLSLFunctionCall { static const HLSLNodeType s_type = HLSLNodeType_MemberFunctionCall; - + // could be buffer, texture, raytrace const HLSLIdentifierExpression* memberIdentifier = NULL; }; @@ -737,18 +691,14 @@ struct HLSLStage : public HLSLStatement */ 
#endif -struct HLSLComment : public HLSLStatement -{ +struct HLSLComment : public HLSLStatement { static const HLSLNodeType s_type = HLSLNodeType_Comment; - const char* text = NULL; + const char* text = NULL; }; /// Abstract syntax tree for parsed HLSL code. -class HLSLTree -{ - +class HLSLTree { public: - explicit HLSLTree(Allocator* allocator); ~HLSLTree(); @@ -767,116 +717,108 @@ class HLSLTree T* AddNode(const char* fileName, int line) { HLSLNode* node = new (AllocateMemory(sizeof(T))) T(); - node->nodeType = T::s_type; - node->fileName = fileName; - node->line = line; + node->nodeType = T::s_type; + node->fileName = fileName; + node->line = line; return static_cast(node); } - HLSLFunction * FindFunction(const char * name); - HLSLDeclaration * FindGlobalDeclaration(const char * name, HLSLBuffer ** buffer_out = NULL); - - HLSLStruct * FindGlobalStruct(const char * name); - HLSLBuffer * FindBuffer(const char * name); + HLSLFunction* FindFunction(const char* name); + HLSLDeclaration* FindGlobalDeclaration(const char* name, HLSLBuffer** buffer_out = NULL); -// FX files -// HLSLTechnique * FindTechnique(const char * name); -// HLSLPipeline * FindFirstPipeline(); -// HLSLPipeline * FindNextPipeline(HLSLPipeline * current); -// HLSLPipeline * FindPipeline(const char * name); - - bool GetExpressionValue(HLSLExpression * expression, int & value); - int GetExpressionValue(HLSLExpression * expression, float values[4]); + HLSLStruct* FindGlobalStruct(const char* name); + HLSLBuffer* FindBuffer(const char* name); - bool NeedsFunction(const char * name); + // FX files + // HLSLTechnique * FindTechnique(const char * name); + // HLSLPipeline * FindFirstPipeline(); + // HLSLPipeline * FindNextPipeline(HLSLPipeline * current); + // HLSLPipeline * FindPipeline(const char * name); -private: + bool GetExpressionValue(HLSLExpression* expression, int& value); + int GetExpressionValue(HLSLExpression* expression, float values[4]); - void* AllocateMemory(size_t size); - void AllocatePage(); + bool NeedsFunction(const char* name); private: + void* AllocateMemory(size_t size); + void AllocatePage(); +private: static const size_t s_nodePageSize = 1024 * 4; - struct NodePage - { - NodePage* next; - char buffer[s_nodePageSize]; + struct NodePage { + NodePage* next; + char buffer[s_nodePageSize]; }; - Allocator* m_allocator; - StringPool m_stringPool; - HLSLRoot* m_root; - - NodePage* m_firstPage; - NodePage* m_currentPage; - size_t m_currentPageOffset; + Allocator* m_allocator; + StringPool m_stringPool; + HLSLRoot* m_root; + NodePage* m_firstPage; + NodePage* m_currentPage; + size_t m_currentPageOffset; }; - - -class HLSLTreeVisitor -{ +class HLSLTreeVisitor { public: virtual ~HLSLTreeVisitor() {} - virtual void VisitType(HLSLType & type); - - virtual void VisitRoot(HLSLRoot * node); - virtual void VisitTopLevelStatement(HLSLStatement * node); - virtual void VisitStatements(HLSLStatement * statement); - virtual void VisitStatement(HLSLStatement * node); - virtual void VisitDeclaration(HLSLDeclaration * node); - virtual void VisitStruct(HLSLStruct * node); - virtual void VisitStructField(HLSLStructField * node); - virtual void VisitBuffer(HLSLBuffer * node); + virtual void VisitType(HLSLType& type); + + virtual void VisitRoot(HLSLRoot* node); + virtual void VisitTopLevelStatement(HLSLStatement* node); + virtual void VisitStatements(HLSLStatement* statement); + virtual void VisitStatement(HLSLStatement* node); + virtual void VisitDeclaration(HLSLDeclaration* node); + virtual void VisitStruct(HLSLStruct* node); + 
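(Aside, not part of the diff: nodes for the tree above are created through HLSLTree::AddNode, which placement-news them into the 4 KB arena pages shown earlier, so individual nodes are never freed. An illustrative use, assuming `tree`, `fileName`, and `line` are available at the call site:)

    // Sketch only: creating a literal node through the arena allocator in HLSLTree.
    M4::HLSLLiteralExpression* lit =
        tree->AddNode<M4::HLSLLiteralExpression>(fileName, line);
    lit->type   = M4::HLSLBaseType_Float;
    lit->fValue = 1.0f;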
virtual void VisitStructField(HLSLStructField* node); + virtual void VisitBuffer(HLSLBuffer* node); //virtual void VisitBufferField(HLSLBufferField * node); // TODO: - virtual void VisitFunction(HLSLFunction * node); - virtual void VisitArgument(HLSLArgument * node); - virtual void VisitExpressionStatement(HLSLExpressionStatement * node); - virtual void VisitExpression(HLSLExpression * node); - virtual void VisitReturnStatement(HLSLReturnStatement * node); - virtual void VisitDiscardStatement(HLSLDiscardStatement * node); - virtual void VisitBreakStatement(HLSLBreakStatement * node); - virtual void VisitContinueStatement(HLSLContinueStatement * node); - virtual void VisitIfStatement(HLSLIfStatement * node); - virtual void VisitForStatement(HLSLForStatement * node); - virtual void VisitBlockStatement(HLSLBlockStatement * node); - virtual void VisitUnaryExpression(HLSLUnaryExpression * node); - virtual void VisitBinaryExpression(HLSLBinaryExpression * node); - virtual void VisitConditionalExpression(HLSLConditionalExpression * node); - virtual void VisitCastingExpression(HLSLCastingExpression * node); - virtual void VisitLiteralExpression(HLSLLiteralExpression * node); - virtual void VisitIdentifierExpression(HLSLIdentifierExpression * node); - virtual void VisitConstructorExpression(HLSLConstructorExpression * node); - virtual void VisitMemberAccess(HLSLMemberAccess * node); - virtual void VisitArrayAccess(HLSLArrayAccess * node); - virtual void VisitFunctionCall(HLSLFunctionCall * node); - - virtual void VisitComment(HLSLComment * node); - - virtual void VisitFunctions(HLSLRoot * root); - virtual void VisitParameters(HLSLRoot * root); - - HLSLFunction * FindFunction(HLSLRoot * root, const char * name); - HLSLDeclaration * FindGlobalDeclaration(HLSLRoot * root, const char * name); - HLSLStruct * FindGlobalStruct(HLSLRoot * root, const char * name); - + virtual void VisitFunction(HLSLFunction* node); + virtual void VisitArgument(HLSLArgument* node); + virtual void VisitExpressionStatement(HLSLExpressionStatement* node); + virtual void VisitExpression(HLSLExpression* node); + virtual void VisitReturnStatement(HLSLReturnStatement* node); + virtual void VisitDiscardStatement(HLSLDiscardStatement* node); + virtual void VisitBreakStatement(HLSLBreakStatement* node); + virtual void VisitContinueStatement(HLSLContinueStatement* node); + virtual void VisitIfStatement(HLSLIfStatement* node); + virtual void VisitForStatement(HLSLForStatement* node); + virtual void VisitBlockStatement(HLSLBlockStatement* node); + virtual void VisitUnaryExpression(HLSLUnaryExpression* node); + virtual void VisitBinaryExpression(HLSLBinaryExpression* node); + virtual void VisitConditionalExpression(HLSLConditionalExpression* node); + virtual void VisitCastingExpression(HLSLCastingExpression* node); + virtual void VisitLiteralExpression(HLSLLiteralExpression* node); + virtual void VisitIdentifierExpression(HLSLIdentifierExpression* node); + virtual void VisitConstructorExpression(HLSLConstructorExpression* node); + virtual void VisitMemberAccess(HLSLMemberAccess* node); + virtual void VisitArrayAccess(HLSLArrayAccess* node); + virtual void VisitFunctionCall(HLSLFunctionCall* node); + + virtual void VisitComment(HLSLComment* node); + + virtual void VisitFunctions(HLSLRoot* root); + virtual void VisitParameters(HLSLRoot* root); + + HLSLFunction* FindFunction(HLSLRoot* root, const char* name); + HLSLDeclaration* FindGlobalDeclaration(HLSLRoot* root, const char* name); + HLSLStruct* FindGlobalStruct(HLSLRoot* root, const 
char* name); + // These are fx file constructs -// virtual void VisitStateAssignment(HLSLStateAssignment * node); -// virtual void VisitSamplerState(HLSLSamplerState * node); -// virtual void VisitPass(HLSLPass * node); -// virtual void VisitTechnique(HLSLTechnique * node); -// virtual void VisitPipeline(HLSLPipeline * node); + // virtual void VisitStateAssignment(HLSLStateAssignment * node); + // virtual void VisitSamplerState(HLSLSamplerState * node); + // virtual void VisitPass(HLSLPass * node); + // virtual void VisitTechnique(HLSLTechnique * node); + // virtual void VisitPipeline(HLSLPipeline * node); }; - // Tree transformations: extern void PruneTree(HLSLTree* tree, const char* entryName0, const char* entryName1 = NULL); extern void SortTree(HLSLTree* tree); //extern void GroupParameters(HLSLTree* tree); -extern void HideUnusedArguments(HLSLFunction * function); +extern void HideUnusedArguments(HLSLFunction* function); extern void FlattenExpressions(HLSLTree* tree); - -} // M4 + +} //namespace M4 diff --git a/hlslparser/src/MSLGenerator.cpp b/hlslparser/src/MSLGenerator.cpp index 9d1dc56d..faffb254 100644 --- a/hlslparser/src/MSLGenerator.cpp +++ b/hlslparser/src/MSLGenerator.cpp @@ -9,12 +9,12 @@ #include "MSLGenerator.h" +#include + #include "Engine.h" #include "HLSLParser.h" #include "HLSLTree.h" -#include - // MSL limitations: // - Some type conversions and constructors don't work exactly the same way. For example, casts to smaller size vectors are not alloweed in C++. @@ Add more details... // - Swizzles on scalar types, whether or not it expands them. a_float.x, a_float.xxxx both cause compile errors. @@ -25,14 +25,12 @@ // - No support for boolean vectors and logical operators involving vectors. This is not just in metal. // - No support for non-float texture types -namespace M4 -{ +namespace M4 { static void ParseSemantic(const char* semantic, uint32_t* outputLength, uint32_t* outputIndex) { const char* semanticIndex = semantic; - while (*semanticIndex && !isdigit(*semanticIndex)) - { + while (*semanticIndex && !isdigit(*semanticIndex)) { semanticIndex++; } @@ -43,36 +41,29 @@ static void ParseSemantic(const char* semantic, uint32_t* outputLength, uint32_t // Parse register name and advance next register index. static int ParseRegister(const char* registerName, int& nextRegister) { - if (!registerName) - { + if (!registerName) { return nextRegister++; } // skip over the u/b/t register prefix - while (*registerName && !isdigit(*registerName)) - { + while (*registerName && !isdigit(*registerName)) { registerName++; } - if (!*registerName) - { + if (!*registerName) { return nextRegister++; } // parse the number int result = atoi(registerName); - if (nextRegister <= result) - { + if (nextRegister <= result) { nextRegister = result + 1; } return result; } - - - MSLGenerator::MSLGenerator() { m_tree = NULL; @@ -92,8 +83,7 @@ void MSLGenerator::Error(const char* format, ...) const // It's not always convenient to stop executing when an error occurs, // so just track once we've hit an error and stop reporting them until // we successfully bail out of execution. - if (m_error) - { + if (m_error) { return; } m_error = true; @@ -106,36 +96,33 @@ void MSLGenerator::Error(const char* format, ...) 
const inline void MSLGenerator::AddClassArgument(ClassArgument* arg) { - if (m_firstClassArgument == NULL) - { + if (m_firstClassArgument == NULL) { m_firstClassArgument = arg; } - else - { + else { m_lastClassArgument->nextArg = arg; } m_lastClassArgument = arg; } - void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entryFunction) { // Hide unused arguments. @@ It would be good to do this in the other generators too. - + // PruneTree resets hidden flags to true, then marks visible elements // based on whether entry point visits them. PruneTree(tree, entryFunction->name); // Note: takes second entry - + // This sorts tree by type, but keeps ordering SortTree(tree); - + // This strips any unused inputs to the entry point function HideUnusedArguments(entryFunction); - + // Note sure if/where to add these calls. Just wanted to point // out that nothing is calling them, but could be useful. FlattenExpressions(tree); - + HLSLRoot* root = tree->GetRoot(); HLSLStatement* statement = root->statement; ASSERT(m_firstClassArgument == NULL); @@ -146,56 +133,50 @@ void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entr int nextSamplerRegister = 0; int nextBufferRegister = 0; - while (statement != NULL) - { - if (statement->hidden) - { + while (statement != NULL) { + if (statement->hidden) { statement = statement->nextStatement; continue; } - - if (statement->nodeType == HLSLNodeType_Declaration) - { + + if (statement->nodeType == HLSLNodeType_Declaration) { HLSLDeclaration* declaration = (HLSLDeclaration*)statement; - if (IsTextureType(declaration->type)) - { - const char * textureName = declaration->name; - + if (IsTextureType(declaration->type)) { + const char* textureName = declaration->name; + int textureRegister = ParseRegister(declaration->registerName, nextTextureRegister); - const char * textureRegisterName = m_tree->AddStringFormat("texture(%d)", textureRegister); + const char* textureRegisterName = m_tree->AddStringFormat("texture(%d)", textureRegister); if (declaration->type.addressSpace == HLSLAddressSpace_Undefined) declaration->type.addressSpace = HLSLAddressSpace_Device; - + AddClassArgument(new ClassArgument(textureName, declaration->type, textureRegisterName, true)); } - else if (IsSamplerType(declaration->type)) - { - const char * samplerName = declaration->name; - + else if (IsSamplerType(declaration->type)) { + const char* samplerName = declaration->name; + int samplerRegister = ParseRegister(declaration->registerName, nextSamplerRegister); - const char * samplerRegisterName = m_tree->AddStringFormat("sampler(%d)", samplerRegister); - + const char* samplerRegisterName = m_tree->AddStringFormat("sampler(%d)", samplerRegister); + if (declaration->type.addressSpace == HLSLAddressSpace_Undefined) declaration->type.addressSpace = HLSLAddressSpace_Device; - + AddClassArgument(new ClassArgument(samplerName, declaration->type, samplerRegisterName, true)); } } - else if (statement->nodeType == HLSLNodeType_Buffer) - { - HLSLBuffer * buffer = (HLSLBuffer *)statement; - + else if (statement->nodeType == HLSLNodeType_Buffer) { + HLSLBuffer* buffer = (HLSLBuffer*)statement; + HLSLType type(HLSLBaseType_UserDefined); - + // TODO: on cbuffer is a ubo, not tbuffer, or others // TODO: this is having to rename due to globals if (buffer->IsGlobalFields()) type.typeName = m_tree->AddStringFormat("%s_ubo", buffer->name); else type.typeName = m_tree->AddStringFormat("%s", buffer->bufferStruct->name); - + // TODO: ConstantBuffer can use ptr notation, 
detect array decl bool isRef = buffer->bufferType == HLSLBufferType_ConstantBuffer || buffer->IsGlobalFields(); @@ -204,10 +185,10 @@ void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entr type.addressSpace = HLSLAddressSpace_Constant; else type.addressSpace = HLSLAddressSpace_Device; - + int bufferRegister = ParseRegister(buffer->registerName, nextBufferRegister) + m_options.bufferRegisterOffset; - const char * bufferRegisterName = m_tree->AddStringFormat("buffer(%d)", bufferRegister); + const char* bufferRegisterName = m_tree->AddStringFormat("buffer(%d)", bufferRegister); AddClassArgument(new ClassArgument(buffer->name, type, bufferRegisterName, isRef)); } @@ -220,58 +201,45 @@ void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entr // Translate semantics. HLSLArgument* argument = entryFunction->argument; - while (argument != NULL) - { - if (argument->hidden) - { + while (argument != NULL) { + if (argument->hidden) { argument = argument->nextArgument; continue; } - if (argument->modifier == HLSLArgumentModifier_Out) - { + if (argument->modifier == HLSLArgumentModifier_Out) { // Translate output arguments semantics. - if (argument->type.baseType == HLSLBaseType_UserDefined) - { + if (argument->type.baseType == HLSLBaseType_UserDefined) { // Our vertex input is a struct and its fields need to be tagged when we generate that HLSLStruct* structure = tree->FindGlobalStruct(argument->type.typeName); - if (structure == NULL) - { + if (structure == NULL) { Error("Vertex shader output struct '%s' not found in shader\n", argument->type.typeName); } HLSLStructField* field = structure->field; - while (field != NULL) - { - if (!field->hidden) - { + while (field != NULL) { + if (!field->hidden) { field->sv_semantic = TranslateOutputSemantic(field->semantic); } field = field->nextField; } } - else - { + else { argument->sv_semantic = TranslateOutputSemantic(argument->semantic); } } - else - { + else { // Translate input arguments semantics. - if (argument->type.baseType == HLSLBaseType_UserDefined) - { + if (argument->type.baseType == HLSLBaseType_UserDefined) { // Our vertex input is a struct and its fields need to be tagged when we generate that HLSLStruct* structure = tree->FindGlobalStruct(argument->type.typeName); - if (structure == NULL) - { + if (structure == NULL) { Error("Vertex shader input struct '%s' not found in shader\n", argument->type.typeName); } HLSLStructField* field = structure->field; - while (field != NULL) - { - if (!field->hidden) - { + while (field != NULL) { + if (!field->hidden) { field->sv_semantic = TranslateInputSemantic(field->semantic); // Force type to uint. @@ -288,8 +256,7 @@ void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entr field = field->nextField; } } - else - { + else { argument->sv_semantic = TranslateInputSemantic(argument->semantic); // Force type to uint. @@ -304,29 +271,23 @@ void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entr } // Translate return value semantic. 
- if (entryFunction->returnType.baseType != HLSLBaseType_Void) - { - if (entryFunction->returnType.baseType == HLSLBaseType_UserDefined) - { + if (entryFunction->returnType.baseType != HLSLBaseType_Void) { + if (entryFunction->returnType.baseType == HLSLBaseType_UserDefined) { // Our vertex input is a struct and its fields need to be tagged when we generate that HLSLStruct* structure = tree->FindGlobalStruct(entryFunction->returnType.typeName); - if (structure == NULL) - { + if (structure == NULL) { Error("Vertex shader output struct '%s' not found in shader\n", entryFunction->returnType.typeName); } HLSLStructField* field = structure->field; - while (field != NULL) - { - if (!field->hidden) - { + while (field != NULL) { + if (!field->hidden) { field->sv_semantic = TranslateOutputSemantic(field->semantic); } field = field->nextField; } } - else - { + else { entryFunction->sv_semantic = TranslateOutputSemantic(entryFunction->semantic); //Error("MSL only supports COLOR semantic in return \n", entryFunction->returnType.typeName); @@ -337,8 +298,7 @@ void MSLGenerator::Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entr void MSLGenerator::CleanPrepass() { ClassArgument* currentArg = m_firstClassArgument; - while (currentArg != NULL) - { + while (currentArg != NULL) { ClassArgument* nextArg = currentArg->nextArg; delete currentArg; currentArg = nextArg; @@ -352,7 +312,7 @@ void MSLGenerator::PrependDeclarations() { // Any special function stubs we need go here // That includes special constructors to emulate HLSL not being strict - + //Branch internally to HLSL vs. MSL verision m_writer.WriteLine(0, "#include \"ShaderMSL.h\""); } @@ -360,30 +320,32 @@ void MSLGenerator::PrependDeclarations() // Any reference or pointer must be qualified with address space in MSL const char* MSLGenerator::GetAddressSpaceName(HLSLBaseType baseType, HLSLAddressSpace addressSpace) const { - if (IsSamplerType(baseType)) - { + if (IsSamplerType(baseType)) { return "thread"; } - if (IsTextureType(baseType)) - { + if (IsTextureType(baseType)) { return "thread"; } // buffers also need to handle readonly (constant and const device) vs. // readwrite (device). 
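// Illustrative sketch (hypothetical resource names; emitted text is approximate, not
// verbatim generator output): the address-space strings chosen in the switch below end
// up qualifying the generated MSL shader-class and entry-point parameters. For a global
// cbuffer "Uniforms" and a read-write structured buffer "Counters", the entry point
// would take roughly:
//
//   constant Uniforms_ubo & Uniforms [[buffer(0)]]   // read-only  -> constant, by reference
//   device Counters* Counters        [[buffer(1)]]   // read-write -> device, by pointer
//
// using the "%s_ubo" struct name and "buffer(%d)" register strings prepared in Prepass()
// (plus m_options.bufferRegisterOffset), while textures and samplers are held by the
// shader class as thread-space references.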
- - switch(addressSpace) - { - case HLSLAddressSpace_Constant: return "constant"; - case HLSLAddressSpace_Device: return "device"; - case HLSLAddressSpace_Thread: return "thread"; - case HLSLAddressSpace_Shared: return "shared"; - //case HLSLAddressSpace_Threadgroup: return "threadgroup_local"; - //case HLSLAddressSpace_ThreadgroupImageblock: return "threadgroup_imageblock"); - - case HLSLAddressSpace_Undefined: break; - } - + + switch (addressSpace) { + case HLSLAddressSpace_Constant: + return "constant"; + case HLSLAddressSpace_Device: + return "device"; + case HLSLAddressSpace_Thread: + return "thread"; + case HLSLAddressSpace_Shared: + return "shared"; + //case HLSLAddressSpace_Threadgroup: return "threadgroup_local"; + //case HLSLAddressSpace_ThreadgroupImageblock: return "threadgroup_imageblock"); + + case HLSLAddressSpace_Undefined: + break; + } + Error("Unknown address space"); return ""; } @@ -396,7 +358,7 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry m_tree = tree; m_target = target; m_entryName = entryName; - + m_options = options; m_writer.SetWriteFileLine(options.writeFileLine); @@ -404,8 +366,7 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry // Find entry point function HLSLFunction* entryFunction = tree->FindFunction(entryName); - if (entryFunction == NULL) - { + if (entryFunction == NULL) { Error("Entry point '%s' doesn't exist\n", entryName); return false; } @@ -423,7 +384,7 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry // Uniforms are then passed to the constructor and copied to member variables. std::string shaderClassNameStr = entryName; shaderClassNameStr += "NS"; // to distinguish from function - + const char* shaderClassName = shaderClassNameStr.c_str(); m_writer.WriteLine(0, "struct %s {", shaderClassName); @@ -434,50 +395,45 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry m_writer.BeginLine(1); m_writer.Write("%s(", shaderClassName); - + // mod int indent = m_writer.EndLine(); - m_writer.BeginLine(indent+1); // 1 more level for params - + m_writer.BeginLine(indent + 1); // 1 more level for params + const ClassArgument* currentArg = m_firstClassArgument; - while (currentArg != NULL) - { + while (currentArg != NULL) { m_writer.Write("%s ", GetAddressSpaceName(currentArg->type.baseType, currentArg->type.addressSpace)); - + // ref vs. ptr bool isRef = currentArg->isRef; - + m_writer.Write("%s %s %s", GetTypeName(currentArg->type, /*exactType=*/true), isRef ? 
"&" : "*", currentArg->name); currentArg = currentArg->nextArg; - if (currentArg) - { + if (currentArg) { m_writer.Write(", "); - + // mod indent = m_writer.EndLine(); m_writer.BeginLine(indent); } } m_writer.Write(")"); - + // mod indent = m_writer.EndLine(); m_writer.BeginLine(indent); - + currentArg = m_firstClassArgument; - if (currentArg) - { + if (currentArg) { m_writer.Write(" : "); } - while (currentArg != NULL) - { + while (currentArg != NULL) { m_writer.Write("%s(%s)", currentArg->name, currentArg->name); currentArg = currentArg->nextArg; - if (currentArg) - { + if (currentArg) { m_writer.Write(", "); - + // mod indent = m_writer.EndLine(); m_writer.BeginLine(indent); @@ -487,14 +443,12 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry m_writer.WriteLine(0, "};"); // Class - // Generate real entry point, the one called by Metal m_writer.WriteLine(0, ""); // If function return value has a non-color output semantic, declare a temporary struct for the output. bool wrapReturnType = false; - if (entryFunction->sv_semantic != NULL && !String_Equal(entryFunction->sv_semantic, "color(0)")) - { + if (entryFunction->sv_semantic != NULL && !String_Equal(entryFunction->sv_semantic, "color(0)")) { wrapReturnType = true; m_writer.WriteLine(0, "struct %s_output { %s tmp [[%s]]; };", entryName, GetTypeName(entryFunction->returnType, /*exactType=*/true), entryFunction->sv_semantic); @@ -502,7 +456,6 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry m_writer.WriteLine(0, ""); } - m_writer.BeginLine(0); // @@ Add/Translate function attributes. @@ -512,115 +465,103 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry // MSL doesn't seem to have this, set from code? // if (m_target == HLSLTarget_ComputeShader) // m_writer.WriteLine(indent, "[numthreads(1,1,1)]"); - - switch(m_target) - { - case HLSLTarget_VertexShader: m_writer.Write("vertex "); break; - case HLSLTarget_PixelShader: m_writer.Write("fragment "); break; - case HLSLTarget_ComputeShader: m_writer.Write("kernel "); break; + + switch (m_target) { + case HLSLTarget_VertexShader: + m_writer.Write("vertex "); + break; + case HLSLTarget_PixelShader: + m_writer.Write("fragment "); + break; + case HLSLTarget_ComputeShader: + m_writer.Write("kernel "); + break; } // Return type. 
- if (wrapReturnType) - { + if (wrapReturnType) { m_writer.Write("%s_output", entryName); } - else - { - if (entryFunction->returnType.baseType == HLSLBaseType_UserDefined) - { + else { + if (entryFunction->returnType.baseType == HLSLBaseType_UserDefined) { // Alec removing namespaced structs // m_writer.Write("%s::", shaderClassName); } m_writer.Write("%s", GetTypeName(entryFunction->returnType, /*exactType=*/true)); } - + m_writer.Write(" %s(", entryName); // Alec added for readability indent = m_writer.EndLine(); - - m_writer.BeginLine(indent+1); // indent more - + + m_writer.BeginLine(indent + 1); // indent more + //-------------------- // This is the class taking in arguments - + int argumentCount = 0; HLSLArgument* argument = entryFunction->argument; - while (argument != NULL) - { - if (argument->hidden) - { + while (argument != NULL) { + if (argument->hidden) { argument = argument->nextArgument; continue; } - - if (argument->type.baseType == HLSLBaseType_UserDefined) - { + + if (argument->type.baseType == HLSLBaseType_UserDefined) { //TODO: aled removing namespaced structs //m_writer.Write("%s::", shaderClassName); } m_writer.Write("%s %s", GetTypeName(argument->type, /*exactType=*/true), argument->name); // @@ IC: We are assuming that the first argument is the 'stage_in'. - if (argument->type.baseType == HLSLBaseType_UserDefined && argument == entryFunction->argument) - { + if (argument->type.baseType == HLSLBaseType_UserDefined && argument == entryFunction->argument) { m_writer.Write(" [[stage_in]]"); } - else if (argument->sv_semantic) - { + else if (argument->sv_semantic) { m_writer.Write(" [[%s]]", argument->sv_semantic); } - + argumentCount++; - + argument = argument->nextArgument; - if (argument && !argument->hidden) - { + if (argument && !argument->hidden) { m_writer.Write(", "); - + // Alec added for readability indent = m_writer.EndLine(); m_writer.BeginLine(indent); } - - } // These are additional inputs/outputs not [[stage_in]] - + currentArg = m_firstClassArgument; - if (argumentCount && currentArg != NULL) - { + if (argumentCount && currentArg != NULL) { m_writer.Write(","); - + // Alec added for readability indent = m_writer.EndLine(); m_writer.BeginLine(indent); - } - while (currentArg != NULL) - { - if (currentArg->type.baseType == HLSLBaseType_UserDefined) - { + while (currentArg != NULL) { + if (currentArg->type.baseType == HLSLBaseType_UserDefined) { bool isRef = currentArg->isRef; - + m_writer.Write("%s %s %s %s [[%s]]", GetAddressSpaceName(currentArg->type.baseType, currentArg->type.addressSpace), - // shaderClassName, - currentArg->type.typeName, isRef ? "&" : "*", currentArg->name, + // shaderClassName, + currentArg->type.typeName, isRef ? 
"&" : "*", currentArg->name, currentArg->registerName); } - else - { + else { m_writer.Write("%s %s [[%s]]", GetTypeName(currentArg->type, /*exactType=*/true), currentArg->name, currentArg->registerName); } currentArg = currentArg->nextArg; - if (currentArg) - { + if (currentArg) { m_writer.Write(", "); } - + // Alec added for readability indent = m_writer.EndLine(); m_writer.BeginLine(indent); @@ -633,18 +574,15 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry m_writer.Write("%s %s", shaderClassName, entryName); currentArg = m_firstClassArgument; - if (currentArg) - { + if (currentArg) { m_writer.Write("("); - while (currentArg != NULL) - { + while (currentArg != NULL) { m_writer.Write("%s", currentArg->name); currentArg = currentArg->nextArg; - if (currentArg) - { + if (currentArg) { m_writer.Write(", "); - + // indent = m_writer.EndLine(); // m_writer.BeginLine(indent); } @@ -656,33 +594,27 @@ bool MSLGenerator::Generate(HLSLTree* tree, HLSLTarget target, const char* entry m_writer.BeginLine(1); - if (wrapReturnType) - { + if (wrapReturnType) { m_writer.Write("%s_output output; output.tmp = %s.%s(", entryName, entryName, entryName); } - else - { + else { m_writer.Write("return %s.%s(", entryName, entryName); } argument = entryFunction->argument; - while (argument != NULL) - { - if (!argument->hidden) - { + while (argument != NULL) { + if (!argument->hidden) { m_writer.Write("%s", argument->name); } argument = argument->nextArgument; - if (argument && !argument->hidden) - { + if (argument && !argument->hidden) { m_writer.Write(", "); } } m_writer.EndLine(");"); - if (wrapReturnType) - { + if (wrapReturnType) { m_writer.WriteLine(1, "return output;"); } @@ -703,22 +635,18 @@ const char* MSLGenerator::GetResult() const void MSLGenerator::OutputStaticDeclarations(int indent, HLSLStatement* statement) { - while (statement != NULL) - { - if (statement->hidden) - { + while (statement != NULL) { + if (statement->hidden) { statement = statement->nextStatement; continue; } // write struct/buffer outside of the namespace struct - if (statement->nodeType == HLSLNodeType_Struct) - { - if (!statement->written) - { + if (statement->nodeType == HLSLNodeType_Struct) { + if (!statement->written) { HLSLStruct* structure = static_cast(statement); OutputStruct(indent, structure); - + // skipped for multi-entrypoint statement->written = true; } @@ -730,44 +658,39 @@ void MSLGenerator::OutputStaticDeclarations(int indent, HLSLStatement* statement { HLSLBuffer* buffer = static_cast(statement); OutputBuffer(indent, buffer); - + // skipped for multi-entrypoint statement->written = true; } } */ - - else if (statement->nodeType == HLSLNodeType_Declaration) - { + + else if (statement->nodeType == HLSLNodeType_Declaration) { HLSLDeclaration* declaration = static_cast(statement); const HLSLType& type = declaration->type; - if (type.TestFlags(HLSLTypeFlag_Const | HLSLTypeFlag_Static)) - { - if (!declaration->written) - { + if (type.TestFlags(HLSLTypeFlag_Const | HLSLTypeFlag_Static)) { + if (!declaration->written) { m_writer.BeginLine(indent, declaration->fileName, declaration->line); OutputDeclaration(declaration); m_writer.EndLine(";"); - + // skipped for multi-entrypoint declaration->written = true; } - + // TODO: sure this is needed, or does written handle it // hide declaration from subsequent passes declaration->hidden = true; } } - else if (statement->nodeType == HLSLNodeType_Function) - { + else if (statement->nodeType == HLSLNodeType_Function) { HLSLFunction* function = 
static_cast(statement); - - if (!function->forward) - { + + if (!function->forward) { OutputStaticDeclarations(indent, function->statement); - + // skipped for multi-entrypoint //function->written = true; } @@ -783,29 +706,27 @@ bool MSLGenerator::CanSkipWrittenStatement(const HLSLStatement* statement) const // only write these once for multi-entrypoint if (statement->nodeType == HLSLNodeType_Comment || - // statement->nodeType == HLSLNodeType_Buffer || - statement->nodeType == HLSLNodeType_Struct) + // statement->nodeType == HLSLNodeType_Buffer || + statement->nodeType == HLSLNodeType_Struct) return true; // only write const scalars out once, so they don't conflict - if (statement->nodeType == HLSLNodeType_Declaration) - { + if (statement->nodeType == HLSLNodeType_Declaration) { const HLSLDeclaration* decl = (const HLSLDeclaration*)statement; - if (IsScalarType(decl->type.baseType) && decl->type.flags & HLSLTypeFlag_Const) - { + if (IsScalarType(decl->type.baseType) && decl->type.flags & HLSLTypeFlag_Const) { return true; } } // TODO: all functions are currently thrown into the namespace class // so can't yet strip them. - + // Helper functions should be skipped once written out -// if (statement->nodeType == HLSLNodeType_Function) -// { -// return true; -// } - + // if (statement->nodeType == HLSLNodeType_Function) + // { + // return true; + // } + return false; } @@ -813,101 +734,82 @@ bool MSLGenerator::CanSkipWrittenStatement(const HLSLStatement* statement) const void MSLGenerator::OutputStatements(int indent, HLSLStatement* statement) { // Main generator loop: called recursively - while (statement != NULL) - { + while (statement != NULL) { // skip pruned statements - if (statement->hidden) - { + if (statement->hidden) { statement = statement->nextStatement; continue; } - + // skip writing across multiple entry points // skip writing some types across multiple entry points - if (CanSkipWrittenStatement(statement)) - { + if (CanSkipWrittenStatement(statement)) { statement = statement->nextStatement; continue; } statement->written = true; - + OutputAttributes(indent, statement->attributes); - - if (statement->nodeType == HLSLNodeType_Comment) - { + + if (statement->nodeType == HLSLNodeType_Comment) { HLSLComment* comment = static_cast(statement); m_writer.WriteLine(indent, "//%s", comment->text); } - else if (statement->nodeType == HLSLNodeType_Declaration) - { + else if (statement->nodeType == HLSLNodeType_Declaration) { HLSLDeclaration* declaration = static_cast(statement); - if (declaration->assignment && declaration->assignment->nodeType == HLSLNodeType_FunctionCall) - { + if (declaration->assignment && declaration->assignment->nodeType == HLSLNodeType_FunctionCall) { OutputFunctionCallStatement(indent, (HLSLFunctionCall*)declaration->assignment, declaration); } - else - { + else { m_writer.BeginLine(indent, declaration->fileName, declaration->line); OutputDeclaration(declaration); m_writer.EndLine(";"); } } - else if (statement->nodeType == HLSLNodeType_Struct) - { + else if (statement->nodeType == HLSLNodeType_Struct) { HLSLStruct* structure = static_cast(statement); OutputStruct(indent, structure); } - else if (statement->nodeType == HLSLNodeType_Buffer) - { + else if (statement->nodeType == HLSLNodeType_Buffer) { HLSLBuffer* buffer = static_cast(statement); OutputBuffer(indent, buffer); } - else if (statement->nodeType == HLSLNodeType_Function) - { + else if (statement->nodeType == HLSLNodeType_Function) { HLSLFunction* function = static_cast(statement); - if 
(!function->forward) - { + if (!function->forward) { OutputFunction(indent, function); } } - else if (statement->nodeType == HLSLNodeType_ExpressionStatement) - { + else if (statement->nodeType == HLSLNodeType_ExpressionStatement) { HLSLExpressionStatement* expressionStatement = static_cast(statement); HLSLExpression* expression = expressionStatement->expression; - if (expression->nodeType == HLSLNodeType_FunctionCall) - { + if (expression->nodeType == HLSLNodeType_FunctionCall) { OutputFunctionCallStatement(indent, (HLSLFunctionCall*)expression, NULL); } - else - { + else { m_writer.BeginLine(indent, statement->fileName, statement->line); OutputExpression(expressionStatement->expression, NULL); m_writer.EndLine(";"); } } - else if (statement->nodeType == HLSLNodeType_ReturnStatement) - { + else if (statement->nodeType == HLSLNodeType_ReturnStatement) { HLSLReturnStatement* returnStatement = static_cast(statement); - if (m_currentFunction->numOutputArguments > 0) - { + if (m_currentFunction->numOutputArguments > 0) { m_writer.BeginLine(indent, returnStatement->fileName, returnStatement->line); m_writer.Write("return { "); int numArguments = 0; - if (returnStatement->expression != NULL) - { + if (returnStatement->expression != NULL) { OutputTypedExpression(m_currentFunction->returnType, returnStatement->expression, NULL); numArguments++; } - HLSLArgument * argument = m_currentFunction->argument; - while (argument != NULL) - { - if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) - { + HLSLArgument* argument = m_currentFunction->argument; + while (argument != NULL) { + if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) { if (numArguments) m_writer.Write(", "); m_writer.Write("%s", argument->name); numArguments++; @@ -917,35 +819,29 @@ void MSLGenerator::OutputStatements(int indent, HLSLStatement* statement) m_writer.EndLine(" };"); } - else if (returnStatement->expression != NULL) - { + else if (returnStatement->expression != NULL) { m_writer.BeginLine(indent, returnStatement->fileName, returnStatement->line); m_writer.Write("return "); OutputTypedExpression(m_currentFunction->returnType, returnStatement->expression, NULL); m_writer.EndLine(";"); } - else - { + else { m_writer.WriteLineTagged(indent, returnStatement->fileName, returnStatement->line, "return;"); } } - else if (statement->nodeType == HLSLNodeType_DiscardStatement) - { + else if (statement->nodeType == HLSLNodeType_DiscardStatement) { HLSLDiscardStatement* discardStatement = static_cast(statement); m_writer.WriteLineTagged(indent, discardStatement->fileName, discardStatement->line, "discard_fragment();"); } - else if (statement->nodeType == HLSLNodeType_BreakStatement) - { + else if (statement->nodeType == HLSLNodeType_BreakStatement) { HLSLBreakStatement* breakStatement = static_cast(statement); m_writer.WriteLineTagged(indent, breakStatement->fileName, breakStatement->line, "break;"); } - else if (statement->nodeType == HLSLNodeType_ContinueStatement) - { + else if (statement->nodeType == HLSLNodeType_ContinueStatement) { HLSLContinueStatement* continueStatement = static_cast(statement); m_writer.WriteLineTagged(indent, continueStatement->fileName, continueStatement->line, "continue;"); } - else if (statement->nodeType == HLSLNodeType_IfStatement) - { + else if (statement->nodeType == HLSLNodeType_IfStatement) { HLSLIfStatement* ifStatement = static_cast(statement); if (ifStatement->isStatic) { @@ -968,16 +864,14 @@ void 
MSLGenerator::OutputStatements(int indent, HLSLStatement* statement) m_writer.EndLine(); OutputStatements(indent + 1, ifStatement->statement); m_writer.WriteLine(indent, "}"); - if (ifStatement->elseStatement != NULL) - { + if (ifStatement->elseStatement != NULL) { m_writer.WriteLine(indent, "else {"); OutputStatements(indent + 1, ifStatement->elseStatement); m_writer.WriteLine(indent, "}"); } } } - else if (statement->nodeType == HLSLNodeType_ForStatement) - { + else if (statement->nodeType == HLSLNodeType_ForStatement) { HLSLForStatement* forStatement = static_cast(statement); m_writer.BeginLine(indent, forStatement->fileName, forStatement->line); m_writer.Write("for ("); @@ -991,25 +885,23 @@ void MSLGenerator::OutputStatements(int indent, HLSLStatement* statement) OutputStatements(indent + 1, forStatement->statement); m_writer.WriteLine(indent, "}"); } - else if (statement->nodeType == HLSLNodeType_BlockStatement) - { + else if (statement->nodeType == HLSLNodeType_BlockStatement) { HLSLBlockStatement* blockStatement = static_cast(statement); m_writer.WriteLineTagged(indent, blockStatement->fileName, blockStatement->line, "{"); OutputStatements(indent + 1, blockStatement->statement); m_writer.WriteLine(indent, "}"); } - + // fx file support for Technique/Pipeline -// else if (statement->nodeType == HLSLNodeType_Technique) -// { -// // Techniques are ignored. -// } -// else if (statement->nodeType == HLSLNodeType_Pipeline) -// { -// // Pipelines are ignored. -// } - else - { + // else if (statement->nodeType == HLSLNodeType_Technique) + // { + // // Techniques are ignored. + // } + // else if (statement->nodeType == HLSLNodeType_Pipeline) + // { + // // Pipelines are ignored. + // } + else { // Unhandled statement type. Error("Unknown statement"); } @@ -1023,18 +915,15 @@ void MSLGenerator::OutputAttributes(int indent, HLSLAttribute* attribute) { // IC: These do not appear to exist in MSL. while (attribute != NULL) { - if (attribute->attributeType == HLSLAttributeType_Unroll) - { + if (attribute->attributeType == HLSLAttributeType_Unroll) { // @@ Do any of these work? //m_writer.WriteLine(indent, attribute->fileName, attribute->line, "#pragma unroll"); //m_writer.WriteLine(indent, attribute->fileName, attribute->line, "[[unroll]]"); } - else if (attribute->attributeType == HLSLAttributeType_Flatten) - { + else if (attribute->attributeType == HLSLAttributeType_Flatten) { // @@ } - else if (attribute->attributeType == HLSLAttributeType_Branch) - { + else if (attribute->attributeType == HLSLAttributeType_Branch) { // @@, [[likely]]? 
} @@ -1044,25 +933,21 @@ void MSLGenerator::OutputAttributes(int indent, HLSLAttribute* attribute) void MSLGenerator::OutputDeclaration(HLSLDeclaration* declaration) { - if (IsSamplerType(declaration->type)) - { + if (IsSamplerType(declaration->type)) { m_writer.Write("%s sampler& %s", GetAddressSpaceName(declaration->type.baseType, declaration->type.addressSpace), declaration->name); } - else if (IsTextureType(declaration->type)) - { + else if (IsTextureType(declaration->type)) { const char* textureName = GetTypeName(declaration->type, true); if (textureName) m_writer.Write("%s %s& %s", GetAddressSpaceName(declaration->type.baseType, declaration->type.addressSpace), textureName, declaration->name); else Error("Unknown texture"); } - else - { + else { OutputDeclaration(declaration->type, declaration->name, declaration->assignment); declaration = declaration->nextDeclaration; - while (declaration != NULL) - { + while (declaration != NULL) { m_writer.Write(","); OutputDeclarationBody(declaration->type, declaration->name, declaration->assignment); declaration = declaration->nextDeclaration; @@ -1074,18 +959,15 @@ void MSLGenerator::OutputStruct(int indent, HLSLStruct* structure) { m_writer.WriteLineTagged(indent, structure->fileName, structure->line, "struct %s {", structure->name); HLSLStructField* field = structure->field; - while (field != NULL) - { - if (!field->hidden) - { + while (field != NULL) { + if (!field->hidden) { m_writer.BeginLine(indent + 1, field->fileName, field->line); OutputDeclaration(field->type, field->name, NULL); - + // DONE: would need a semantic remap for all possible semantics // just use the name the caller specified if sv_semantic // is not set. The header can handle translating - if (field->sv_semantic) - { + if (field->sv_semantic) { m_writer.Write(" [[%s]]", field->sv_semantic); } @@ -1098,45 +980,39 @@ void MSLGenerator::OutputStruct(int indent, HLSLStruct* structure) void MSLGenerator::OutputBuffer(int indent, HLSLBuffer* buffer) { - if (!buffer->IsGlobalFields()) - { + if (!buffer->IsGlobalFields()) { m_writer.BeginLine(indent, buffer->fileName, buffer->line); - + // TODO: handle array count for indexing into constant buffer // some are unbounded array like BAB and SBO // TODO: may need to use t/u registers for those too and a thread? - + // TODO: fix this, ConstantBuffer can index into a constant buffer too // detect use of array notation on decl bool isRef = buffer->bufferType == HLSLBufferType_ConstantBuffer || buffer->IsGlobalFields(); - + if (buffer->bufferType == HLSLBufferType_ConstantBuffer || buffer->bufferType == HLSLBufferType_ByteAddressBuffer || - buffer->bufferType == HLSLBufferType_StructuredBuffer) - { + buffer->bufferType == HLSLBufferType_StructuredBuffer) { m_writer.Write("constant %s %s %s", buffer->bufferStruct->name, isRef ? "&" : "*", buffer->name); } - else - { - m_writer.Write("device %s %s %s", buffer->bufferStruct->name, isRef ? "&" : "*", buffer->name); + else { + m_writer.Write("device %s %s %s", buffer->bufferStruct->name, isRef ? 
"&" : "*", buffer->name); } - + m_writer.EndLine(";"); } - else - { + else { // converted cbuffer that spill tons of globals for every field HLSLDeclaration* field = buffer->field; - + m_writer.BeginLine(indent, buffer->fileName, buffer->line); m_writer.Write("struct %s_ubo", buffer->name); m_writer.EndLine(" {"); - - while (field != NULL) - { - if (!field->hidden) - { + + while (field != NULL) { + if (!field->hidden) { m_writer.BeginLine(indent + 1, field->fileName, field->line); OutputDeclaration(field->type, field->name, field->assignment, false, false, 0); // /*alignment=*/16); m_writer.EndLine(";"); @@ -1144,7 +1020,7 @@ void MSLGenerator::OutputBuffer(int indent, HLSLBuffer* buffer) field = (HLSLDeclaration*)field->nextStatement; } m_writer.WriteLine(indent, "};"); - + m_writer.WriteLine(indent, "constant %s_ubo & %s;", buffer->name, buffer->name); } } @@ -1155,26 +1031,22 @@ void MSLGenerator::OutputFunction(int indent, HLSLFunction* function) const char* returnTypeName = GetTypeName(function->returnType, /*exactType=*/false); // Declare output tuple. - if (function->numOutputArguments > 0) - { + if (function->numOutputArguments > 0) { returnTypeName = m_tree->AddStringFormat("%s_out%d", functionName, function->line); // @@ Find a better way to generate unique name. m_writer.BeginLine(indent, function->fileName, function->line); m_writer.Write("struct %s { ", returnTypeName); m_writer.EndLine(); - if (function->returnType.baseType != HLSLBaseType_Void) - { + if (function->returnType.baseType != HLSLBaseType_Void) { m_writer.BeginLine(indent + 1, function->fileName, function->line); OutputDeclaration(function->returnType, "__result", /*defaultValue=*/NULL, /*isRef=*/false, /*isConst=*/false); m_writer.EndLine(";"); } - HLSLArgument * argument = function->argument; - while (argument != NULL) - { - if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) - { + HLSLArgument* argument = function->argument; + while (argument != NULL) { + if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) { m_writer.BeginLine(indent + 1, function->fileName, function->line); OutputDeclaration(argument->type, argument->name, /*defaultValue=*/NULL, /*isRef=*/false, /*isConst=*/false); m_writer.EndLine(";"); @@ -1188,8 +1060,7 @@ void MSLGenerator::OutputFunction(int indent, HLSLFunction* function) m_writer.BeginLine(indent, function->fileName, function->line); m_writer.Write("%s %s_%d(", returnTypeName, functionName, function->line); } - else - { + else { m_writer.BeginLine(indent, function->fileName, function->line); m_writer.Write("%s %s(", returnTypeName, functionName); } @@ -1200,11 +1071,9 @@ void MSLGenerator::OutputFunction(int indent, HLSLFunction* function) m_currentFunction = function; // Local declarations for output arguments. 
- HLSLArgument * argument = function->argument; - while (argument != NULL) - { - if (argument->modifier == HLSLArgumentModifier_Out) - { + HLSLArgument* argument = function->argument; + while (argument != NULL) { + if (argument->modifier == HLSLArgumentModifier_Out) { m_writer.BeginLine(indent + 1, function->fileName, function->line); OutputDeclaration(argument->type, argument->name, /*defaultValue=*/NULL, /*isRef=*/false, /*isConst=*/false); m_writer.EndLine(";"); @@ -1215,31 +1084,25 @@ void MSLGenerator::OutputFunction(int indent, HLSLFunction* function) OutputStatements(indent + 1, function->statement); // @@ Modify return statements if function has multiple output arguments! // Output implicit return. - if (function->numOutputArguments > 0) - { + if (function->numOutputArguments > 0) { bool needsImplicitReturn = true; - HLSLStatement * statement = function->statement; - if (statement != NULL) - { - while (statement->nextStatement != NULL) - { + HLSLStatement* statement = function->statement; + if (statement != NULL) { + while (statement->nextStatement != NULL) { statement = statement->nextStatement; } needsImplicitReturn = (statement->nodeType != HLSLNodeType_ReturnStatement) && function->returnType.baseType == HLSLBaseType_Void; } - if (needsImplicitReturn) - { + if (needsImplicitReturn) { m_writer.BeginLine(indent + 1); m_writer.Write("return { "); int numArguments = 0; - HLSLArgument * argument2 = m_currentFunction->argument; - while (argument2 != NULL) - { + HLSLArgument* argument2 = m_currentFunction->argument; + while (argument2 != NULL) { if (argument2->modifier == HLSLArgumentModifier_Out || - argument2->modifier == HLSLArgumentModifier_Inout) - { + argument2->modifier == HLSLArgumentModifier_Inout) { if (numArguments) m_writer.Write(", "); m_writer.Write("%s ", argument2->name); numArguments++; @@ -1255,23 +1118,19 @@ void MSLGenerator::OutputFunction(int indent, HLSLFunction* function) m_currentFunction = NULL; } - // @@ We could be a lot smarter removing parenthesis based on the operator precedence of the parent expression. -static bool NeedsParenthesis(HLSLExpression* expression, HLSLExpression* parentExpression) { - +static bool NeedsParenthesis(HLSLExpression* expression, HLSLExpression* parentExpression) +{ // For now we just omit the parenthesis if there's no parent expression. - if (parentExpression == NULL) - { + if (parentExpression == NULL) { return false; } // One more special case that's pretty common. 
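// (Sketch of the intent, with hypothetical expressions: without this check the generator
// would emit "(input).position" or "(lights[i]).color"; since the object is a plain
// identifier, array access, or member access, the parentheses are redundant and
// "input.position" / "lights[i].color" are emitted instead. A compound object such as
// "(a + b).x" still keeps its parentheses.)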
- if (parentExpression->nodeType == HLSLNodeType_MemberAccess) - { + if (parentExpression->nodeType == HLSLNodeType_MemberAccess) { if (expression->nodeType == HLSLNodeType_IdentifierExpression || expression->nodeType == HLSLNodeType_ArrayAccess || - expression->nodeType == HLSLNodeType_MemberAccess) - { + expression->nodeType == HLSLNodeType_MemberAccess) { return false; } } @@ -1279,7 +1138,7 @@ static bool NeedsParenthesis(HLSLExpression* expression, HLSLExpression* parentE return true; } -bool MSLGenerator::NeedsCast(const HLSLType & target, const HLSLType & source) +bool MSLGenerator::NeedsCast(const HLSLType& target, const HLSLType& source) { HLSLBaseType targetType = target.baseType; HLSLBaseType sourceType = source.baseType; @@ -1294,23 +1153,20 @@ bool MSLGenerator::NeedsCast(const HLSLType & target, const HLSLType & source) return false; }*/ - if (m_options.treatHalfAsFloat) - { + if (m_options.treatHalfAsFloat) { // use call to convert half back to float type if (IsHalf(targetType)) targetType = HalfToFloatBaseType(targetType); - if (IsHalf(sourceType)) sourceType = HalfToFloatBaseType(sourceType ); + if (IsHalf(sourceType)) sourceType = HalfToFloatBaseType(sourceType); } return targetType != sourceType && (IsCoreTypeEqual(targetType, sourceType) || IsScalarType(sourceType)); } - void MSLGenerator::OutputTypedExpression(const HLSLType& type, HLSLExpression* expression, HLSLExpression* parentExpression) { // If base types are not exactly the same, do explicit cast. bool closeCastExpression = false; - if (NeedsCast(type, expression->expressionType)) - { + if (NeedsCast(type, expression->expressionType)) { OutputCast(type); m_writer.Write("("); closeCastExpression = true; @@ -1318,28 +1174,24 @@ void MSLGenerator::OutputTypedExpression(const HLSLType& type, HLSLExpression* e OutputExpression(expression, parentExpression); - if (closeCastExpression) - { + if (closeCastExpression) { m_writer.Write(")"); } } void MSLGenerator::OutputExpression(HLSLExpression* expression, HLSLExpression* parentExpression) { - if (expression->nodeType == HLSLNodeType_IdentifierExpression) - { + if (expression->nodeType == HLSLNodeType_IdentifierExpression) { HLSLIdentifierExpression* identifierExpression = static_cast(expression); const char* name = identifierExpression->name; - + { - if (identifierExpression->global) - { + if (identifierExpression->global) { // prepend cbuffer name - HLSLBuffer * buffer; - HLSLDeclaration * declaration = m_tree->FindGlobalDeclaration(identifierExpression->name, &buffer); + HLSLBuffer* buffer; + HLSLDeclaration* declaration = m_tree->FindGlobalDeclaration(identifierExpression->name, &buffer); - if (declaration && declaration->buffer) - { + if (declaration && declaration->buffer) { ASSERT(buffer == declaration->buffer); m_writer.Write("%s.", declaration->buffer->name); } @@ -1369,177 +1221,214 @@ void MSLGenerator::OutputExpression(HLSLExpression* expression, HLSLExpression* }*/ } } - else if (expression->nodeType == HLSLNodeType_CastingExpression) - { + else if (expression->nodeType == HLSLNodeType_CastingExpression) { HLSLCastingExpression* castingExpression = static_cast(expression); OutputCast(castingExpression->type); m_writer.Write("("); OutputExpression(castingExpression->expression, castingExpression); m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_ConstructorExpression) - { + else if (expression->nodeType == HLSLNodeType_ConstructorExpression) { HLSLConstructorExpression* constructorExpression = static_cast(expression); - + 
m_writer.Write("%s(", GetTypeName(constructorExpression->type, /*exactType=*/false)); //OutputExpressionList(constructorExpression->type, constructorExpression->argument); // @@ Get element type. OutputExpressionList(constructorExpression->argument); m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_LiteralExpression) - { + else if (expression->nodeType == HLSLNodeType_LiteralExpression) { HLSLLiteralExpression* literalExpression = static_cast(expression); - + HLSLBaseType type = literalExpression->type; if (m_options.treatHalfAsFloat && IsHalf(type)) type = HLSLBaseType_Float; - - switch (type) - { - - case HLSLBaseType_Half: - case HLSLBaseType_Double: - case HLSLBaseType_Float: - { - char floatBuffer[64]; - - String_FormatFloat(floatBuffer, sizeof(floatBuffer), literalExpression->fValue); - String_StripTrailingFloatZeroes(floatBuffer); - m_writer.Write("%s%s", floatBuffer, type == HLSLBaseType_Half ? "h" : ""); - break; - } - // TODO: missing uint types (trailing character u, ul, ..) - - case HLSLBaseType_Short: - case HLSLBaseType_Long: - case HLSLBaseType_Int: - m_writer.Write("%d", literalExpression->iValue); - break; - - case HLSLBaseType_Bool: - m_writer.Write("%s", literalExpression->bValue ? "true" : "false"); - break; - default: - Error("Unhandled literal"); - //ASSERT(0); + + switch (type) { + case HLSLBaseType_Half: + case HLSLBaseType_Double: + case HLSLBaseType_Float: { + char floatBuffer[64]; + + String_FormatFloat(floatBuffer, sizeof(floatBuffer), literalExpression->fValue); + String_StripTrailingFloatZeroes(floatBuffer); + m_writer.Write("%s%s", floatBuffer, type == HLSLBaseType_Half ? "h" : ""); + break; + } + // TODO: missing uint types (trailing character u, ul, ..) + + case HLSLBaseType_Short: + case HLSLBaseType_Long: + case HLSLBaseType_Int: + m_writer.Write("%d", literalExpression->iValue); + break; + + case HLSLBaseType_Bool: + m_writer.Write("%s", literalExpression->bValue ? 
"true" : "false"); + break; + default: + Error("Unhandled literal"); + //ASSERT(0); } } - else if (expression->nodeType == HLSLNodeType_UnaryExpression) - { + else if (expression->nodeType == HLSLNodeType_UnaryExpression) { HLSLUnaryExpression* unaryExpression = static_cast(expression); const char* op = "?"; bool pre = true; - switch (unaryExpression->unaryOp) - { - case HLSLUnaryOp_Negative: op = "-"; break; - case HLSLUnaryOp_Positive: op = "+"; break; - case HLSLUnaryOp_Not: op = "!"; break; - case HLSLUnaryOp_BitNot: op = "~"; break; - case HLSLUnaryOp_PreIncrement: op = "++"; break; - case HLSLUnaryOp_PreDecrement: op = "--"; break; - case HLSLUnaryOp_PostIncrement: op = "++"; pre = false; break; - case HLSLUnaryOp_PostDecrement: op = "--"; pre = false; break; + switch (unaryExpression->unaryOp) { + case HLSLUnaryOp_Negative: + op = "-"; + break; + case HLSLUnaryOp_Positive: + op = "+"; + break; + case HLSLUnaryOp_Not: + op = "!"; + break; + case HLSLUnaryOp_BitNot: + op = "~"; + break; + case HLSLUnaryOp_PreIncrement: + op = "++"; + break; + case HLSLUnaryOp_PreDecrement: + op = "--"; + break; + case HLSLUnaryOp_PostIncrement: + op = "++"; + pre = false; + break; + case HLSLUnaryOp_PostDecrement: + op = "--"; + pre = false; + break; } bool addParenthesis = NeedsParenthesis(unaryExpression->expression, expression); if (addParenthesis) m_writer.Write("("); - if (pre) - { + if (pre) { m_writer.Write("%s", op); OutputExpression(unaryExpression->expression, unaryExpression); } - else - { + else { OutputExpression(unaryExpression->expression, unaryExpression); m_writer.Write("%s", op); } if (addParenthesis) m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_BinaryExpression) - { + else if (expression->nodeType == HLSLNodeType_BinaryExpression) { HLSLBinaryExpression* binaryExpression = static_cast(expression); bool addParenthesis = NeedsParenthesis(expression, parentExpression); if (addParenthesis) m_writer.Write("("); - + { - if (IsArithmeticOp(binaryExpression->binaryOp) || IsLogicOp(binaryExpression->binaryOp)) - { + if (IsArithmeticOp(binaryExpression->binaryOp) || IsLogicOp(binaryExpression->binaryOp)) { // Do intermediate type promotion, without changing dimension: HLSLType promotedType = binaryExpression->expression1->expressionType; - if (!IsNumericTypeEqual(binaryExpression->expressionType.baseType, promotedType.baseType)) - { + if (!IsNumericTypeEqual(binaryExpression->expressionType.baseType, promotedType.baseType)) { promotedType.baseType = PromoteType(binaryExpression->expressionType.baseType, promotedType.baseType); } OutputTypedExpression(promotedType, binaryExpression->expression1, binaryExpression); } - else - { + else { OutputExpression(binaryExpression->expression1, binaryExpression); } const char* op = "?"; - switch (binaryExpression->binaryOp) - { - case HLSLBinaryOp_Add: op = " + "; break; - case HLSLBinaryOp_Sub: op = " - "; break; - case HLSLBinaryOp_Mul: op = " * "; break; - case HLSLBinaryOp_Div: op = " / "; break; - case HLSLBinaryOp_Less: op = " < "; break; - case HLSLBinaryOp_Greater: op = " > "; break; - case HLSLBinaryOp_LessEqual: op = " <= "; break; - case HLSLBinaryOp_GreaterEqual: op = " >= "; break; - case HLSLBinaryOp_Equal: op = " == "; break; - case HLSLBinaryOp_NotEqual: op = " != "; break; - case HLSLBinaryOp_Assign: op = " = "; break; - case HLSLBinaryOp_AddAssign: op = " += "; break; - case HLSLBinaryOp_SubAssign: op = " -= "; break; - case HLSLBinaryOp_MulAssign: op = " *= "; break; - case HLSLBinaryOp_DivAssign: op = " /= "; 
break; - case HLSLBinaryOp_And: op = " && "; break; - case HLSLBinaryOp_Or: op = " || "; break; - case HLSLBinaryOp_BitAnd: op = " & "; break; - case HLSLBinaryOp_BitOr: op = " | "; break; - case HLSLBinaryOp_BitXor: op = " ^ "; break; - default: - Error("unhandled literal"); - //ASSERT(0); + switch (binaryExpression->binaryOp) { + case HLSLBinaryOp_Add: + op = " + "; + break; + case HLSLBinaryOp_Sub: + op = " - "; + break; + case HLSLBinaryOp_Mul: + op = " * "; + break; + case HLSLBinaryOp_Div: + op = " / "; + break; + case HLSLBinaryOp_Less: + op = " < "; + break; + case HLSLBinaryOp_Greater: + op = " > "; + break; + case HLSLBinaryOp_LessEqual: + op = " <= "; + break; + case HLSLBinaryOp_GreaterEqual: + op = " >= "; + break; + case HLSLBinaryOp_Equal: + op = " == "; + break; + case HLSLBinaryOp_NotEqual: + op = " != "; + break; + case HLSLBinaryOp_Assign: + op = " = "; + break; + case HLSLBinaryOp_AddAssign: + op = " += "; + break; + case HLSLBinaryOp_SubAssign: + op = " -= "; + break; + case HLSLBinaryOp_MulAssign: + op = " *= "; + break; + case HLSLBinaryOp_DivAssign: + op = " /= "; + break; + case HLSLBinaryOp_And: + op = " && "; + break; + case HLSLBinaryOp_Or: + op = " || "; + break; + case HLSLBinaryOp_BitAnd: + op = " & "; + break; + case HLSLBinaryOp_BitOr: + op = " | "; + break; + case HLSLBinaryOp_BitXor: + op = " ^ "; + break; + default: + Error("unhandled literal"); + //ASSERT(0); } m_writer.Write("%s", op); - if (binaryExpression->binaryOp == HLSLBinaryOp_MulAssign || binaryExpression->binaryOp == HLSLBinaryOp_DivAssign || IsArithmeticOp(binaryExpression->binaryOp) || - IsLogicOp(binaryExpression->binaryOp)) - { + IsLogicOp(binaryExpression->binaryOp)) { // Do intermediate type promotion, without changing dimension: HLSLType promotedType = binaryExpression->expression2->expressionType; - if (!IsNumericTypeEqual(binaryExpression->expressionType.baseType, promotedType.baseType)) - { + if (!IsNumericTypeEqual(binaryExpression->expressionType.baseType, promotedType.baseType)) { // This should only promote up (half->float, etc) promotedType.baseType = PromoteType(binaryExpression->expressionType.baseType, promotedType.baseType); } OutputTypedExpression(promotedType, binaryExpression->expression2, binaryExpression); } - else if (IsAssignOp(binaryExpression->binaryOp)) - { + else if (IsAssignOp(binaryExpression->binaryOp)) { OutputTypedExpression(binaryExpression->expressionType, binaryExpression->expression2, binaryExpression); } - else - { + else { OutputExpression(binaryExpression->expression2, binaryExpression); } } if (addParenthesis) m_writer.Write(")"); } - else if (expression->nodeType == HLSLNodeType_ConditionalExpression) - { + else if (expression->nodeType == HLSLNodeType_ConditionalExpression) { HLSLConditionalExpression* conditionalExpression = static_cast(expression); - + // TODO: @@ Remove parenthesis. 
m_writer.Write("(("); OutputExpression(conditionalExpression->condition, NULL); @@ -1549,27 +1438,23 @@ void MSLGenerator::OutputExpression(HLSLExpression* expression, HLSLExpression* OutputExpression(conditionalExpression->falseExpression, NULL); m_writer.Write("))"); } - else if (expression->nodeType == HLSLNodeType_MemberAccess) - { + else if (expression->nodeType == HLSLNodeType_MemberAccess) { HLSLMemberAccess* memberAccess = static_cast(expression); bool addParenthesis = NeedsParenthesis(memberAccess->object, expression); - if (addParenthesis) - { + if (addParenthesis) { m_writer.Write("("); } OutputExpression(memberAccess->object, NULL); - if (addParenthesis) - { + if (addParenthesis) { m_writer.Write(")"); } m_writer.Write(".%s", memberAccess->field); } - else if (expression->nodeType == HLSLNodeType_ArrayAccess) - { + else if (expression->nodeType == HLSLNodeType_ArrayAccess) { HLSLArrayAccess* arrayAccess = static_cast(expression); - + // Just use the matrix notation, using column_order instead of row_order //if (arrayAccess->array->expressionType.array) // || !IsMatrixType(arrayAccess->array->expressionType.baseType)) { @@ -1578,32 +1463,29 @@ void MSLGenerator::OutputExpression(HLSLExpression* expression, HLSLExpression* OutputExpression(arrayAccess->index, NULL); m_writer.Write("]"); } -// else -// { -// // @@ This doesn't work for l-values! -// m_writer.Write("column("); -// OutputExpression(arrayAccess->array, NULL); -// m_writer.Write(", "); -// OutputExpression(arrayAccess->index, NULL); -// m_writer.Write(")"); -// } - } - else if (expression->nodeType == HLSLNodeType_FunctionCall) - { + // else + // { + // // @@ This doesn't work for l-values! + // m_writer.Write("column("); + // OutputExpression(arrayAccess->array, NULL); + // m_writer.Write(", "); + // OutputExpression(arrayAccess->index, NULL); + // m_writer.Write(")"); + // } + } + else if (expression->nodeType == HLSLNodeType_FunctionCall) { HLSLFunctionCall* functionCall = static_cast(expression); OutputFunctionCall(functionCall, parentExpression); } - else if (expression->nodeType == HLSLNodeType_MemberFunctionCall) - { + else if (expression->nodeType == HLSLNodeType_MemberFunctionCall) { HLSLMemberFunctionCall* functionCall = static_cast(expression); - + // Write out the member identifier m_writer.Write("%s.", functionCall->memberIdentifier->name); OutputFunctionCall(functionCall, parentExpression); } - else - { + else { Error("unknown expression"); } } @@ -1611,16 +1493,13 @@ void MSLGenerator::OutputExpression(HLSLExpression* expression, HLSLExpression* void MSLGenerator::OutputCast(const HLSLType& type) { // Note: msl fails on float4x4 to float3x3 casting - if (type.baseType == HLSLBaseType_Float3x3) - { + if (type.baseType == HLSLBaseType_Float3x3) { m_writer.Write("tofloat3x3"); } - else if (type.baseType == HLSLBaseType_Half3x3) - { + else if (type.baseType == HLSLBaseType_Half3x3) { m_writer.Write("tohalft3x3"); } - else - { + else { m_writer.Write("("); OutputDeclarationType(type, /*isConst=*/false, /*isRef=*/false, /*alignment=*/0, /*isTypeCast=*/true); m_writer.Write(")"); @@ -1631,17 +1510,14 @@ void MSLGenerator::OutputCast(const HLSLType& type) void MSLGenerator::OutputArguments(HLSLArgument* argument) { int numArgs = 0; - while (argument != NULL) - { + while (argument != NULL) { // Skip hidden and output arguments. 
- if (argument->hidden || argument->modifier == HLSLArgumentModifier_Out) - { + if (argument->hidden || argument->modifier == HLSLArgumentModifier_Out) { argument = argument->nextArgument; continue; } - if (numArgs > 0) - { + if (numArgs > 0) { m_writer.Write(", "); } @@ -1651,8 +1527,7 @@ void MSLGenerator::OutputArguments(HLSLArgument* argument) { isRef = true; }*/ - if (argument->modifier == HLSLArgumentModifier_In || argument->modifier == HLSLArgumentModifier_Const) - { + if (argument->modifier == HLSLArgumentModifier_In || argument->modifier == HLSLArgumentModifier_Const) { isConst = true; } @@ -1670,76 +1545,62 @@ void MSLGenerator::OutputDeclaration(const HLSLType& type, const char* name, HLS void MSLGenerator::OutputDeclarationType(const HLSLType& type, bool isRef, bool isConst, int alignment, bool isTypeCast) { - const char* typeName = GetTypeName(type, /*exactType=*/isTypeCast); // @@ Don't allow type changes in uniform/globals or casts! + const char* typeName = GetTypeName(type, /*exactType=*/isTypeCast); // @@ Don't allow type changes in uniform/globals or casts! /*if (!isTypeCast)*/ { - if (isRef && !isTypeCast) - { + if (isRef && !isTypeCast) { m_writer.Write("%s ", GetAddressSpaceName(type.baseType, type.addressSpace)); } - if (isConst || type.TestFlags(HLSLTypeFlag_Const)) - { + if (isConst || type.TestFlags(HLSLTypeFlag_Const)) { m_writer.Write("constant "); - -// m_writer.Write("const "); -// -// if ((type.flags & HLSLTypeFlag_Static) != 0 && !isTypeCast) -// { -// // TODO: use GetAddressSpaceName? -// m_writer.Write("static constant constexpr "); -// } + + // m_writer.Write("const "); + // + // if ((type.flags & HLSLTypeFlag_Static) != 0 && !isTypeCast) + // { + // // TODO: use GetAddressSpaceName? + // m_writer.Write("static constant constexpr "); + // } } } - - if (alignment != 0 && !isTypeCast) - { + + if (alignment != 0 && !isTypeCast) { // caller can request alignment, but default is 0 m_writer.Write("alignas(%d) ", alignment); } m_writer.Write("%s", typeName); - if (isTypeCast) - { + if (isTypeCast) { // Do not output modifiers inside type cast expressions. return; } // Interpolation modifiers. - if (type.TestFlags(HLSLTypeFlag_NoInterpolation)) - { + if (type.TestFlags(HLSLTypeFlag_NoInterpolation)) { m_writer.Write(" [[flat]]"); } - else - { - if (type.TestFlags(HLSLTypeFlag_NoPerspective)) - { - if (type.TestFlags(HLSLTypeFlag_Centroid)) - { + else { + if (type.TestFlags(HLSLTypeFlag_NoPerspective)) { + if (type.TestFlags(HLSLTypeFlag_Centroid)) { m_writer.Write(" [[centroid_no_perspective]]"); } - else if (type.TestFlags(HLSLTypeFlag_Sample)) - { + else if (type.TestFlags(HLSLTypeFlag_Sample)) { m_writer.Write(" [[sample_no_perspective]]"); } - else - { + else { m_writer.Write(" [[center_no_perspective]]"); } } - else - { - if (type.TestFlags(HLSLTypeFlag_Centroid)) - { + else { + if (type.TestFlags(HLSLTypeFlag_Centroid)) { m_writer.Write(" [[centroid_perspective]]"); } - else if (type.TestFlags(HLSLTypeFlag_Sample)) - { + else if (type.TestFlags(HLSLTypeFlag_Sample)) { m_writer.Write(" [[sample_perspective]]"); } - else - { + else { // Default. 
//m_writer.Write(" [[center_perspective]]"); } @@ -1749,8 +1610,7 @@ void MSLGenerator::OutputDeclarationType(const HLSLType& type, bool isRef, bool void MSLGenerator::OutputDeclarationBody(const HLSLType& type, const char* name, HLSLExpression* assignment, bool isRef) { - if (isRef) - { + if (isRef) { // Arrays of refs are illegal in C++ and hence MSL, need to "link" the & to the var name m_writer.Write("(&"); } @@ -1758,17 +1618,14 @@ void MSLGenerator::OutputDeclarationBody(const HLSLType& type, const char* name, // Then name m_writer.Write(" %s", name); - if (isRef) - { + if (isRef) { m_writer.Write(")"); } // Add brackets for arrays - if (type.array) - { + if (type.array) { m_writer.Write("["); - if (type.arraySize != NULL) - { + if (type.arraySize != NULL) { OutputExpression(type.arraySize, NULL); } m_writer.Write("]"); @@ -1777,17 +1634,14 @@ void MSLGenerator::OutputDeclarationBody(const HLSLType& type, const char* name, // Semantics and registers unhandled for now // Assignment handling - if (assignment != NULL) - { + if (assignment != NULL) { m_writer.Write(" = "); - if (type.array) - { + if (type.array) { m_writer.Write("{ "); OutputExpressionList(assignment); m_writer.Write(" }"); } - else - { + else { OutputTypedExpression(type, assignment, NULL); } } @@ -1796,10 +1650,8 @@ void MSLGenerator::OutputDeclarationBody(const HLSLType& type, const char* name, void MSLGenerator::OutputExpressionList(HLSLExpression* expression) { int numExpressions = 0; - while (expression != NULL) - { - if (numExpressions > 0) - { + while (expression != NULL) { + if (numExpressions > 0) { m_writer.Write(", "); } OutputExpression(expression, NULL); @@ -1809,13 +1661,11 @@ void MSLGenerator::OutputExpressionList(HLSLExpression* expression) } // Cast all expressions to given type. 
-void MSLGenerator::OutputExpressionList(const HLSLType & type, HLSLExpression* expression) +void MSLGenerator::OutputExpressionList(const HLSLType& type, HLSLExpression* expression) { int numExpressions = 0; - while (expression != NULL) - { - if (numExpressions > 0) - { + while (expression != NULL) { + if (numExpressions > 0) { m_writer.Write(", "); } @@ -1829,13 +1679,10 @@ void MSLGenerator::OutputExpressionList(const HLSLType & type, HLSLExpression* e void MSLGenerator::OutputExpressionList(HLSLArgument* argument, HLSLExpression* expression) { int numExpressions = 0; - while (expression != NULL) - { + while (expression != NULL) { ASSERT(argument != NULL); - if (argument->modifier != HLSLArgumentModifier_Out) - { - if (numExpressions > 0) - { + if (argument->modifier != HLSLArgumentModifier_Out) { + if (numExpressions > 0) { m_writer.Write(", "); } @@ -1848,46 +1695,36 @@ void MSLGenerator::OutputExpressionList(HLSLArgument* argument, HLSLExpression* } } - - inline bool isAddressable(HLSLExpression* expression) { - if (expression->nodeType == HLSLNodeType_IdentifierExpression) - { + if (expression->nodeType == HLSLNodeType_IdentifierExpression) { return true; } - if (expression->nodeType == HLSLNodeType_ArrayAccess) - { + if (expression->nodeType == HLSLNodeType_ArrayAccess) { return true; } - if (expression->nodeType == HLSLNodeType_MemberAccess) - { + if (expression->nodeType == HLSLNodeType_MemberAccess) { HLSLMemberAccess* memberAccess = (HLSLMemberAccess*)expression; return !memberAccess->swizzle; } return false; } - void MSLGenerator::OutputFunctionCallStatement(int indent, HLSLFunctionCall* functionCall, HLSLDeclaration* declaration) { // Nothing special about these cases: - if (functionCall->function->numOutputArguments == 0) - { + if (functionCall->function->numOutputArguments == 0) { m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); - if (declaration) - { + if (declaration) { OutputDeclaration(declaration); } - else - { + else { OutputExpression(functionCall, NULL); } m_writer.EndLine(";"); return; } - // Transform this: // float foo = functionCall(bah, poo); @@ -1904,12 +1741,10 @@ void MSLGenerator::OutputFunctionCallStatement(int indent, HLSLFunctionCall* fun OutputExpressionList(functionCall->function->argument, functionCall->argument); m_writer.EndLine(");"); - HLSLExpression * expression = functionCall->argument; - HLSLArgument * argument = functionCall->function->argument; - while (argument != NULL) - { - if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) - { + HLSLExpression* expression = functionCall->argument; + HLSLArgument* argument = functionCall->function->argument; + while (argument != NULL) { + if (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout) { m_writer.BeginLine(indent); OutputExpression(expression, NULL); // @@ This assignment may need a cast. @@ -1925,99 +1760,96 @@ void MSLGenerator::OutputFunctionCallStatement(int indent, HLSLFunctionCall* fun argument = argument->nextArgument; } - if (declaration) - { + if (declaration) { m_writer.BeginLine(indent); OutputDeclarationType(declaration->type); m_writer.Write(" %s = out%d.__result;", declaration->name, functionCall->line); m_writer.EndLine(); } + /* TODO: Alec, why is all this chopped out? -/* TODO: Alec, why is all this chopped out? 
+ int argumentIndex = 0; + HLSLArgument* argument = functionCall->function->argument; + HLSLExpression* expression = functionCall->argument; + while (argument != NULL) + { + if (!isAddressable(expression)) + { + if (argument->modifier == HLSLArgumentModifier_Out) + { + m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); + OutputDeclarationType(argument->type); + m_writer.Write("tmp%d;", argumentIndex); + m_writer.EndLine(); + } + else if (argument->modifier == HLSLArgumentModifier_Inout) + { + m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); + OutputDeclarationType(argument->type); + m_writer.Write("tmp%d = ", argumentIndex); + OutputExpression(expression, NULL); + m_writer.EndLine(";"); + } + } + argument = argument->nextArgument; + expression = expression->nextExpression; + argumentIndex++; + } - int argumentIndex = 0; - HLSLArgument* argument = functionCall->function->argument; - HLSLExpression* expression = functionCall->argument; - while (argument != NULL) - { - if (!isAddressable(expression)) + m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); + const char* name = functionCall->function->name; + m_writer.Write("%s(", name); + //OutputExpressionList(functionCall->argument); + + // Output expression list with temporary substitution. + argumentIndex = 0; + argument = functionCall->function->argument; + expression = functionCall->argument; + while (expression != NULL) { - if (argument->modifier == HLSLArgumentModifier_Out) + if (!isAddressable(expression) && (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout)) { - m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); - OutputDeclarationType(argument->type); - m_writer.Write("tmp%d;", argumentIndex); - m_writer.EndLine(); + m_writer.Write("tmp%d", argumentIndex); } - else if (argument->modifier == HLSLArgumentModifier_Inout) + else { - m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); - OutputDeclarationType(argument->type); - m_writer.Write("tmp%d = ", argumentIndex); OutputExpression(expression, NULL); - m_writer.EndLine(";"); } - } - argument = argument->nextArgument; - expression = expression->nextExpression; - argumentIndex++; - } - m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); - const char* name = functionCall->function->name; - m_writer.Write("%s(", name); - //OutputExpressionList(functionCall->argument); - - // Output expression list with temporary substitution. 
- argumentIndex = 0; - argument = functionCall->function->argument; - expression = functionCall->argument; - while (expression != NULL) - { - if (!isAddressable(expression) && (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout)) - { - m_writer.Write("tmp%d", argumentIndex); - } - else - { - OutputExpression(expression, NULL); + argument = argument->nextArgument; + expression = expression->nextExpression; + argumentIndex++; + if (expression) + { + m_writer.Write(", "); + } } + m_writer.EndLine(");"); - argument = argument->nextArgument; - expression = expression->nextExpression; - argumentIndex++; - if (expression) + argumentIndex = 0; + argument = functionCall->function->argument; + expression = functionCall->argument; + while (expression != NULL) { - m_writer.Write(", "); - } - } - m_writer.EndLine(");"); + if (!isAddressable(expression) && (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout)) + { + m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); + OutputExpression(expression, NULL); + m_writer.Write(" = tmp%d", argumentIndex); + m_writer.EndLine(";"); + } - argumentIndex = 0; - argument = functionCall->function->argument; - expression = functionCall->argument; - while (expression != NULL) - { - if (!isAddressable(expression) && (argument->modifier == HLSLArgumentModifier_Out || argument->modifier == HLSLArgumentModifier_Inout)) - { - m_writer.BeginLine(indent, functionCall->fileName, functionCall->line); - OutputExpression(expression, NULL); - m_writer.Write(" = tmp%d", argumentIndex); - m_writer.EndLine(";"); + argument = argument->nextArgument; + expression = expression->nextExpression; + argumentIndex++; } - - argument = argument->nextArgument; - expression = expression->nextExpression; - argumentIndex++; - } -*/ + */ } -void MSLGenerator::OutputFunctionCall(HLSLFunctionCall* functionCall, HLSLExpression * parentExpression) +void MSLGenerator::OutputFunctionCall(HLSLFunctionCall* functionCall, HLSLExpression* parentExpression) { - if (functionCall->function->numOutputArguments > 0) - { + if (functionCall->function->numOutputArguments > 0) { ASSERT(false); } @@ -2043,7 +1875,7 @@ void MSLGenerator::OutputFunctionCall(HLSLFunctionCall* functionCall, HLSLExpres } } -const char* MSLGenerator::TranslateInputSemantic(const char * semantic) +const char* MSLGenerator::TranslateInputSemantic(const char* semantic) { if (semantic == NULL) return NULL; @@ -2051,8 +1883,7 @@ const char* MSLGenerator::TranslateInputSemantic(const char * semantic) uint32_t length, index; ParseSemantic(semantic, &length, &index); - if (m_target == HLSLTarget_VertexShader) - { + if (m_target == HLSLTarget_VertexShader) { // These are DX10 convention if (String_Equal(semantic, "SV_InstanceID")) return "instance_id"; @@ -2067,14 +1898,13 @@ const char* MSLGenerator::TranslateInputSemantic(const char * semantic) return "base_instance"; //if (String_Equal(semantic, "DRAW_INDEX")) // return "draw_index"; - + // TODO: primitive_id, barycentric - + // Handle attributes - + // Can set custom attributes via a callback - if (m_options.attributeCallback) - { + if (m_options.attributeCallback) { char name[64]; ASSERT(length < sizeof(name)); @@ -2083,45 +1913,42 @@ const char* MSLGenerator::TranslateInputSemantic(const char * semantic) int attribute = m_options.attributeCallback(name, index); - if (attribute >= 0) - { + if (attribute >= 0) { return m_tree->AddStringFormat("attribute(%d)", attribute); } } - + if 
(String_Equal(semantic, "SV_Position")) return "attribute(POSITION)"; return m_tree->AddStringFormat("attribute(%s)", semantic); } - else if (m_target == HLSLTarget_PixelShader) - { + else if (m_target == HLSLTarget_PixelShader) { // PS inputs - + if (String_Equal(semantic, "SV_Position")) return "position"; - - // if (String_Equal(semantic, "POSITION")) - // return "position"; + + // if (String_Equal(semantic, "POSITION")) + // return "position"; if (String_Equal(semantic, "SV_IsFrontFace")) return "front_facing"; - + // VS sets what layer to render into, ps can look at it. // Gpu Family 5. if (String_Equal(semantic, "SV_RenderTargetArrayIndex")) return "render_target_array_index"; - + // dual source? passes in underlying color if (String_Equal(semantic, "DST_COLOR")) return "color(0)"; - + if (String_Equal(semantic, "SV_SampleIndex")) return "sample_id"; //if (String_Equal(semantic, "SV_Coverage")) return "sample_mask"; //if (String_Equal(semantic, "SV_Coverage")) return "sample_mask,post_depth_coverage"; } - else if (m_target == HLSLTarget_ComputeShader) - { + else if (m_target == HLSLTarget_ComputeShader) { // compute inputs if (String_Equal(semantic, "SV_DispatchThreadID")) return "thread_position_in_grid"; @@ -2129,7 +1956,7 @@ const char* MSLGenerator::TranslateInputSemantic(const char * semantic) return NULL; } -const char* MSLGenerator::TranslateOutputSemantic(const char * semantic) +const char* MSLGenerator::TranslateOutputSemantic(const char* semantic) { if (semantic == NULL) return NULL; @@ -2137,11 +1964,10 @@ const char* MSLGenerator::TranslateOutputSemantic(const char * semantic) uint32_t length, index; ParseSemantic(semantic, &length, &index); - if (m_target == HLSLTarget_VertexShader) - { + if (m_target == HLSLTarget_VertexShader) { if (String_Equal(semantic, "SV_Position")) return "position"; - + // PSIZE is non-square in DX9, and square in DX10 (and MSL) // https://github.com/KhronosGroup/glslang/issues/1154 if (String_Equal(semantic, "PSIZE")) @@ -2150,24 +1976,23 @@ const char* MSLGenerator::TranslateOutputSemantic(const char * semantic) // control layer in Gpu Family 5 if (String_Equal(semantic, "SV_RenderTargetArrayIndex")) return "render_target_array_index"; - + // TODO: add // SV_ViewportArrayIndex // SV_ClipDistance0..n, SV_CullDistance0..n } - else if (m_target == HLSLTarget_PixelShader) - { -// Not supporting flags, add as bool to options if needed -// if (m_options.flags & MSLGenerator::Flag_NoIndexAttribute) -// { -// // No dual-source blending on iOS, and no index() attribute -// if (String_Equal(semantic, "COLOR0_1")) return NULL; -// } -// else + else if (m_target == HLSLTarget_PixelShader) { + // Not supporting flags, add as bool to options if needed + // if (m_options.flags & MSLGenerator::Flag_NoIndexAttribute) + // { + // // No dual-source blending on iOS, and no index() attribute + // if (String_Equal(semantic, "COLOR0_1")) return NULL; + // } + // else { // See these settings // MTLBlendFactorSource1Color, OneMinusSource1Color, Source1Alpha, OneMinuSource1Alpha. - + // @@ IC: Hardcoded for this specific case, extend ParseSemantic? if (String_Equal(semantic, "COLOR0_1")) return "color(0), index(1)"; @@ -2176,25 +2001,24 @@ const char* MSLGenerator::TranslateOutputSemantic(const char * semantic) // This is only in A14 and higher if (String_Equal(semantic, "SV_Berycentrics")) return "barycentric_coord"; - + // Is there an HLSL euivalent. 
Have vulkan ext for PointSize // "point_coord" - + // "primitive_id" - - if (strncmp(semantic, "SV_Target", length) == 0) - { + + if (strncmp(semantic, "SV_Target", length) == 0) { return m_tree->AddStringFormat("color(%d)", index); } -// if (strncmp(semantic, "COLOR", length) == 0) -// { -// return m_tree->AddStringFormat("color(%d)", index); -// } + // if (strncmp(semantic, "COLOR", length) == 0) + // { + // return m_tree->AddStringFormat("color(%d)", index); + // } // depth variants to preserve earlyz, use greater on reverseZ if (String_Equal(semantic, "SV_Depth")) return "depth(any)"; - + // These don't quite line up, since comparison is not == // Metal can only use any/less/greater. Preserve early z when outputting depth. // reverseZ would use greater. @@ -2202,19 +2026,16 @@ const char* MSLGenerator::TranslateOutputSemantic(const char * semantic) return "depth(greater)"; if (String_Equal(semantic, "SV_DepthLessEqual")) return "depth(less)"; - + if (String_Equal(semantic, "SV_Coverage")) return "sample_mask"; } - else if (m_target == HLSLTarget_ComputeShader) - { + else if (m_target == HLSLTarget_ComputeShader) { // compute outputs - } return NULL; } - const char* MSLGenerator::GetTypeName(const HLSLType& type, bool exactType) { bool promote = ((type.flags & HLSLTypeFlag_NoPromote) == 0); @@ -2222,59 +2043,57 @@ const char* MSLGenerator::GetTypeName(const HLSLType& type, bool exactType) // number bool isHalfNumerics = promote && !m_options.treatHalfAsFloat; HLSLBaseType baseType = type.baseType; - + // Note: these conversions should really be done during parsing // so that casting gets applied. if (!isHalfNumerics) baseType = HalfToFloatBaseType(baseType); - + // MSL doesn't support double if (IsDouble(baseType)) baseType = DoubleToFloatBaseType(baseType); - + HLSLType remappedType(baseType); remappedType.typeName = type.typeName; // in case it's a struct - + if (IsSamplerType(baseType) || IsNumericType(baseType) || baseType == HLSLBaseType_Void || baseType == HLSLBaseType_UserDefined) return GetTypeNameMetal(remappedType); - + // texture - if (IsTextureType(baseType)) - { + if (IsTextureType(baseType)) { // unclear if depth supports half, may have to be float always - - bool isHalfTexture = promote && IsHalf(type.formatType) && !m_options.treatHalfAsFloat; - + + bool isHalfTexture = promote && IsHalf(type.formatType) && !m_options.treatHalfAsFloat; + // MSL docs state must be float type, but what about D16f texture? if (IsDepthTextureType(baseType)) isHalfTexture = false; - + // TODO: could use GetTypeNameMetal() but it doesn't include <> portion // so would have to pool and then return the result. - + // This would allow more formats // const char* textureTypeName = GetTypeNameMetal(baseType); // const char* formatTypeName = GetFormatTypeName(baseType, formatType); // snprintf(buf, sizeof(buf), "%s<%s>", textureTypeName, formatTypeName); - - switch (baseType) - { + + switch (baseType) { case HLSLBaseType_Depth2D: return isHalfTexture ? "depth2d" : "depth2d"; case HLSLBaseType_Depth2DArray: return isHalfTexture ? "depth2d_array" : "depth2d_array"; case HLSLBaseType_DepthCube: return isHalfTexture ? "depthcube" : "depthcube"; - + /* TODO: also depth_ms_array, but HLSL6.6 equivalent case HLSLBaseType_Depth2DMS: return isHalfTexture ? "depth2d_ms" : "depth2d_ms"; */ - + // More types than just half/float for this case HLSLBaseType_RWTexture2D: return isHalfTexture ? "texture2d" : "texture2d"; - + case HLSLBaseType_Texture2D: return isHalfTexture ? 
"texture2d" : "texture2d"; case HLSLBaseType_Texture2DArray: @@ -2287,14 +2106,14 @@ const char* MSLGenerator::GetTypeName(const HLSLType& type, bool exactType) return isHalfTexture ? "texturecube_array" : "texturecube_array"; case HLSLBaseType_Texture2DMS: return isHalfTexture ? "texture2d_ms" : "texture2d_ms"; - + default: break; } } - + Error("Unknown Type"); return NULL; } -} // M4 +} //namespace M4 diff --git a/hlslparser/src/MSLGenerator.h b/hlslparser/src/MSLGenerator.h index 25cd4d34..1b69b028 100644 --- a/hlslparser/src/MSLGenerator.h +++ b/hlslparser/src/MSLGenerator.h @@ -3,20 +3,18 @@ #include "CodeWriter.h" #include "HLSLTree.h" -namespace M4 -{ +namespace M4 { -class HLSLTree; +class HLSLTree; struct HLSLFunction; struct HLSLStruct; - -struct MSLOptions -{ + +struct MSLOptions { int (*attributeCallback)(const char* name, uint32_t index) = NULL; - + // no CLI to set offset uint32_t bufferRegisterOffset = 0; - + bool writeFileLine = false; bool treatHalfAsFloat = false; }; @@ -24,8 +22,7 @@ struct MSLOptions /** * This class is used to generate MSL shaders. */ -class MSLGenerator -{ +class MSLGenerator { public: MSLGenerator(); @@ -33,32 +30,29 @@ class MSLGenerator const char* GetResult() const; private: - // @@ Rename class argument. Add buffers & textures. - struct ClassArgument - { + struct ClassArgument { const char* name; HLSLType type; //const char* typeName; // @@ Do we need more than the type name? const char* registerName; bool isRef; - - ClassArgument * nextArg; - - ClassArgument(const char* name, HLSLType type, const char * registerName, bool isRef) : - name(name), type(type), registerName(registerName), isRef(isRef) - { - nextArg = NULL; - } + + ClassArgument* nextArg; + + ClassArgument(const char* name, HLSLType type, const char* registerName, bool isRef) : name(name), type(type), registerName(registerName), isRef(isRef) + { + nextArg = NULL; + } }; - void AddClassArgument(ClassArgument * arg); + void AddClassArgument(ClassArgument* arg); void Prepass(HLSLTree* tree, HLSLTarget target, HLSLFunction* entryFunction); void CleanPrepass(); - + void PrependDeclarations(); - + void OutputStaticDeclarations(int indent, HLSLStatement* statement); void OutputStatements(int indent, HLSLStatement* statement); void OutputAttributes(int indent, HLSLAttribute* attribute); @@ -68,9 +62,9 @@ class MSLGenerator void OutputFunction(int indent, HLSLFunction* function); void OutputExpression(HLSLExpression* expression, HLSLExpression* parentExpression); void OutputTypedExpression(const HLSLType& type, HLSLExpression* expression, HLSLExpression* parentExpression); - bool NeedsCast(const HLSLType & target, const HLSLType & source); + bool NeedsCast(const HLSLType& target, const HLSLType& source); void OutputCast(const HLSLType& type); - + void OutputArguments(HLSLArgument* argument); void OutputDeclaration(const HLSLType& type, const char* name, HLSLExpression* assignment, bool isRef = false, bool isConst = false, int alignment = 0); void OutputDeclarationType(const HLSLType& type, bool isConst = false, bool isRef = false, int alignment = 0, bool isTypeCast = false); @@ -78,36 +72,34 @@ class MSLGenerator void OutputExpressionList(HLSLExpression* expression); void OutputExpressionList(const HLSLType& type, HLSLExpression* expression); void OutputExpressionList(HLSLArgument* argument, HLSLExpression* expression); - + void OutputFunctionCallStatement(int indent, HLSLFunctionCall* functionCall, HLSLDeclaration* assingmentExpression); - void OutputFunctionCall(HLSLFunctionCall* 
functionCall, HLSLExpression * parentExpression); + void OutputFunctionCall(HLSLFunctionCall* functionCall, HLSLExpression* parentExpression); const char* TranslateInputSemantic(const char* semantic); const char* TranslateOutputSemantic(const char* semantic); const char* GetTypeName(const HLSLType& type, bool exactType); const char* GetAddressSpaceName(HLSLBaseType baseType, HLSLAddressSpace addressSpace) const; - + bool CanSkipWrittenStatement(const HLSLStatement* statement) const; - + void Error(const char* format, ...) const M4_PRINTF_ATTR(2, 3); private: + CodeWriter m_writer; - CodeWriter m_writer; + HLSLTree* m_tree; + const char* m_entryName; + HLSLTarget m_target; + MSLOptions m_options; - HLSLTree* m_tree; - const char* m_entryName; - HLSLTarget m_target; - MSLOptions m_options; + mutable bool m_error; - mutable bool m_error; + ClassArgument* m_firstClassArgument; + ClassArgument* m_lastClassArgument; - ClassArgument * m_firstClassArgument; - ClassArgument * m_lastClassArgument; - - HLSLFunction * m_currentFunction; + HLSLFunction* m_currentFunction; }; -} // M4 - +} //namespace M4 diff --git a/hlslparser/src/Main.cpp b/hlslparser/src/Main.cpp index addaa5d8..b471f26f 100644 --- a/hlslparser/src/Main.cpp +++ b/hlslparser/src/Main.cpp @@ -1,23 +1,22 @@ #include "HLSLParser.h" //#include "GLSLGenerator.h" -#include "HLSLGenerator.h" -#include "MSLGenerator.h" - #include #include #include +#include "HLSLGenerator.h" +#include "MSLGenerator.h" + using namespace std; -enum Language -{ +enum Language { Language_MSL, - Language_HLSL, + Language_HLSL, }; -bool ReadFile( const char* fileName, string& str ) +bool ReadFile(const char* fileName, string& str) { struct stat stats = {}; if (stat(fileName, &stats) < 0) { @@ -37,17 +36,16 @@ bool ReadFile( const char* fileName, string& str ) void PrintUsage() { - fprintf(stderr, - "usage: hlslparser [-h|-g] -i shader.hlsl -o [shader.hlsl | shader.metal]\n" - "Translate DX9-style HLSL shader to HLSL/MSL shader.\n" - " -i input HLSL\n" - " -o output HLSL or MSL\n" - "optional arguments:\n" - " -g debug mode, preserve comments\n" - " -h, --help show this help message and exit\n" - " -line write #file/line directive\n" - " -nohalf turn half into float" - ); + fprintf(stderr, + "usage: hlslparser [-h|-g] -i shader.hlsl -o [shader.hlsl | shader.metal]\n" + "Translate DX9-style HLSL shader to HLSL/MSL shader.\n" + " -i input HLSL\n" + " -o output HLSL or MSL\n" + "optional arguments:\n" + " -g debug mode, preserve comments\n" + " -h, --help show this help message and exit\n" + " -line write #file/line directive\n" + " -nohalf turn half into float"); } // Taken from KrmaLog.cpp @@ -61,12 +59,12 @@ static bool endsWith(const string& value, const string& ending) if (value.size() < ending.size()) return false; uint32_t start = (uint32_t)(value.size() - ending.size()); - + for (uint32_t i = 0; i < ending.size(); ++i) { if (value[start + i] != ending[i]) return false; } - + return true; } @@ -77,221 +75,193 @@ static string filenameNoExtension(const char* filename) if (dotPosStr == nullptr) return filename; auto dotPos = dotPosStr - filename; - + // now chop off the extension string filenameNoExt = filename; return filenameNoExt.substr(0, dotPos); } -int main( int argc, char* argv[] ) +int main(int argc, char* argv[]) { - using namespace M4; + using namespace M4; - // Parse arguments - string fileName; - const char* entryName = NULL; + // Parse arguments + string fileName; + const char* entryName = NULL; - // TODO: could we take modern DX12 HLSL and translate 
to MSL only - // That would simplify all this. What spirv-cross already does though. - // Could drop HLSLGenerator then, and just use this to gen MSL. - // Much of the glue code can just be in a header, but having it - // in parser, lets this only splice code that is needed. + // TODO: could we take modern DX12 HLSL and translate to MSL only + // That would simplify all this. What spirv-cross already does though. + // Could drop HLSLGenerator then, and just use this to gen MSL. + // Much of the glue code can just be in a header, but having it + // in parser, lets this only splice code that is needed. - Language language = Language_MSL; - HLSLTarget target = HLSLTarget_PixelShader; + Language language = Language_MSL; + HLSLTarget target = HLSLTarget_PixelShader; string outputFileName; bool isDebug = false; bool isTreatHalfAsFloat = false; bool isWriteFileLine = false; - - for( int argn = 1; argn < argc; ++argn ) - { - const char* const arg = argv[ argn ]; - - if( String_Equal( arg, "-h" ) || String_Equal( arg, "--help" ) ) - { - PrintUsage(); - return 0; - } - - else if( String_Equal( arg, "-o" ) || String_Equal( arg, "-output" ) ) - { - if ( ++argn < argc ) - outputFileName = argv[ argn ]; + + for (int argn = 1; argn < argc; ++argn) { + const char* const arg = argv[argn]; + + if (String_Equal(arg, "-h") || String_Equal(arg, "--help")) { + PrintUsage(); + return 0; } - else if( String_Equal( arg, "-i" ) || String_Equal( arg, "-input" ) ) - { - if ( ++argn < argc ) - fileName = argv[ argn ]; - } - else if ( String_Equal( arg, "-g" )) - { + + else if (String_Equal(arg, "-o") || String_Equal(arg, "-output")) { + if (++argn < argc) + outputFileName = argv[argn]; + } + else if (String_Equal(arg, "-i") || String_Equal(arg, "-input")) { + if (++argn < argc) + fileName = argv[argn]; + } + else if (String_Equal(arg, "-g")) { // will preserve double-slash comments where possible isDebug = true; } - else if ( String_Equal( arg, "-nohalf" )) - { + else if (String_Equal(arg, "-nohalf")) { // will preserve double-slash comments where possible isTreatHalfAsFloat = true; } - else if ( String_Equal( arg, "-line" )) - { + else if (String_Equal(arg, "-line")) { // will preserve double-slash comments where possible isWriteFileLine = true; } - -// This is derived from end characters of entry point -// else if( String_Equal( arg, "-vs" ) ) -// { -// target = HLSLTarget_VertexShader; -// } -// else if( String_Equal( arg, "-fs" ) ) -// { -// target = HLSLTarget_PixelShader; -// } - // TODO: require a arg to set entryName -// else if( entryName == NULL ) -// { -// entryName = arg; -// } - else - { - Log_Error( "Too many arguments\n" ); - PrintUsage(); - return 1; - } - } - - if( fileName.empty() ) - { - Log_Error( "Missing source filename\n" ); - PrintUsage(); - return 1; - } - if( !endsWith( fileName, "hlsl" ) ) - { - Log_Error( "Input filename must end with .hlsl\n" ); + + // This is derived from end characters of entry point + // else if( String_Equal( arg, "-vs" ) ) + // { + // target = HLSLTarget_VertexShader; + // } + // else if( String_Equal( arg, "-fs" ) ) + // { + // target = HLSLTarget_PixelShader; + // } + // TODO: require a arg to set entryName + // else if( entryName == NULL ) + // { + // entryName = arg; + // } + else { + Log_Error("Too many arguments\n"); + PrintUsage(); + return 1; + } + } + + if (fileName.empty()) { + Log_Error("Missing source filename\n"); + PrintUsage(); + return 1; + } + if (!endsWith(fileName, "hlsl")) { + Log_Error("Input filename must end with .hlsl\n"); PrintUsage(); return 
1; } - - if( outputFileName.empty() ) - { - Log_Error( "Missing dest filename\n" ); + + if (outputFileName.empty()) { + Log_Error("Missing dest filename\n"); PrintUsage(); return 1; } - if( endsWith( outputFileName, "hlsl" ) ) - { + if (endsWith(outputFileName, "hlsl")) { language = Language_HLSL; } - else if( endsWith( outputFileName, "metal" ) ) - { + else if (endsWith(outputFileName, "metal")) { language = Language_MSL; } - else - { - Log_Error( "Output file must end with .hlsl or msls\n" ); + else { + Log_Error("Output file must end with .hlsl or msls\n"); PrintUsage(); return 1; } - + // replace the extension on the output file - outputFileName = filenameNoExtension( outputFileName.c_str() ); - + outputFileName = filenameNoExtension(outputFileName.c_str()); + // Allow a mix of shaders in file. // Code now finds entry points. // outputFileName += (target == HLSLTarget_PixelShader) ? "PS" : "VS"; - - if ( language == Language_MSL ) - { + + if (language == Language_MSL) { outputFileName += ".metal"; } - else if ( language == Language_HLSL ) - { + else if (language == Language_HLSL) { outputFileName += ".hlsl"; } - + // Win build on github is failing on this, so skip for now // find full pathname of the fileName, so that errors are logged // in way that can be clicked to. absolute includes .. in it, canonical does not. std::error_code errorCode; // To shutup exceptions auto path = filesystem::path(fileName); fileName = filesystem::canonical(path, errorCode).generic_string(); - + // if this file doesn't exist, then canonical throws exception path = filesystem::path(outputFileName); - if (filesystem::exists(path)) - { + if (filesystem::exists(path)) { outputFileName = filesystem::canonical(path, errorCode).generic_string(); - - if ( outputFileName == fileName ) - { - Log_Error( "Src and Dst filenames match. Exiting.\n" ); + + if (outputFileName == fileName) { + Log_Error("Src and Dst filenames match. 
Exiting.\n"); return 1; } } - + //------------------------------------ // Now start the work - - // Read input file + + // Read input file string source; - if (!ReadFile( fileName.c_str(), source )) - { - Log_Error( "Input file not found\n" ); + if (!ReadFile(fileName.c_str(), source)) { + Log_Error("Input file not found\n"); return 1; } - // Parse input file - Allocator allocator; - HLSLParser parser( &allocator, fileName.c_str(), source.data(), source.size() ); - if (isDebug) - { + // Parse input file + Allocator allocator; + HLSLParser parser(&allocator, fileName.c_str(), source.data(), source.size()); + if (isDebug) { parser.SetKeepComments(true); } - HLSLTree tree( &allocator ); - + HLSLTree tree(&allocator); + // TODO: tie this to CLI, MSL should set both to true HLSLParserOptions parserOptions; parserOptions.isHalfst = true; parserOptions.isHalfio = true; - - if( !parser.Parse( &tree, parserOptions ) ) - { - Log_Error( "Parsing failed\n" ); - return 1; - } - + + if (!parser.Parse(&tree, parserOptions)) { + Log_Error("Parsing failed\n"); + return 1; + } + int status = 0; - + // build a list of entryPoints Array entryPoints(&allocator); - if (entryName != nullptr) - { + if (entryName != nullptr) { entryPoints.PushBack(entryName); } - else - { + else { // search all functions with designated endings HLSLStatement* statement = tree.GetRoot()->statement; - while (statement != NULL) - { - if (statement->nodeType == HLSLNodeType_Function) - { + while (statement != NULL) { + if (statement->nodeType == HLSLNodeType_Function) { HLSLFunction* function = (HLSLFunction*)statement; const char* name = function->name; - - if (endsWith(name, "VS")) - { + + if (endsWith(name, "VS")) { entryPoints.PushBack(name); } - else if (endsWith(name, "PS")) - { + else if (endsWith(name, "PS")) { entryPoints.PushBack(name); } - else if (endsWith(name, "CS")) - { + else if (endsWith(name, "CS")) { entryPoints.PushBack(name); } } @@ -299,11 +269,10 @@ int main( int argc, char* argv[] ) statement = statement->nextStatement; } } - + string output; - - for (uint32_t i = 0; i < (uint32_t)entryPoints.GetSize(); ++i) - { + + for (uint32_t i = 0; i < (uint32_t)entryPoints.GetSize(); ++i) { const char* entryPoint = entryPoints[i]; entryName = entryPoint; if (endsWith(entryPoint, "VS")) @@ -312,67 +281,59 @@ int main( int argc, char* argv[] ) target = HLSLTarget_PixelShader; else if (endsWith(entryPoint, "CS")) target = HLSLTarget_ComputeShader; - + // Generate output - if (language == Language_HLSL) - { + if (language == Language_HLSL) { HLSLOptions options; options.writeFileLine = isWriteFileLine; options.treatHalfAsFloat = isTreatHalfAsFloat; options.writeVulkan = true; // TODO: tie to CLI - + HLSLGenerator generator; - if (generator.Generate( &tree, target, entryName, options)) - { + if (generator.Generate(&tree, target, entryName, options)) { // write the buffer out output += generator.GetResult(); } - else - { - Log_Error( "Translation failed, aborting\n" ); + else { + Log_Error("Translation failed, aborting\n"); status = 1; } } - else if (language == Language_MSL) - { + else if (language == Language_MSL) { MSLOptions options; options.writeFileLine = isWriteFileLine; options.treatHalfAsFloat = isTreatHalfAsFloat; - + MSLGenerator generator; - if (generator.Generate(&tree, target, entryName, options)) - { + if (generator.Generate(&tree, target, entryName, options)) { // write the buffer out output += generator.GetResult(); } - else - { - Log_Error( "Translation failed, aborting\n" ); + else { + Log_Error("Translation 
failed, aborting\n"); status = 1; } } - + if (status != 0) break; } - - if (status == 0) - { + + if (status == 0) { // using wb to avoid having Win convert \n to \r\n - FILE* fp = fopen( outputFileName.c_str(), "wb" ); - if ( !fp ) - { - Log_Error( "Could not open output file %s\n", outputFileName.c_str() ); + FILE* fp = fopen(outputFileName.c_str(), "wb"); + if (!fp) { + Log_Error("Could not open output file %s\n", outputFileName.c_str()); return 1; } - + fprintf(fp, "%s", output.c_str()); - fclose( fp ); + fclose(fp); } - + // It's not enough to return 1 from main, but set exit code. if (status) exit(status); - + return status; } diff --git a/kram-preview/KramPreviewViewController.h b/kram-preview/KramPreviewViewController.h index 2bf9e439..80df48ea 100644 --- a/kram-preview/KramPreviewViewController.h +++ b/kram-preview/KramPreviewViewController.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. diff --git a/kram-preview/KramPreviewViewController.mm b/kram-preview/KramPreviewViewController.mm index 9cdc071f..c03e7bc8 100644 --- a/kram-preview/KramPreviewViewController.mm +++ b/kram-preview/KramPreviewViewController.mm @@ -1,37 +1,38 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #import "KramPreviewViewController.h" -#import -#include #import +#include +#import #include "KramLib.h" using namespace kram; // Same code in Preview and Thumbnail -inline NSError* KLOGF(uint32_t code, const char* format, ...) { +inline NSError* KLOGF(uint32_t code, const char* format, ...) +{ string str; - + va_list args; va_start(args, format); /* int32_t len = */ append_vsprintf(str, format, args); va_end(args); - + // log here, so it can see it in Console. But this never appears. // How are you supposed to debug failures? Resorted to passing a unique code into this call. // It wasn't originally supposed to generate an NSError //NSLog(@"%s", str.c_str()); - + // Console prints this as , so what's the point of producing a localizedString ? // This doesn't seem to work to Console app, but maybe if logs are to terminal // sudo log config --mode "level:debug" --subsystem com.hialec.kramv - + NSString* errorText = [NSString stringWithUTF8String:str.c_str()]; - return [NSError errorWithDomain:@"com.hialec.kramv" code:code userInfo:@{NSLocalizedDescriptionKey: errorText}]; + return [NSError errorWithDomain:@"com.hialec.kramv" code:code userInfo:@{NSLocalizedDescriptionKey : errorText}]; } @interface KramPreviewViewController () @@ -41,38 +42,45 @@ @implementation KramPreviewViewController { NSImageView* _imageView; } -- (NSString *)nibName { +- (NSString*)nibName +{ return @"KramPreviewViewController"; } -- (void)loadView { +- (void)loadView +{ [super loadView]; // Do any additional setup after loading the view. 
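The KLOGF helper above builds its message with kram's append_vsprintf before wrapping it in an NSError. For readers without the kram sources handy, here is a self-contained C++ approximation of that append-style formatter; the real kram signature and behavior may differ.

#include <cstdarg>
#include <cstdio>
#include <string>

// append printf-style formatted text to an existing std::string;
// returns the number of characters appended, or -1 on failure
static int appendFormat(std::string& str, const char* format, ...)
{
    va_list args;
    va_start(args, format);
    va_list argsCopy;
    va_copy(argsCopy, args);

    int len = vsnprintf(nullptr, 0, format, args); // measure only
    va_end(args);
    if (len < 0) {
        va_end(argsCopy);
        return -1;
    }

    size_t oldSize = str.size();
    str.resize(oldSize + (size_t)len + 1);
    vsnprintf(&str[oldSize], (size_t)len + 1, format, argsCopy); // format in place
    va_end(argsCopy);
    str.resize(oldSize + (size_t)len); // drop the terminator vsnprintf wrote

    return len;
}

int main()
{
    std::string msg;
    appendFormat(msg, "kramv %s only supports ktx, ktx2, dds files\n", "image.png");
    fputs(msg.c_str(), stdout);
    return 0;
}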
- + _imageView = [[NSImageView alloc] initWithFrame:self.view.frame]; [_imageView setTranslatesAutoresizingMaskIntoConstraints:NO]; //Required to opt-in to autolayout // no frame, already the default // _imageView.imageFrameStyle = NSImageFrameNone; - + _imageView.imageScaling = NSImageScaleProportionallyUpOrDown; - - [self.view addSubview: _imageView]; - - NSDictionary* views = @{@"myview": _imageView}; + + [self.view addSubview:_imageView]; + + NSDictionary* views = @{@"myview" : _imageView}; [self.view addConstraints:[NSLayoutConstraint - constraintsWithVisualFormat:@"H:|[myview]|" options:0 metrics:nil - views:views]]; + constraintsWithVisualFormat:@"H:|[myview]|" + options:0 + metrics:nil + views:views]]; [self.view addConstraints:[NSLayoutConstraint - constraintsWithVisualFormat:@"V:|[myview]|" options:0 metrics:nil - views:views]]; + constraintsWithVisualFormat:@"V:|[myview]|" + options:0 + metrics:nil + views:views]]; //[NSLayoutConstraint activateConstraints: self.view.constraints]; } // This isn't a view, but hoping this is called -- (void)viewDidAppear { +- (void)viewDidAppear +{ [super viewDidAppear]; - + // this must be called after layer is ready //self.view.layer.backgroundColor = [NSColor blackColor].CGColor; _imageView.layer.backgroundColor = [NSColor blackColor].CGColor; @@ -82,9 +90,9 @@ - (void)viewDidAppear { * Implement this method and set QLSupportsSearchableItems to YES in the Info.plist of the extension if you support CoreSpotlight. * - (void)preparePreviewOfSearchableItemWithIdentifier:(NSString *)identifier queryString:(NSString *)queryString completionHandler:(void (^)(NSError * _Nullable))handler { - + // Perform any setup necessary in order to prepare the view. - + // Call the completion handler so Quick Look knows that the preview is fully loaded. // Quick Look will display a loading spinner while the completion handler is not called. @@ -92,43 +100,43 @@ - (void)preparePreviewOfSearchableItemWithIdentifier:(NSString *)identifier quer } */ -- (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSError * _Nullable))handler { - +- (void)preparePreviewOfFileAtURL:(NSURL*)url completionHandler:(void (^)(NSError* _Nullable))handler +{ NSError* error = nil; const char* filename = [url fileSystemRepresentation]; -// if (![_imageView isKindOfClass:[NSImageView class]]) { -// error = KLOGF(9, "kramv %s expected NSImageView \n", filename); -// handler(error); -// return; -// } - + // if (![_imageView isKindOfClass:[NSImageView class]]) { + // error = KLOGF(9, "kramv %s expected NSImageView \n", filename); + // handler(error); + // return; + // } + // Add the supported content types to the QLSupportedContentTypes array in the Info.plist of the extension. // Perform any setup necessary in order to prepare the view. 
- + // The following is adapted out of Thumbnailer - + // No request here, may need to use view size uint32_t maxWidth = _imageView.frame.size.width; uint32_t maxHeight = _imageView.frame.size.height; - + // ignore upper case extensions if (!isSupportedFilename(filename)) { error = KLOGF(1, "kramv %s only supports ktx, ktx2, dds files\n", filename); handler(error); return; } - + KTXImage image; KTXImageData imageData; TexEncoder decoderType = kTexEncoderUnknown; - + if (!imageData.open(filename, image)) { error = KLOGF(2, "kramv %s coould not open file\n", filename); handler(error); return; } - + // This will set decoder auto textureType = MyMTLTextureType2D; // image.textureType if (!validateFormatAndDecoder(textureType, image.pixelFormat, decoderType)) { @@ -136,13 +144,13 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr handler(error); return; } - + bool isPremul = image.isPremul(); bool isSrgb = isSrgbFormat(image.pixelFormat); - + // unpack a level to get the blocks uint32_t mipNumber = 0; - + uint32_t mipCount = image.mipCount(); uint32_t w, h, d; for (uint32_t i = 0; i < mipCount; ++i) { @@ -151,25 +159,24 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr mipNumber++; } } - + // clamp to smallest mipNumber = std::min(mipNumber, mipCount - 1); image.mipDimensions(mipNumber, w, h, d); - + uint32_t chunkNum = 0; // TODO: could embed chunk(s) to gen thumbnail from, cube/array? uint32_t numChunks = image.totalChunks(); - + vector mipData; // new decode the blocks in that chunk if (isBlockFormat(image.pixelFormat)) { - uint64_t mipLength = image.mipLevels[mipNumber].length; - - // then decode any blocks to rgba8u, not dealing with HDR formats yet + + // then decode any blocks to rgba8u, not dealing with HDR formats yet if (image.isSupercompressed()) { const uint8_t* srcData = image.fileData + image.mipLevels[mipNumber].offset; - + mipData.resize(mipLength * numChunks); uint8_t* dstData = mipData.data(); if (!image.unpackLevel(mipNumber, srcData, dstData)) { @@ -177,7 +184,7 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr handler(error); return; } - + // now extract the chunk for the thumbnail out of that level if (numChunks > 1) { macroUnusedVar(chunkNum); @@ -187,66 +194,63 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr mipData.resize(mipLength); } } - else - { + else { // this just truncate to chunk 0 instead of copying chunkNum first mipData.resize(mipLength); - + const uint8_t* srcData = image.fileData + image.mipLevels[mipNumber].offset; - + memcpy(mipData.data(), srcData, mipLength); } - + KramDecoder decoder; KramDecoderParams params; - + // TODO: should honor swizzle in the ktx image // TODO: probaby need an snorm rgba format to convert the snorm versions, so they're not all red // if sdf, will be signed format and that will stay red - - switch(image.pixelFormat) - { + + switch (image.pixelFormat) { // To avoid showing single channel content in red, replicate to rgb case MyMTLPixelFormatBC4_RUnorm: case MyMTLPixelFormatEAC_R11Unorm: params.swizzleText = "rrr1"; break; - + default: break; } - + vector dstMipData; - + // only space for one chunk for now dstMipData.resize(numChunks * h * w * sizeof(Color)); - + // want to just decode one chunk of the level that was unpacked abovve if (!decoder.decodeBlocks(w, h, mipData.data(), (int32_t)mipData.size(), image.pixelFormat, dstMipData, params)) { error = KLOGF(6, "kramv %s failed to decode 
blocks\n", filename); handler(error); return; } - + mipData = dstMipData; } - else if (isExplicitFormat(image.pixelFormat)) - { + else if (isExplicitFormat(image.pixelFormat)) { Image image2D; if (!image2D.loadThumbnailFromKTX(image, mipNumber)) { error = KLOGF(7, "kramv %s failed to convert image to 4 channels\n", filename); handler(error); return; } - + // TODO: could swizzle height (single channel) textures to rrr1 - + // copy from Color back to uint8_t uint32_t mipSize = h * w * sizeof(Color); mipData.resize(mipSize); memcpy(mipData.data(), image2D.pixels().data(), mipSize); } - + // https://developer.apple.com/library/archive/documentation/GraphicsImaging/Conceptual/drawingwithquartz2d/dq_images/dq_images.html#//apple_ref/doc/uid/TP30001066-CH212-TPXREF101 uint32_t rowBytes = w * sizeof(Color); @@ -254,25 +258,25 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr // use vimage in the Accelerate.framework // https://developer.apple.com/library/archive/releasenotes/Performance/RN-vecLib/index.html#//apple_ref/doc/uid/TP40001049 - vImage_Buffer buf = { mipData.data(), h, w, rowBytes }; + vImage_Buffer buf = {mipData.data(), h, w, rowBytes}; // Declare the pixel format for the vImage_Buffer vImage_CGImageFormat format = { - .bitsPerComponent = 8, - .bitsPerPixel = 32, + .bitsPerComponent = 8, + .bitsPerPixel = 32, }; - + format.bitmapInfo = kCGBitmapByteOrderDefault | (CGBitmapInfo)(isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); format.colorSpace = isSrgb ? CGColorSpaceCreateWithName(kCGColorSpaceSRGB) : CGColorSpaceCreateDeviceRGB(); - + // don't need to allocate, can requse memory from mip // TODO: might want to convert to PNG, but maybe thumbnail system does that automatically? // see how big thumbs.db is after running this - + // This doesn't allocate, but in an imageView that must outlast the handle call, does that work? bool skipPixelCopy = false; - + vImage_Error err = 0; CGImageRef cgImage = vImageCreateCGImageFromBuffer(&buf, &format, NULL, NULL, skipPixelCopy ? kvImageNoAllocate : kvImageNoFlags, &err); if (err) { @@ -283,30 +287,29 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr CGRect rect = CGRectMake(0, 0, w, h); NSImage* nsImage = [[NSImage alloc] initWithCGImage:cgImage size:rect.size]; - + NSImageView* nsImageView = _imageView; // (NSImageView*)self.view; - + // Copositing is like it's using NSCompositeCopy instead of SourceOver // The default is NSCompositeSourceOver. NSRectFill() ignores // -[NSGraphicsContext compositingOperation] and continues to use NSCompositeCopy. // So may have to use NSFillRect which uses SourceOver // https://cocoadev.github.io/NSCompositingOperation/ - + nsImageView.image = nsImage; // This seems to cause plugin to fail with NoAllocate set // This leaks a CGImageRef, but the CGImage doesn't hold any memory w/NoAllocate. if (!skipPixelCopy) CGImageRelease(cgImage); - + // TODO: could add description with info from texture (format, etc) // self.textView.text = ... - + // Call the completion handler so Quick Look knows that the preview is fully loaded. // Quick Look will display a loading spinner while the completion handler is not called. 
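The mip-selection loop near the top of preparePreviewOfFileAtURL advances the chosen mip while its dimensions still exceed the preview bounds, then clamps to the smallest mip. A standalone sketch of just that logic; MipDims and pickMipForPreview are illustrative names, not the kram API.

#include <algorithm>
#include <cstdint>
#include <cstdio>

struct MipDims { uint32_t w, h; };

// pick the first mip that fits maxWidth x maxHeight, clamped to the last mip
static uint32_t pickMipForPreview(const MipDims* mips, uint32_t mipCount,
                                  uint32_t maxWidth, uint32_t maxHeight)
{
    uint32_t mipNumber = 0;
    for (uint32_t i = 0; i < mipCount; ++i) {
        if (mips[i].w > maxWidth || mips[i].h > maxHeight)
            mipNumber++;
    }
    return std::min(mipNumber, mipCount - 1);
}

int main()
{
    const MipDims mips[] = {{1024, 1024}, {512, 512}, {256, 256}, {128, 128}};
    uint32_t mip = pickMipForPreview(mips, 4, 300, 300);
    printf("chosen mip %u (%ux%u)\n", mip, mips[mip].w, mips[mip].h);
    return 0;
}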
- + handler(nil); } @end - diff --git a/kram-preview/kram_preview.entitlements b/kram-preview/kram_preview.entitlements index f2ef3ae0..18aff0ce 100644 --- a/kram-preview/kram_preview.entitlements +++ b/kram-preview/kram_preview.entitlements @@ -2,9 +2,9 @@ - com.apple.security.app-sandbox - - com.apple.security.files.user-selected.read-only - + com.apple.security.app-sandbox + + com.apple.security.files.user-selected.read-only + diff --git a/kram-profile/CBA/Analysis.cpp b/kram-profile/CBA/Analysis.cpp new file mode 100755 index 00000000..527cf227 --- /dev/null +++ b/kram-profile/CBA/Analysis.cpp @@ -0,0 +1,628 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense + +//#ifdef _MSC_VER +//struct IUnknown; // workaround for old Win SDK header failures when using /permissive- +//#endif + +// This is for windows.h +//#ifndef NOMINMAX +//#define NOMINMAX +//#endif + +#include "Analysis.h" +#include "Arena.h" +//#include "Colors.h" +#include "Utils.h" +//#include "external/flat_hash_map/bytell_hash_map.hpp" +//#include "external/inih/cpp/INIReader.h" +#include +#include +#include +#include +#include +#include +#include + +// from kram +// returns length of chars appended, -1 if failure +#define STL_NAMESPACE std +using namespace STL_NAMESPACE; +int32_t append_sprintf(string& str, const char* format, ...) __printflike(2, 3); + +namespace col +{ + const char* kBold = ""; + const char* kRed = ""; + const char* kGreen = ""; + const char* kYellow = ""; + const char* kBlue = ""; + const char* kMagenta = ""; + const char* kCyan = ""; + const char* kWhite = ""; + const char* kReset = ""; + + /* not using + kBold = "\x1B[1m"; + kRed = "\x1B[91m"; + kGreen = "\x1B[32m"; + kYellow = "\x1B[33m"; + kBlue = "\x1B[34m"; + kMagenta = "\x1B[35m"; + kCyan = "\x1B[36m"; + kWhite = "\x1B[37m"; + kReset = "\x1B[0m"; + */ +} + +struct Config +{ + int fileParseCount = 10; + int fileCodegenCount = 10; + int templateCount = 30; + int functionCount = 30; + int headerCount = 10; + int headerChainCount = 5; + + int minFileTime = 10; + + int maxName = 70; + + bool onlyRootHeaders = true; +}; + +struct pair_hash +{ + template + std::size_t operator () (const std::pair& p) const + { + auto h1 = std::hash{}(p.first); + auto h2 = std::hash{}(p.second); + return h1 + 0x9e3779b9 + (h2<<6) + (h2>>2); + } +}; + + +struct Analysis +{ + Analysis(const BuildEvents& events_, BuildNames& buildNames_, std::string& out_) + : events(events_) + , buildNames(buildNames_) + , out(out_) + { + functions.reserve(256); + instantiations.reserve(256); + parseFiles.reserve(64); + codegenFiles.reserve(64); + headerMap.reserve(256); + } + + const BuildEvents& events; + BuildNames& buildNames; + + std::string& out; + + std::string_view GetBuildName(DetailIndex index) + { + assert(index.idx >= 0); + assert(index.idx < static_cast(buildNames.size())); + + return buildNames[index]; + } + + void ProcessEvent(EventIndex eventIndex); + int largestDetailIndex = 0; + void EndAnalysis(); + + void FindExpensiveHeaders(); + void ReadConfig(); + + DetailIndex FindPath(EventIndex eventIndex) const; + + struct InstantiateEntry + { + int count = 0; + int64_t us = 0; + }; + struct FileEntry + { + DetailIndex file; + int64_t us; + }; + struct IncludeChain + { + std::vector files; + int64_t us = 0; + int count = 0; + }; + struct IncludeEntry + { + int64_t us = 0; + int count = 0; + bool root = false; + std::vector includePaths; + }; + + std::unordered_map collapsedNames; + std::string_view 
GetCollapsedName(DetailIndex idx); + void EmitCollapsedTemplates(); + void EmitCollapsedTemplateOpt(); + void EmitCollapsedInfo( + const std::unordered_map &collapsed, + const char *header_string); + + // key is (name,objfile), value is milliseconds + typedef std::pair IndexPair; + std::unordered_map functions; + std::unordered_map instantiations; + std::vector parseFiles; + std::vector codegenFiles; + int64_t totalParseUs = 0; + int64_t totalCodegenUs = 0; + int totalParseCount = 0; + + std::unordered_map headerMap; + std::vector> expensiveHeaders; + + Config config; +}; + +DetailIndex Analysis::FindPath(EventIndex eventIndex) const +{ + while(eventIndex > EventIndex()) + { + const BuildEvent& ev = events[eventIndex]; + if (ev.type == BuildEventType::kCompiler || ev.type == BuildEventType::kFrontend || ev.type == BuildEventType::kBackend || ev.type == BuildEventType::kOptModule) + if (ev.detailIndex != DetailIndex()) + return ev.detailIndex; + eventIndex = ev.parent; + } + return DetailIndex(); +} + +void Analysis::ProcessEvent(EventIndex eventIndex) +{ + const BuildEvent& event = events[eventIndex]; + largestDetailIndex = (std::max)(largestDetailIndex, event.detailIndex.idx); + + if (event.type == BuildEventType::kOptFunction) + { + auto funKey = std::make_pair(event.detailIndex, FindPath(eventIndex)); + functions[funKey] += event.dur; + } + + if (event.type == BuildEventType::kInstantiateClass || event.type == BuildEventType::kInstantiateFunction) + { + auto& e = instantiations[eventIndex]; + ++e.count; + e.us += event.dur; + } + + if (event.type == BuildEventType::kFrontend) + { + totalParseUs += event.dur; + ++totalParseCount; + if (event.dur >= config.minFileTime * 1000) + { + FileEntry fe; + fe.file = FindPath(eventIndex); + fe.us = event.dur; + parseFiles.emplace_back(fe); + } + } + if (event.type == BuildEventType::kBackend) + { + totalCodegenUs += event.dur; + if (event.dur >= config.minFileTime * 1000) + { + FileEntry fe; + fe.file = FindPath(eventIndex); + fe.us = event.dur; + codegenFiles.emplace_back(fe); + } + } + if (event.type == BuildEventType::kParseFile) + { + std::string_view path = GetBuildName(event.detailIndex); + if (utils::IsHeader(path)) + { + IncludeEntry& e = headerMap[path]; + e.us += event.dur; + ++e.count; + + // record chain of ParseFile entries leading up to this one + IncludeChain chain; + chain.us = event.dur; + EventIndex parseIndex = event.parent; + bool hasHeaderBefore = false; + while(parseIndex.idx >= 0) + { + const BuildEvent& ev2 = events[parseIndex]; + if (ev2.type != BuildEventType::kParseFile) + break; + std::string_view ev2path = GetBuildName(ev2.detailIndex); + bool isHeader = utils::IsHeader(ev2path); + if (!isHeader) + break; + chain.files.push_back(ev2.detailIndex); + hasHeaderBefore |= isHeader; + parseIndex = ev2.parent; + } + + e.root |= !hasHeaderBefore; + e.includePaths.push_back(chain); + } + } +} + +static std::string_view CollapseName(const std::string_view& elt) +{ + // Parsing op<, op<<, op>, and op>> seems hard. 
Just skip'm all + if (elt.find("operator") != std::string::npos) + return elt; + + std::string retval; + retval.reserve(elt.size()); + auto b_range = elt.begin(); + auto e_range = elt.begin(); + while (b_range != elt.end()) + { + e_range = std::find(b_range, elt.end(), '<'); + if (e_range == elt.end()) + break; + ++e_range; + retval.append(b_range, e_range); + retval.append("$"); + b_range = e_range; + int open_count = 1; + // find the matching close angle bracket + for (; b_range != elt.end(); ++b_range) + { + if (*b_range == '<') + { + ++open_count; + continue; + } + if (*b_range == '>') + { + if (--open_count == 0) + { + break; + } + continue; + } + } + // b_range is now pointing at a close angle, or it is at the end of the string + } + if (b_range > e_range) + { + // we are in a wacky case where something like op> showed up in a mangled name. + // just bail. + // TODO: this still isn't correct, but it avoids crashes. + return elt; + } + // append the footer + retval.append(b_range, e_range); + + size_t size = retval.size(); + char* ptr = (char*)ArenaAllocate(size+1); + memcpy(ptr, retval.c_str(), size+1); + return std::string_view(ptr, size); +} + +std::string_view Analysis::GetCollapsedName(DetailIndex detail) +{ + std::string_view& name = collapsedNames[detail]; + if (name.empty()) + name = CollapseName(GetBuildName(detail)); + return name; +} + +void Analysis::EmitCollapsedInfo( + const std::unordered_map &collapsed, + const char *header_string) +{ + std::vector> sorted_collapsed; + sorted_collapsed.resize(std::min(config.templateCount, collapsed.size())); + auto cmp = [](const auto &lhs, const auto &rhs) { + return std::tie(lhs.second.us, lhs.second.count, lhs.first) > std::tie(rhs.second.us, rhs.second.count, rhs.first); + }; + std::partial_sort_copy( + collapsed.begin(), collapsed.end(), + sorted_collapsed.begin(), sorted_collapsed.end(), + cmp); + + append_sprintf(out, "%s%s**** %s%s:\n", col::kBold, col::kMagenta, header_string, col::kReset); + for (const auto &elt : sorted_collapsed) + { + std::string dname = elt.first; + if (static_cast(dname.size()) > config.maxName) + dname = dname.substr(0, config.maxName - 2) + "..."; + int ms = int(elt.second.us / 1000); + int avg = int(ms / elt.second.count); + append_sprintf(out, "%s%6i%s ms: %s (%i times, avg %i ms)\n", col::kBold, ms, col::kReset, dname.c_str(), elt.second.count, avg); + } + append_sprintf(out, "\n"); +} +void Analysis::EmitCollapsedTemplates() +{ + std::unordered_map collapsed; + for (const auto& inst : instantiations) + { + const std::string_view name = GetCollapsedName(events[inst.first].detailIndex); + auto &stats = collapsed[name]; + + bool recursive = false; + EventIndex p = events[inst.first].parent; + while (p != EventIndex(-1)) + { + auto &event = events[p]; + if (event.type == BuildEventType::kInstantiateClass || event.type == BuildEventType::kInstantiateFunction) + { + const std::string_view ancestor_name = GetCollapsedName(event.detailIndex); + if (ancestor_name == name) + { + recursive = true; + break; + } + } + p = event.parent; + } + if (!recursive) + { + stats.us += inst.second.us; + stats.count += inst.second.count; + } + } + EmitCollapsedInfo(collapsed, "Template sets that took longest to instantiate"); +} + +void Analysis::EmitCollapsedTemplateOpt() +{ + std::unordered_map collapsed; + for (const auto& fn : functions) + { + auto fnNameIndex = fn.first.first; + const std::string_view fnName = GetBuildName(fnNameIndex); + // if we're not related to templates at all, skip + if (fnName.find('<') == 
std::string::npos) + continue; + + auto &stats = collapsed[GetCollapsedName(fnNameIndex)]; + ++stats.count; + stats.us += fn.second; + } + EmitCollapsedInfo(collapsed, "Function sets that took longest to compile / optimize"); +} + +void Analysis::EndAnalysis() +{ + if (totalParseUs || totalCodegenUs) + { + append_sprintf(out, "%s%s**** Time summary%s:\n", col::kBold, col::kMagenta, col::kReset); + append_sprintf(out, "Compilation (%i times):\n", totalParseCount); + append_sprintf(out, " Parsing (frontend): %s%7.1f%s s\n", col::kBold, static_cast(totalParseUs) / 1000000.0, col::kReset); + append_sprintf(out, " Codegen & opts (backend): %s%7.1f%s s\n", col::kBold, static_cast(totalCodegenUs) / 1000000.0, col::kReset); + append_sprintf(out, "\n"); + } + + if (!parseFiles.empty()) + { + std::vector indices; + indices.resize(parseFiles.size()); + for (size_t i = 0; i < indices.size(); ++i) + indices[i] = int(i); + std::sort(indices.begin(), indices.end(), [&](int indexA, int indexB) { + const auto& a = parseFiles[indexA]; + const auto& b = parseFiles[indexB]; + if (a.us != b.us) + return a.us > b.us; + return GetBuildName(a.file) < GetBuildName(b.file); + }); + append_sprintf(out, "%s%s**** Files that took longest to parse (compiler frontend)%s:\n", col::kBold, col::kMagenta, col::kReset); + for (size_t i = 0, n = std::min(config.fileParseCount, indices.size()); i != n; ++i) + { + const auto& e = parseFiles[indices[i]]; + append_sprintf(out, "%s%6i%s ms: %s\n", col::kBold, int(e.us/1000), col::kReset, GetBuildName(e.file).data()); + } + append_sprintf(out, "\n"); + } + if (!codegenFiles.empty()) + { + std::vector indices; + indices.resize(codegenFiles.size()); + for (size_t i = 0; i < indices.size(); ++i) + indices[i] = int(i); + std::sort(indices.begin(), indices.end(), [&](int indexA, int indexB) { + const auto& a = codegenFiles[indexA]; + const auto& b = codegenFiles[indexB]; + if (a.us != b.us) + return a.us > b.us; + return GetBuildName(a.file) < GetBuildName(b.file); + }); + append_sprintf(out, "%s%s**** Files that took longest to codegen (compiler backend)%s:\n", col::kBold, col::kMagenta, col::kReset); + for (size_t i = 0, n = std::min(config.fileCodegenCount, indices.size()); i != n; ++i) + { + const auto& e = codegenFiles[indices[i]]; + append_sprintf(out, "%s%6i%s ms: %s\n", col::kBold, int(e.us/1000), col::kReset, GetBuildName(e.file).data()); + } + append_sprintf(out, "\n"); + } + + if (!instantiations.empty()) + { + std::vector> instArray; + instArray.resize(largestDetailIndex+1); + for (const auto& inst : instantiations) //collapse the events + { + DetailIndex d = events[inst.first].detailIndex; + instArray[d.idx].first = d; + instArray[d.idx].second.us += inst.second.us; + instArray[d.idx].second.count += inst.second.count; + } + size_t n = std::min(config.templateCount, instArray.size()); + auto cmp = [&](const auto&a, const auto &b) { + return + std::tie(a.second.us, a.second.count, a.first) > + std::tie(b.second.us, b.second.count, b.first); + }; + std::partial_sort(instArray.begin(), instArray.begin()+n, instArray.end(), cmp); + append_sprintf(out, "%s%s**** Templates that took longest to instantiate%s:\n", col::kBold, col::kMagenta, col::kReset); + for (size_t i = 0; i != n; ++i) + { + const auto& e = instArray[i]; + std::string dname = std::string(GetBuildName(e.first)); + if (static_cast(dname.size()) > config.maxName) + dname = dname.substr(0, config.maxName-2) + "..."; + int ms = int(e.second.us / 1000); + int avg = int(ms / std::max(e.second.count,1)); + 
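The reports above repeatedly use the same top-N pattern: build (time, count, name) style records, then std::partial_sort (or std::partial_sort_copy) with a descending std::tie comparator so only the most expensive entries are fully ordered. A small self-contained sketch of that pattern with a simplified Stat record:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string>
#include <tuple>
#include <vector>

struct Stat {
    long long us;
    int count;
    std::string name;
};

// order the n most expensive entries first: by time, then count, then name
static void sortTopN(std::vector<Stat>& stats, std::size_t n)
{
    n = std::min(n, stats.size());
    std::partial_sort(stats.begin(), stats.begin() + (std::ptrdiff_t)n, stats.end(),
                      [](const Stat& a, const Stat& b) {
                          return std::tie(a.us, a.count, a.name) >
                                 std::tie(b.us, b.count, b.name);
                      });
    stats.resize(n);
}

int main()
{
    std::vector<Stat> stats = {
        {1200, 3, "vector<T>::push_back"},
        {90000, 1, "dotProduct<float4>"},
        {500, 10, "min<int>"},
    };
    sortTopN(stats, 2);
    for (const auto& s : stats)
        printf("%6d ms: %s (%d times)\n", int(s.us / 1000), s.name.c_str(), s.count);
    return 0;
}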
append_sprintf(out, "%s%6i%s ms: %s (%i times, avg %i ms)\n", col::kBold, ms, col::kReset, dname.c_str(), e.second.count, avg); + } + append_sprintf(out, "\n"); + + EmitCollapsedTemplates(); + } + + if (!functions.empty()) + { + std::vector> functionsArray; + std::vector indices; + functionsArray.reserve(functions.size()); + indices.reserve(functions.size()); + for (const auto& fn : functions) + { + functionsArray.emplace_back(fn); + indices.emplace_back((int)indices.size()); + } + + std::sort(indices.begin(), indices.end(), [&](int indexA, int indexB) { + const auto& a = functionsArray[indexA]; + const auto& b = functionsArray[indexB]; + if (a.second != b.second) + return a.second > b.second; + return GetBuildName(a.first.first) < GetBuildName(b.first.first); + }); + append_sprintf(out, "%s%s**** Functions that took longest to compile%s:\n", col::kBold, col::kMagenta, col::kReset); + for (size_t i = 0, n = std::min(config.functionCount, indices.size()); i != n; ++i) + { + const auto& e = functionsArray[indices[i]]; + std::string dname = std::string(GetBuildName(e.first.first)); + if (static_cast(dname.size()) > config.maxName) + dname = dname.substr(0, config.maxName-2) + "..."; + int ms = int(e.second / 1000); + append_sprintf(out, "%s%6i%s ms: %s (%s)\n", col::kBold, ms, col::kReset, dname.c_str(), GetBuildName(e.first.second).data()); + } + append_sprintf(out, "\n"); + EmitCollapsedTemplateOpt(); + } + + FindExpensiveHeaders(); + + if (!expensiveHeaders.empty()) + { + append_sprintf(out, "%s%s**** Expensive headers%s:\n", col::kBold, col::kMagenta, col::kReset); + for (const auto& e : expensiveHeaders) + { + const auto& es = headerMap[e.first]; + int ms = int(e.second / 1000); + int avg = ms / es.count; + append_sprintf(out, "%s%i%s ms: %s%s%s (included %i times, avg %i ms), included via:\n", col::kBold, ms, col::kReset, col::kBold, e.first.data(), col::kReset, es.count, avg); + int pathCount = 0; + + // print most costly include chains + // merge identical include chains, recording their (count, totalTimeUs) + std::map, std::pair> mergedIncludeChains; + for (const auto& chain : es.includePaths) + { + auto& dst = mergedIncludeChains[chain.files]; + dst.first++; + dst.second += chain.us; + } + std::vector sortedIncludeChains; + sortedIncludeChains.reserve(mergedIncludeChains.size()); + for (const auto& chain : mergedIncludeChains) + { + IncludeChain dst; + dst.files = chain.first; + dst.count = chain.second.first; + dst.us = chain.second.second; + sortedIncludeChains.emplace_back(dst); + } + std::sort(sortedIncludeChains.begin(), sortedIncludeChains.end(), [](const auto& a, const auto& b) + { + if (a.count != b.count) + return a.count > b.count; + if (a.us != b.us) + return a.us > b.us; + return a.files < b.files; + }); + for (const auto& chain : sortedIncludeChains) + { + append_sprintf(out, " %ix: ", chain.count); + for (auto it = chain.files.rbegin(), itEnd = chain.files.rend(); it != itEnd; ++it) + { + append_sprintf(out, "%s ", utils::GetFilename(GetBuildName(*it)).data()); + } + if (chain.files.empty()) + append_sprintf(out, ""); + append_sprintf(out, "\n"); + ++pathCount; + if (pathCount > config.headerChainCount) + break; + } + if (pathCount > config.headerChainCount) + { + append_sprintf(out, " ...\n"); + } + + append_sprintf(out, "\n"); + } + } +} + +void Analysis::FindExpensiveHeaders() +{ + expensiveHeaders.reserve(headerMap.size()); + for (const auto& kvp : headerMap) + { + if (config.onlyRootHeaders && !kvp.second.root) + continue; + 
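The expensive-headers report above collapses identical include chains before printing: a std::map keyed by the chain accumulates how many times that exact chain occurred and how much time it cost. A reduced sketch of that merge step, using plain ints in place of DetailIndex:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main()
{
    // each chain is the list of headers (as ids here) that led to the include
    std::vector<std::vector<int>> chains = {{1, 2}, {1, 2}, {3}};
    std::vector<int64_t> chainUs = {500, 700, 250};

    // identical chains collapse into one entry: (occurrence count, total microseconds)
    std::map<std::vector<int>, std::pair<int, int64_t>> merged;
    for (std::size_t i = 0; i < chains.size(); ++i) {
        auto& dst = merged[chains[i]];
        dst.first++;
        dst.second += chainUs[i];
    }

    for (const auto& kv : merged)
        printf("chain of %d headers: %d times, %lld us\n",
               (int)kv.first.size(), kv.second.first, (long long)kv.second.second);
    return 0;
}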
expensiveHeaders.push_back(std::make_pair(kvp.first, kvp.second.us)); + } + std::sort(expensiveHeaders.begin(), expensiveHeaders.end(), [&](const auto& a, const auto& b) + { + if (a.second != b.second) + return a.second > b.second; + return a.first < b.first; + }); + if (static_cast(expensiveHeaders.size()) > config.headerCount) + expensiveHeaders.resize(config.headerCount); +} + +void Analysis::ReadConfig() +{ +// No longer reading ini file +// INIReader ini("ClangBuildAnalyzer.ini"); +// +// config.fileParseCount = (int)ini.GetInteger("counts", "fileParse", config.fileParseCount); +// config.fileCodegenCount = (int)ini.GetInteger("counts", "fileCodegen", config.fileCodegenCount); +// config.functionCount = (int)ini.GetInteger("counts", "function", config.functionCount); +// config.templateCount = (int)ini.GetInteger("counts", "template", config.templateCount); +// config.headerCount = (int)ini.GetInteger("counts", "header", config.headerCount); +// config.headerChainCount = (int)ini.GetInteger("counts", "headerChain", config.headerChainCount); +// +// config.minFileTime = (int)ini.GetInteger("minTimes", "file", config.minFileTime); +// +// config.maxName = (int)ini.GetInteger("misc", "maxNameLength", config.maxName); +// config.onlyRootHeaders = ini.GetBoolean("misc", "onlyRootHeaders",config.onlyRootHeaders); +} + + +void DoAnalysis(const BuildEvents& events, BuildNames& names, std::string& out) +{ + Analysis a(events, names, out); + a.ReadConfig(); + for (int i = 0, n = (int)events.size(); i != n; ++i) + a.ProcessEvent(EventIndex(i)); + a.EndAnalysis(); +} diff --git a/kram-profile/CBA/Analysis.h b/kram-profile/CBA/Analysis.h new file mode 100755 index 00000000..83f93f35 --- /dev/null +++ b/kram-profile/CBA/Analysis.h @@ -0,0 +1,9 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense +#pragma once + +#include + +#include "BuildEvents.h" + +void DoAnalysis(const BuildEvents& events, BuildNames& names, std::string& out); diff --git a/kram-profile/CBA/Arena.cpp b/kram-profile/CBA/Arena.cpp new file mode 100755 index 00000000..c5d69e75 --- /dev/null +++ b/kram-profile/CBA/Arena.cpp @@ -0,0 +1,49 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense + +#include +#include +#include + +struct ArenaBlock +{ + uint8_t* buffer; + size_t bufferSize; + size_t used; +}; + +static std::vector s_Blocks; + +const size_t kDefaultBlockSize = 65536; + + +void ArenaInitialize() +{ +} + +void ArenaDelete() +{ + for (auto& b : s_Blocks) + delete[] b.buffer; + s_Blocks.clear(); +} + +void* ArenaAllocate(size_t size) +{ + // do we need a new block? 
+ if (s_Blocks.empty() || s_Blocks.back().used + size > s_Blocks.back().bufferSize) + { + ArenaBlock block; + block.bufferSize = std::max(size, kDefaultBlockSize); + block.buffer = new uint8_t[block.bufferSize]; + block.used = 0; + s_Blocks.emplace_back(block); + } + + // allocate from the last block + ArenaBlock& b = s_Blocks.back(); + void* ptr = b.buffer + b.used; + b.used += size; + return ptr; +} + diff --git a/kram-profile/CBA/Arena.h b/kram-profile/CBA/Arena.h new file mode 100755 index 00000000..0656d73b --- /dev/null +++ b/kram-profile/CBA/Arena.h @@ -0,0 +1,7 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense +#pragma once + +void ArenaInitialize(); +void ArenaDelete(); +void* ArenaAllocate(size_t size); diff --git a/kram-profile/CBA/BuildEvents.cpp b/kram-profile/CBA/BuildEvents.cpp new file mode 100755 index 00000000..36a9dec7 --- /dev/null +++ b/kram-profile/CBA/BuildEvents.cpp @@ -0,0 +1,680 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense +#include "BuildEvents.h" + +// This is for windows.h +//#ifndef NOMINMAX +//#define NOMINMAX +//#endif + +#include "Arena.h" +//#include "Colors.h" +#include "Utils.h" +//#include "external/cute_files.h" +//#include "external/flat_hash_map/bytell_hash_map.hpp" +//#include "external/llvm-Demangle/include/Demangle.h" +#include "simdjson.h" +//#include "external/xxHash/xxhash.h" +#include +#include +#include +#include + +// Taken out of kram +// case-sensitive fnv1a hash, can pass existing hash to continue a hash +inline uint32_t HashFnv1a(const char* val, uint32_t hash = 0x811c9dc5) { + const uint32_t prime = 0x01000193; // 16777619 (32-bit) + while (*val) { + hash = (hash * prime) ^ (uint32_t)*val++; + } + return hash; +} + +extern "C" const char* _Nullable demangleSymbolName(const char* _Nonnull symbolName_); + +struct HashedString +{ + explicit HashedString(const char* s) + { + len = strlen(s); + hash = HashFnv1a(s); // , len, 0); + str = s; + } + size_t hash; + size_t len; + const char* str; +}; +namespace std +{ + template<> struct hash + { + size_t operator()(const HashedString& v) const + { + return v.hash; + } + }; + template<> struct equal_to + { + bool operator()(const HashedString& a, const HashedString& b) const + { + return a.hash == b.hash && a.len == b.len && memcmp(a.str, b.str, a.len) == 0; + } + }; +} // namespace std + +typedef std::unordered_map NameToIndexMap; + +/* +static void DebugPrintEvents(const BuildEvents& events, const BuildNames& names) +{ + for (size_t i = 0; i < events.size(); ++i) + { + const BuildEvent& event = events[EventIndex(int(i))]; + const std::string_view namesSubstr = names[event.detailIndex].substr(0, 130); + printf("%4zi: t=%i t1=%7lld t2=%7lld par=%4i ch=%4zi det=%.*s\n", i, (int) event.type, event.ts, event.ts+event.dur, event.parent.idx, event.children.size(), (int)namesSubstr.size(), namesSubstr.data()); + } +} +*/ + +static void FindParentChildrenIndices(BuildEvents& events) +{ + if (events.empty()) + return; + + // sort events by start time so that parent events go before child events + std::vector sortedIndices; + sortedIndices.resize(events.size()); + for (int i = 0, n = (int)events.size(); i != n; ++i) + sortedIndices[i] = EventIndex(i); + std::sort(sortedIndices.begin(), sortedIndices.end(), [&](EventIndex ia, EventIndex ib){ + const auto& ea = events[ia]; + const auto& eb = events[ib]; + if (ea.ts != eb.ts) + return ea.ts < eb.ts; + // break start time ties by 
making longer events go first (they must be parent) + if (ea.dur != eb.dur) + return ea.dur > eb.dur; + // break ties by assuming that later events in sequence must start parent + return ia > ib; + }); + + // figure out the event hierarchy; for now the parent/child indices are into + // the "sortedIndices" array and not event indices in the "events" array. + // As a result, we will be digging into .idx members a lot, as we are temporarily + // putting the wrong kind of index into 'parent'. + int root = 0; + BuildEvent* evRoot = &events[sortedIndices[root]]; + evRoot->parent.idx = -1; + for (int i = 1, n = (int)events.size(); i != n; ++i) + { + BuildEvent* ev2 = &events[sortedIndices[i]]; + while (root != -1) + { + // add slice if within bounds + if (ev2->ts >= evRoot->ts && ev2->ts+ev2->dur <= evRoot->ts+evRoot->dur) + { + ev2->parent.idx = root; + evRoot->children.push_back(EventIndex(i)); + break; + } + + root = evRoot->parent.idx; + if (root != -1) + evRoot = &events[sortedIndices[root]]; + } + if (root == -1) + { + ev2->parent.idx = -1; + } + root = i; + evRoot = &events[sortedIndices[i]]; + } + + // fixup event parent/child indices to be into "events" array + for (auto& e : events) + { + for (auto& c : e.children) + c = sortedIndices[c.idx]; + if (e.parent.idx != -1) + e.parent = sortedIndices[e.parent.idx]; + } + +#ifndef NDEBUG + for (int i = 0, n = (int)events.size(); i != n; ++i) + { + assert(i != events[EventIndex(i)].parent.idx); + } +#endif +} + +struct BuildEventsParser +{ + BuildEventsParser() + { + // make sure zero index is empty + NameToIndex("", resultNameToIndex); + resultNames.push_back(std::string_view(resultNameToIndex.begin()->first.str, 0)); + + resultEvents.reserve(2048); + resultNames.reserve(2048); + } + + BuildEvents resultEvents; + BuildNames resultNames; + NameToIndexMap resultNameToIndex; + std::mutex resultMutex; + std::mutex arenaMutex; + + void AddEvents(BuildEvents& add, const NameToIndexMap& nameToIndex) + { + // we got job-local build events and name-to-index mapping; + // add them to the global result with any necessary remapping. + // gotta take a mutex since we're modifying shared state here. + std::scoped_lock lock(resultMutex); + + // move events to end of result events list + int offset = (int)resultEvents.size(); + std::move(add.begin(), add.end(), std::back_inserter(resultEvents)); + add.clear(); + + // create remapping from name indices, adding them to global remapping + // list if necessary. 
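+        // (a DetailIndex is only meaningful relative to one name table, so indices
+        //  from the per-file table have to be translated into the merged table)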
+ std::unordered_map detailRemap; + for (const auto& kvp : nameToIndex) + { + const auto& existing = resultNameToIndex.find(kvp.first); + if (existing == resultNameToIndex.end()) + { + DetailIndex index((int)resultNameToIndex.size()); + resultNameToIndex.insert(std::make_pair(kvp.first, index)); + resultNames.push_back(std::string_view(kvp.first.str, kvp.first.len)); + detailRemap[kvp.second] = index; + } + else + { + detailRemap[kvp.second] = existing->second; + } + } + + // adjust the added event indices + for (size_t i = offset, n = resultEvents.size(); i != n; ++i) + { + BuildEvent& ev = resultEvents[EventIndex(int(i))]; + if (ev.parent.idx >= 0) + ev.parent.idx += offset; + for (auto& ch : ev.children) + ch.idx += offset; + if (ev.detailIndex.idx != 0) + { + assert(ev.detailIndex.idx >= 0); + assert(ev.detailIndex.idx < static_cast(nameToIndex.size())); + ev.detailIndex = detailRemap[ev.detailIndex]; + assert(ev.detailIndex.idx >= 0); + assert(ev.detailIndex.idx < static_cast(resultNameToIndex.size())); + } + } + + assert(resultNameToIndex.size() == resultNames.size()); + } + + + DetailIndex NameToIndex(const char* str, NameToIndexMap& nameToIndex) + { + HashedString hashedName(str); + auto it = nameToIndex.find(hashedName); + if (it != nameToIndex.end()) + return it->second; + + char* strCopy; + { + // arena allocator is not thread safe, take a mutex + std::scoped_lock lock(arenaMutex); + strCopy = (char*)ArenaAllocate(hashedName.len+1); + } + memcpy(strCopy, str, hashedName.len+1); + hashedName.str = strCopy; + + DetailIndex index((int)nameToIndex.size()); + nameToIndex.insert(std::make_pair(hashedName, index)); + return index; + } + + bool ParseRoot(simdjson::dom::element& it, const std::string& curFileName) + { + simdjson::dom::element nit; + if (it["traceEvents"].get(nit)) + return false; + return ParseTraceEvents(nit, curFileName); + } + + bool ParseTraceEvents(simdjson::dom::element& it, const std::string& curFileName) + { + if (!it.is_array()) + return false; + + NameToIndexMap nameToIndexLocal; + NameToIndex("", nameToIndexLocal); // make sure zero index is empty + BuildEvents fileEvents; + fileEvents.reserve(256); + for (simdjson::dom::element nit : it) + { + ParseEvent(nit, curFileName, fileEvents, nameToIndexLocal); + } + if (fileEvents.empty()) + return false; + + FindParentChildrenIndices(fileEvents); + if (fileEvents.back().parent.idx != -1) + { + //printf("%sWARN: the last trace event should be root; was not in '%s'.%s\n", col::kRed, curFileName.c_str(), col::kReset); + return false; + } + AddEvents(fileEvents, nameToIndexLocal); + return true; + } + + static bool StrEqual(std::string_view a, const char* b) + { + return a == b; + } + + static bool StartsWith(std::string_view a, const char* b, int blen) + { + return static_cast(a.size()) >= blen && a.compare(0, blen, b) == 0; + } + + const char* kPid = "pid"; + const char* kTid = "tid"; + const char* kPh = "ph"; + const char* kName = "name"; + const char* kTs = "ts"; + const char* kDur = "dur"; + const char* kArgs = "args"; + const char* kDetail = "detail"; + + void ParseEvent(simdjson::dom::element& it, const std::string& curFileName, BuildEvents& fileEvents, NameToIndexMap& nameToIndexLocal) + { + simdjson::dom::object node; + if (it.get(node)) + { + //printf("%sERROR: 'traceEvents' elements in JSON should be objects.%s\n", col::kRed, col::kReset); + resultEvents.clear(); + return; + } + + BuildEvent event; + bool valid = true; + std::string_view detailPtr; + for (simdjson::dom::key_value_pair kv : node) + { + 
std::string_view nodeKey = kv.key; + if (StrEqual(nodeKey, kPid)) + { + if (!kv.value.is_int64()) // starting with Clang/LLVM 11 process IDs are not necessarily 1 + valid = false; + } + else if (StrEqual(nodeKey, kTid)) + { + if (!kv.value.is_int64()) // starting with Clang/LLVM 11 thread IDs are not necessarily 0 + valid = false; + } + else if (StrEqual(nodeKey, kPh)) + { + if (!kv.value.is_string() || !StrEqual(kv.value.get_string(), "X")) + valid = false; + } + else if (StrEqual(nodeKey, kName) && kv.value.is_string() && valid) + { + std::string_view name = kv.value.get_string(); + if (StrEqual(name, "ExecuteCompiler")) + event.type = BuildEventType::kCompiler; + else if (StrEqual(name, "Frontend")) + event.type = BuildEventType::kFrontend; + else if (StrEqual(name, "Backend")) + event.type = BuildEventType::kBackend; + else if (StrEqual(name, "Source")) + event.type = BuildEventType::kParseFile; + else if (StrEqual(name, "ParseTemplate")) + event.type = BuildEventType::kParseTemplate; + else if (StrEqual(name, "ParseClass")) + event.type = BuildEventType::kParseClass; + else if (StrEqual(name, "InstantiateClass")) + event.type = BuildEventType::kInstantiateClass; + else if (StrEqual(name, "InstantiateFunction")) + event.type = BuildEventType::kInstantiateFunction; + else if (StrEqual(name, "OptModule")) + event.type = BuildEventType::kOptModule; + else if (StrEqual(name, "OptFunction")) + event.type = BuildEventType::kOptFunction; + } + else if (StrEqual(nodeKey, kTs)) + { + if (kv.value.is_int64()) + event.ts = kv.value.get_int64(); + else + valid = false; + } + else if (StrEqual(nodeKey, kDur)) + { + if (kv.value.is_int64()) + event.dur = kv.value.get_int64(); + else + valid = false; + } + else if (StrEqual(nodeKey, kArgs)) + { + if (kv.value.is_object()) + { + simdjson::dom::object kvo(kv.value); + simdjson::dom::key_value_pair args = *kvo.begin(); + if (args.value.is_string()) + detailPtr = args.value.get_string(); + } + } + }; + + if (event.type== BuildEventType::kUnknown || !valid) + return; + + // if the "compiler" event has no detail name, use the current json file name + if (detailPtr.empty() && event.type == BuildEventType::kCompiler) + detailPtr = curFileName; + if (!detailPtr.empty()) + { + std::string detailString; + if (event.type == BuildEventType::kParseFile || event.type == BuildEventType::kOptModule) + { + // do various cleanups/nice-ifications of the detail name: + // make paths shorter (i.e. 
relative to project) where possible + detailString = utils::GetNicePath(detailPtr); + + // switch json to .o or .obj (or .cpp) + if (utils::EndsWith(detailString, ".json")) + { + detailString = std::string(detailString.substr(0, detailString.length()-4)) + "o"; + } + } + else + { + detailString = detailPtr; + + // Use the kram demangle + // clang needs to fix this, since Win clang symbols don't demangle using macOS demangle + if (event.type == BuildEventType::kOptFunction) + { + const char* demangledName = demangleSymbolName(detailString.c_str()); + if (demangledName != nullptr) + detailString = demangledName; + } + } + + + /* don't do this + // don't report the clang trace .json file, instead get the object file at the same location if it's there + if (utils::EndsWith(detailString, ".json")) + { + std::string candidate = std::string(detailString.substr(0, detailString.length()-4)) + "o"; + // check for .o + if (cf_file_exists(candidate.c_str())) + detailString = candidate; + else + { + // check for .obj + candidate += "bj"; + if (cf_file_exists(candidate.c_str())) + detailString = candidate; + } + } + + // TODO: may need to demangle again + // demangle possibly mangled names + if (event.type == BuildEventType::kOptFunction) + detailString = llvm::demangle(detailString); + */ + + event.detailIndex = NameToIndex(detailString.c_str(), nameToIndexLocal); + } + + fileEvents.emplace_back(event); + } +}; + +BuildEventsParser* CreateBuildEventsParser() +{ + BuildEventsParser* p = new BuildEventsParser(); + return p; +} +void DeleteBuildEventsParser(BuildEventsParser* parser) +{ + delete parser; +} + +BuildEvents& GetBuildEvents(BuildEventsParser& parser) +{ + return parser.resultEvents; +} +BuildNames& GetBuildNames(BuildEventsParser& parser) +{ + return parser.resultNames; +} + +bool ParseBuildEvents(BuildEventsParser* parser, const uint8_t* buf, size_t bufSize, const std::string& fileName) +{ + using namespace simdjson; + dom::parser p; + dom::element doc; + auto error = p.parse(buf, bufSize).get(doc); + if (error) + { +// printf("%sWARN: JSON parse error in %s: %s.%s\n", col::kYellow, fileName.c_str(), error_message(error), col::kReset); + return false; + } + + return parser->ParseRoot(doc, fileName); + //DebugPrintEvents(outEvents, outNames); +} + +/* +struct BufferedWriter +{ + BufferedWriter(FILE* f) + : size(0) + , file(f) + { + hasher = XXH64_createState(); + XXH64_reset(hasher, 0); + } + ~BufferedWriter() + { + Flush(); + XXH64_hash_t hash = XXH64_digest(hasher); + fwrite(&hash, sizeof(hash), 1, file); + fclose(file); + XXH64_freeState(hasher); + } + + template void Write(const T& t) + { + Write(&t, sizeof(t)); + } + void Write(const void* ptr, size_t sz) + { + if (sz == 0) return; + if (sz >= kBufferSize) + { + if( size > 0 ) + { + Flush(); + } + + XXH64_update(hasher, ptr, sz); + fwrite(ptr, sz, 1, file); + return; + } + if (sz + size > kBufferSize) + Flush(); + memcpy(&buffer[size], ptr, sz); + size += sz; + } + + + void Flush() + { + fwrite(buffer, size, 1, file); + XXH64_update(hasher, buffer, size); + size = 0; + } + + enum { kBufferSize = 65536 }; + uint8_t buffer[kBufferSize]; + size_t size; + FILE* file; + XXH64_state_t* hasher; +}; + +struct BufferedReader +{ + BufferedReader(FILE* f) + : pos(0) + { + fseek(f, 0, SEEK_END); + size_t fsize = ftello64(f); + fseek(f, 0, SEEK_SET); + buffer = new uint8_t[fsize]; + bufferSize = fsize; + fread(buffer, bufferSize, 1, f); + fclose(f); + } + ~BufferedReader() + { + delete[] buffer; + } + + template void Read(T& t) + { + Read(&t, 
sizeof(t)); + } + void Read(void* ptr, size_t sz) + { + if (pos + sz > bufferSize) + { + memset(ptr, 0, sz); + return; + } + memcpy(ptr, &buffer[pos], sz); + pos += sz; + } + + uint8_t* buffer; + size_t pos; + size_t bufferSize; +}; + +const uint32_t kFileMagic = 'CBA0'; + +bool SaveBuildEvents(BuildEventsParser* parser, const std::string& fileName) +{ + FILE* f = fopen(fileName.c_str(), "wb"); + if (f == nullptr) + { + printf("%sERROR: failed to save to file '%s'%s\n", col::kRed, fileName.c_str(), col::kReset); + return false; + } + + BufferedWriter w(f); + + w.Write(kFileMagic); + int64_t eventsCount = parser->resultEvents.size(); + w.Write(eventsCount); + for(const auto& e : parser->resultEvents) + { + int32_t eType = (int32_t)e.type; + w.Write(eType); + w.Write(e.ts); + w.Write(e.dur); + w.Write(e.detailIndex.idx); + w.Write(e.parent.idx); + int64_t childCount = e.children.size(); + w.Write(childCount); + w.Write(e.children.data(), childCount * sizeof(e.children[0])); + } + + int64_t namesCount = parser->resultNames.size(); + w.Write(namesCount); + for(const auto& n : parser->resultNames) + { + uint32_t nSize = (uint32_t)n.size(); + w.Write(nSize); + w.Write(n.data(), nSize); + } + + return true; +} + +bool LoadBuildEvents(const std::string& fileName, BuildEvents& outEvents, BuildNames& outNames) +{ + FILE* f = fopen(fileName.c_str(), "rb"); + if (f == nullptr) + { + printf("%sERROR: failed to open file '%s'%s\n", col::kRed, fileName.c_str(), col::kReset); + return false; + } + + BufferedReader r(f); + if (r.bufferSize < 12) // 4 bytes magic header, 8 bytes hash at end + { + printf("%sERROR: corrupt input file '%s' (size too small)%s\n", col::kRed, fileName.c_str(), col::kReset); + return false; + } + // check header magic + int32_t magic = 0; + r.Read(magic); + if (magic != kFileMagic) + { + printf("%sERROR: unknown format of input file '%s'%s\n", col::kRed, fileName.c_str(), col::kReset); + return false; + } + // chech hash checksum + XXH64_hash_t hash = XXH64(r.buffer, r.bufferSize-sizeof(XXH64_hash_t), 0); + if (memcmp(&hash, r.buffer+r.bufferSize-sizeof(XXH64_hash_t), sizeof(XXH64_hash_t)) != 0) + { + printf("%sERROR: corrupt input file '%s' (checksum mismatch)%s\n", col::kRed, fileName.c_str(), col::kReset); + return false; + } + + int64_t eventsCount = 0; + r.Read(eventsCount); + outEvents.resize(eventsCount); + for(auto& e : outEvents) + { + int32_t eType; + r.Read(eType); + e.type = (BuildEventType)eType; + r.Read(e.ts); + r.Read(e.dur); + r.Read(e.detailIndex.idx); + r.Read(e.parent.idx); + int64_t childCount = 0; + r.Read(childCount); + e.children.resize(childCount); + if (childCount != 0) + r.Read(&e.children[0], childCount * sizeof(e.children[0])); + } + + int64_t namesCount = 0; + r.Read(namesCount); + outNames.resize(namesCount); + for(auto& n : outNames) + { + uint32_t nSize = 0; + r.Read(nSize); + char* ptr = (char*)ArenaAllocate(nSize+1); + memset(ptr, 0, nSize+1); + n = std::string_view(ptr, nSize); + if (nSize != 0) + r.Read(ptr, nSize); + } + + return true; +} +*/ diff --git a/kram-profile/CBA/BuildEvents.h b/kram-profile/CBA/BuildEvents.h new file mode 100755 index 00000000..79e15258 --- /dev/null +++ b/kram-profile/CBA/BuildEvents.h @@ -0,0 +1,121 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense +#pragma once +#define _CRT_SECURE_NO_WARNINGS +#include +#include +#include +#include +#include + + +//#ifdef _MSC_VER +//#define ftello64 _ftelli64 +//#elif defined(__APPLE__) +//#define ftello64 ftello 
+//#endif
+
+
+enum class BuildEventType
+{
+    kUnknown,
+    kCompiler,
+    kFrontend,
+    kBackend,
+    kParseFile,
+    kParseTemplate,
+    kParseClass,
+    kInstantiateClass,
+    kInstantiateFunction,
+    kOptModule,
+    kOptFunction,
+};
+
+struct DetailIndex
+{
+    int idx;
+    explicit DetailIndex(int d = 0) : idx(d) {}
+
+#if __cplusplus >= 202002L
+    // C++20 can autogen most of these from the <=> operator
+    auto operator<=>(const DetailIndex& rhs) const = default;
+#else
+    bool operator==(DetailIndex rhs) const { return idx == rhs.idx; }
+    bool operator!=(DetailIndex rhs) const { return idx != rhs.idx; }
+    bool operator<(DetailIndex rhs) const { return idx < rhs.idx; }
+    bool operator>(DetailIndex rhs) const { return idx > rhs.idx; }
+    bool operator<=(DetailIndex rhs) const { return idx <= rhs.idx; }
+    bool operator>=(DetailIndex rhs) const { return idx >= rhs.idx; }
+#endif
+};
+
+struct EventIndex
+{
+    int idx;
+    explicit EventIndex(int e = -1) : idx(e) {}
+
+#if __cplusplus >= 202002L
+    // C++20 can autogen most of these from the <=> operator
+    auto operator<=>(const EventIndex& rhs) const = default;
+#else
+    bool operator==(EventIndex rhs) const { return idx == rhs.idx; }
+    bool operator!=(EventIndex rhs) const { return idx != rhs.idx; }
+    bool operator<(EventIndex rhs) const { return idx < rhs.idx; }
+    bool operator>(EventIndex rhs) const { return idx > rhs.idx; }
+    bool operator<=(EventIndex rhs) const { return idx <= rhs.idx; }
+    bool operator>=(EventIndex rhs) const { return idx >= rhs.idx; }
+#endif
+};
+
+namespace std
+{
+    template <> struct hash<DetailIndex>
+    {
+        size_t operator()(DetailIndex x) const
+        {
+            return hash<int>()(x.idx);
+        }
+    };
+    template <> struct hash<EventIndex>
+    {
+        size_t operator()(EventIndex x) const
+        {
+            return hash<int>()(x.idx);
+        }
+    };
+}
+
+struct BuildEvent
+{
+    BuildEventType type = BuildEventType::kUnknown;
+    int64_t ts = 0;
+    int64_t dur = 0;
+    DetailIndex detailIndex;
+    EventIndex parent{ -1 };
+    std::vector<EventIndex> children;
+};
+
+template <typename T, typename Idx>
+struct IndexedVector : std::vector<T>
+{
+    using std::vector<T>::vector;
+    typename std::vector<T>::reference operator[](Idx pos) { return this->begin()[pos.idx]; }
+    typename std::vector<T>::const_reference operator[](Idx pos) const { return this->begin()[pos.idx]; }
+};
+typedef IndexedVector<std::string_view, DetailIndex> BuildNames;
+typedef IndexedVector<BuildEvent, EventIndex> BuildEvents;
+
+struct BuildEventsParser;
+BuildEventsParser* CreateBuildEventsParser();
+void DeleteBuildEventsParser(BuildEventsParser* parser);
+
+// NOTE: can be called in parallel
+bool ParseBuildEvents(BuildEventsParser* parser, const uint8_t* buf, size_t bufSize, const std::string& fileName);
+
+//bool SaveBuildEvents(BuildEventsParser* parser, const std::string& fileName);
+//
+//bool LoadBuildEvents(const std::string& fileName, BuildEvents& outEvents, BuildNames& outNames);
+
+BuildEvents& GetBuildEvents(BuildEventsParser& parser);
+BuildNames& GetBuildNames(BuildEventsParser& parser);
+
diff --git a/kram-profile/CBA/CBA.h b/kram-profile/CBA/CBA.h
new file mode 100644
index 00000000..d8d48bcb
--- /dev/null
+++ b/kram-profile/CBA/CBA.h
@@ -0,0 +1,16 @@
+#import "Foundation/Foundation.h"
+
+@interface CBA : NSObject
+
+- (_Nonnull instancetype)init;
+- (void)deinit;
+
+// Can parseAll or one file at a time
+- (void)parse:(NSData* _Nonnull)file filename:(NSString* _Nonnull)filename;
+- (void)parseAll:(NSArray<NSData*> * _Nonnull)files filenames:(NSArray<NSString*> * _Nonnull)filenames;
+
+// This isn't so useful, since need specific files to parse
+- (NSString* _Nonnull)analyzeAll;
+- (NSString* _Nonnull)analyze:(NSArray<NSString*> * _Nonnull)filenames;
+
+@end
diff --git
a/kram-profile/CBA/CBA.mm b/kram-profile/CBA/CBA.mm new file mode 100644 index 00000000..f5f48cef --- /dev/null +++ b/kram-profile/CBA/CBA.mm @@ -0,0 +1,82 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense + +#import "CBA.h" + +// This is for windows.h +//#define _CRT_SECURE_NO_WARNINGS +//#define NOMINMAX + +#include "Analysis.h" +#include "Arena.h" +#include "BuildEvents.h" +//#include "Colors.h" +#include "Utils.h" + +#include +#include +#include +#include +#include +#include + +//#ifdef _MSC_VER +//struct IUnknown; // workaround for old Win SDK header failures when using /permissive- +//#endif + + +@implementation CBA { + BuildEventsParser* parser; +} + +- (_Nonnull instancetype)init { + ArenaInitialize(); + + parser = CreateBuildEventsParser(); + + return self; +} + +- (void)deinit { + // Shutdown the parser + DeleteBuildEventsParser(parser); + parser = nullptr; + + ArenaDelete(); +} + +// This is bad because it runs single-threaded, and doesn't cache anything across builds. +// TODO: restructure, so parser is built once +// feed files to it individually, and then request analysis on a few of the events/names +// TODO: reformat output to Perfetto json, can then display it visually. +- (void)parseAll:(NSArray * _Nonnull)files filenames:(NSArray * _Nonnull)filenames +{ + for (uint32_t i = 0; i < files.count; ++i) { + [self parse:files[i] filename:filenames[i]]; + } +} + +- (void)parse:(NSData* _Nonnull)file filename:(NSString* _Nonnull)filename { + const char* filename_ = [filename UTF8String]; + ParseBuildEvents(parser, (const uint8_t*)file.bytes, file.length, filename_); +} + + +- (NSString* _Nonnull)analyzeAll { + // Run the analysis on data from the parser. + std::string out; + DoAnalysis(GetBuildEvents(*parser), GetBuildNames(*parser), out); + + return [NSString stringWithUTF8String:out.c_str()]; +} + +- (NSString* _Nonnull)analyze:(NSArray * _Nonnull)filenames { + // Run the analysis on data from the parser. + std::string out; + DoAnalysis(GetBuildEvents(*parser), GetBuildNames(*parser), out); + + return [NSString stringWithUTF8String:out.c_str()]; +} + +@end + diff --git a/kram-profile/CBA/Utils.cpp b/kram-profile/CBA/Utils.cpp new file mode 100755 index 00000000..851a0e21 --- /dev/null +++ b/kram-profile/CBA/Utils.cpp @@ -0,0 +1,93 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense +#include "Utils.h" + +//#include "external/cwalk/cwalk.h" +#include + +inline char ToLower(char c) { return (c >= 'A' && c <= 'Z') ? (c + 'a' - 'A') : c; } + +/* +inline char ToUpper(char c) { return (c >= 'a' && c <= 'z') ? 
(c - ('a' - 'A')) : c; } + +void utils::Lowercase(std::string& path) +{ + for (size_t i = 0, n = path.size(); i != n; ++i) + path[i] = ToLower(path[i]); +} + + +bool utils::BeginsWith(const std::string& str, const std::string& prefix) +{ + if (str.size() < prefix.size()) + return false; + for (size_t i = 0, n = prefix.size(); i != n; ++i) + { + char c1 = ToLower(str[i]); + char c2 = ToLower(prefix[i]); + if (c1 != c2) + return false; + } + return true; +} +*/ + +bool utils::EndsWith(const std::string_view& str, const std::string& suffix) +{ + if (str.size() < suffix.size()) + return false; + size_t start = str.size() - suffix.size(); + for (size_t i = 0, n = suffix.size(); i != n; ++i) + { + char c1 = ToLower(str[i+start]); + char c2 = ToLower(suffix[i]); + if (c1 != c2) + return false; + } + return true; +} + +bool utils::IsHeader(std::string_view path) +{ + path = utils::GetFilename(path); + size_t dot = path.rfind('.'); + if (dot == std::string::npos) + return true; // no extension is likely a header, e.g. + + size_t len = path.size(); + if (dot + 1 < len && (ToLower(path[dot + 1]) == 'h' || ToLower(path[dot + 1]) == 'i')) + return true; // extension starting with .h or .i (h, hpp, hxx, inc etc) likely a header + + return false; +} + +std::string utils::GetNicePath(const std::string_view& path) +{ + char input[FILENAME_MAX]; + size_t len = std::min(path.size(), FILENAME_MAX-1); + memcpy(input, path.data(), len); + input[len] = 0; + char result[FILENAME_MAX]; + + // kram: skip the normalization + // cwk_path_normalize(input, result, sizeof(result)); + strlcpy(result, input, sizeof(result)); + + // convert to forward slashes + char *p = result; + while (*p) + { + if (*p == '\\') + *p = '/'; + ++p; + } + return result; +} + +std::string_view utils::GetFilename(const std::string_view& path) +{ + size_t dirIdx = path.rfind('/'); + if (dirIdx != std::string::npos) + return path.substr(dirIdx + 1, path.size() - dirIdx - 1); + return path; +} diff --git a/kram-profile/CBA/Utils.h b/kram-profile/CBA/Utils.h new file mode 100755 index 00000000..52f5e5f3 --- /dev/null +++ b/kram-profile/CBA/Utils.h @@ -0,0 +1,21 @@ +// Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer +// SPDX-License-Identifier: Unlicense +#pragma once +#include +#include + +namespace utils +{ + [[nodiscard]] std::string GetNicePath(const std::string_view& path); + [[nodiscard]] std::string_view GetFilename(const std::string_view& path); + + [[nodiscard]] bool IsHeader(std::string_view path); + + /* + void Lowercase(std::string& path); + + [[nodiscard]] bool BeginsWith(const std::string& str, const std::string& prefix); + */ + [[nodiscard]] bool EndsWith(const std::string_view& str, const std::string& suffix); + +} diff --git a/kram-profile/CBA/simdjson.cpp b/kram-profile/CBA/simdjson.cpp new file mode 100644 index 00000000..2b7b3132 --- /dev/null +++ b/kram-profile/CBA/simdjson.cpp @@ -0,0 +1,15984 @@ +/* auto-generated on 2022-10-16 16:59:15 +0000. Do not edit! */ +/* begin file src/simdjson.cpp */ +#include "simdjson.h" + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_UNDESIRED_WARNINGS + +/* begin file src/to_chars.cpp */ +#include +#include +#include +#include + +namespace simdjson { +namespace internal { +/*! +implements the Grisu2 algorithm for binary to decimal floating-point +conversion. +Adapted from JSON for Modern C++ + +This implementation is a slightly modified version of the reference +implementation which may be obtained from +http://florian.loitsch.com/publications (bench.tar.gz). 
+The code is distributed under the MIT license, Copyright (c) 2009 Florian +Loitsch. For a detailed description of the algorithm see: [1] Loitsch, "Printing +Floating-Point Numbers Quickly and Accurately with Integers", Proceedings of the +ACM SIGPLAN 2010 Conference on Programming Language Design and Implementation, +PLDI 2010 [2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and +Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on Programming +Language Design and Implementation, PLDI 1996 +*/ +namespace dtoa_impl { + +template +Target reinterpret_bits(const Source source) { + static_assert(sizeof(Target) == sizeof(Source), "size mismatch"); + + Target target; + std::memcpy(&target, &source, sizeof(Source)); + return target; +} + +struct diyfp // f * 2^e +{ + static constexpr int kPrecision = 64; // = q + + std::uint64_t f = 0; + int e = 0; + + constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {} + + /*! + @brief returns x - y + @pre x.e == y.e and x.f >= y.f + */ + static diyfp sub(const diyfp &x, const diyfp &y) noexcept { + + return {x.f - y.f, x.e}; + } + + /*! + @brief returns x * y + @note The result is rounded. (Only the upper q bits are returned.) + */ + static diyfp mul(const diyfp &x, const diyfp &y) noexcept { + static_assert(kPrecision == 64, "internal error"); + + // Computes: + // f = round((x.f * y.f) / 2^q) + // e = x.e + y.e + q + + // Emulate the 64-bit * 64-bit multiplication: + // + // p = u * v + // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi) + // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) + + // 2^64 (u_hi v_hi ) = (p0 ) + 2^32 ((p1 ) + (p2 )) + // + 2^64 (p3 ) = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + + // 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 ) = + // (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi + + // p2_hi + p3) = (p0_lo ) + 2^32 (Q ) + 2^64 (H ) = (p0_lo ) + + // 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H ) + // + // (Since Q might be larger than 2^32 - 1) + // + // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H) + // + // (Q_hi + H does not overflow a 64-bit int) + // + // = p_lo + 2^64 p_hi + + const std::uint64_t u_lo = x.f & 0xFFFFFFFFu; + const std::uint64_t u_hi = x.f >> 32u; + const std::uint64_t v_lo = y.f & 0xFFFFFFFFu; + const std::uint64_t v_hi = y.f >> 32u; + + const std::uint64_t p0 = u_lo * v_lo; + const std::uint64_t p1 = u_lo * v_hi; + const std::uint64_t p2 = u_hi * v_lo; + const std::uint64_t p3 = u_hi * v_hi; + + const std::uint64_t p0_hi = p0 >> 32u; + const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu; + const std::uint64_t p1_hi = p1 >> 32u; + const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu; + const std::uint64_t p2_hi = p2 >> 32u; + + std::uint64_t Q = p0_hi + p1_lo + p2_lo; + + // The full product might now be computed as + // + // p_hi = p3 + p2_hi + p1_hi + (Q >> 32) + // p_lo = p0_lo + (Q << 32) + // + // But in this particular case here, the full p_lo is not required. + // Effectively we only need to add the highest bit in p_lo to p_hi (and + // Q_hi + 1 does not overflow). + + Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up + + const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u); + + return {h, x.e + y.e + 64}; + } + + /*! + @brief normalize x such that the significand is >= 2^(q-1) + @pre x.f != 0 + */ + static diyfp normalize(diyfp x) noexcept { + + while ((x.f >> 63u) == 0) { + x.f <<= 1u; + x.e--; + } + + return x; + } + + /*! + @brief normalize x such that the result has the exponent E + @pre e >= x.e and the upper e - x.e bits of x.f must be zero. 
+ */ + static diyfp normalize_to(const diyfp &x, + const int target_exponent) noexcept { + const int delta = x.e - target_exponent; + + return {x.f << delta, target_exponent}; + } +}; + +struct boundaries { + diyfp w; + diyfp minus; + diyfp plus; +}; + +/*! +Compute the (normalized) diyfp representing the input number 'value' and its +boundaries. +@pre value must be finite and positive +*/ +template boundaries compute_boundaries(FloatType value) { + + // Convert the IEEE representation into a diyfp. + // + // If v is denormal: + // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1)) + // If v is normalized: + // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1)) + + static_assert(std::numeric_limits::is_iec559, + "internal error: dtoa_short requires an IEEE-754 " + "floating-point implementation"); + + constexpr int kPrecision = + std::numeric_limits::digits; // = p (includes the hidden bit) + constexpr int kBias = + std::numeric_limits::max_exponent - 1 + (kPrecision - 1); + constexpr int kMinExp = 1 - kBias; + constexpr std::uint64_t kHiddenBit = std::uint64_t{1} + << (kPrecision - 1); // = 2^(p-1) + + using bits_type = typename std::conditional::type; + + const std::uint64_t bits = reinterpret_bits(value); + const std::uint64_t E = bits >> (kPrecision - 1); + const std::uint64_t F = bits & (kHiddenBit - 1); + + const bool is_denormal = E == 0; + const diyfp v = is_denormal + ? diyfp(F, kMinExp) + : diyfp(F + kHiddenBit, static_cast(E) - kBias); + + // Compute the boundaries m- and m+ of the floating-point value + // v = f * 2^e. + // + // Determine v- and v+, the floating-point predecessor and successor if v, + // respectively. + // + // v- = v - 2^e if f != 2^(p-1) or e == e_min (A) + // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B) + // + // v+ = v + 2^e + // + // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_ + // between m- and m+ round to v, regardless of how the input rounding + // algorithm breaks ties. + // + // ---+-------------+-------------+-------------+-------------+--- (A) + // v- m- v m+ v+ + // + // -----------------+------+------+-------------+-------------+--- (B) + // v- m- v m+ v+ + + const bool lower_boundary_is_closer = F == 0 && E > 1; + const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1); + const diyfp m_minus = lower_boundary_is_closer + ? diyfp(4 * v.f - 1, v.e - 2) // (B) + : diyfp(2 * v.f - 1, v.e - 1); // (A) + + // Determine the normalized w+ = m+. + const diyfp w_plus = diyfp::normalize(m_plus); + + // Determine w- = m- such that e_(w-) = e_(w+). + const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e); + + return {diyfp::normalize(v), w_minus, w_plus}; +} + +// Given normalized diyfp w, Grisu needs to find a (normalized) cached +// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies +// within a certain range [alpha, gamma] (Definition 3.2 from [1]) +// +// alpha <= e = e_c + e_w + q <= gamma +// +// or +// +// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q +// <= f_c * f_w * 2^gamma +// +// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies +// +// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma +// +// or +// +// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma) +// +// The choice of (alpha,gamma) determines the size of the table and the form of +// the digit generation procedure. 
Using (alpha,gamma)=(-60,-32) works out well +// in practice: +// +// The idea is to cut the number c * w = f * 2^e into two parts, which can be +// processed independently: An integral part p1, and a fractional part p2: +// +// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e +// = (f div 2^-e) + (f mod 2^-e) * 2^e +// = p1 + p2 * 2^e +// +// The conversion of p1 into decimal form requires a series of divisions and +// modulos by (a power of) 10. These operations are faster for 32-bit than for +// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be +// achieved by choosing +// +// -e >= 32 or e <= -32 := gamma +// +// In order to convert the fractional part +// +// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ... +// +// into decimal form, the fraction is repeatedly multiplied by 10 and the digits +// d[-i] are extracted in order: +// +// (10 * p2) div 2^-e = d[-1] +// (10 * p2) mod 2^-e = d[-2] / 10^1 + ... +// +// The multiplication by 10 must not overflow. It is sufficient to choose +// +// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64. +// +// Since p2 = f mod 2^-e < 2^-e, +// +// -e <= 60 or e >= -60 := alpha + +constexpr int kAlpha = -60; +constexpr int kGamma = -32; + +struct cached_power // c = f * 2^e ~= 10^k +{ + std::uint64_t f; + int e; + int k; +}; + +/*! +For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached +power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c +satisfies (Definition 3.2 from [1]) + alpha <= e_c + e + q <= gamma. +*/ +inline cached_power get_cached_power_for_binary_exponent(int e) { + // Now + // + // alpha <= e_c + e + q <= gamma (1) + // ==> f_c * 2^alpha <= c * 2^e * 2^q + // + // and since the c's are normalized, 2^(q-1) <= f_c, + // + // ==> 2^(q - 1 + alpha) <= c * 2^(e + q) + // ==> 2^(alpha - e - 1) <= c + // + // If c were an exact power of ten, i.e. c = 10^k, one may determine k as + // + // k = ceil( log_10( 2^(alpha - e - 1) ) ) + // = ceil( (alpha - e - 1) * log_10(2) ) + // + // From the paper: + // "In theory the result of the procedure could be wrong since c is rounded, + // and the computation itself is approximated [...]. In practice, however, + // this simple function is sufficient." + // + // For IEEE double precision floating-point numbers converted into + // normalized diyfp's w = f * 2^e, with q = 64, + // + // e >= -1022 (min IEEE exponent) + // -52 (p - 1) + // -52 (p - 1, possibly normalize denormal IEEE numbers) + // -11 (normalize the diyfp) + // = -1137 + // + // and + // + // e <= +1023 (max IEEE exponent) + // -52 (p - 1) + // -11 (normalize the diyfp) + // = 960 + // + // This binary exponent range [-1137,960] results in a decimal exponent + // range [-307,324]. One does not need to store a cached power for each + // k in this range. For each such k it suffices to find a cached power + // such that the exponent of the product lies in [alpha,gamma]. + // This implies that the difference of the decimal exponents of adjacent + // table entries must be less than or equal to + // + // floor( (gamma - alpha) * log_10(2) ) = 8. + // + // (A smaller distance gamma-alpha would require a larger table.) + + // NB: + // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34. 
+ + constexpr int kCachedPowersMinDecExp = -300; + constexpr int kCachedPowersDecStep = 8; + + static constexpr std::array kCachedPowers = {{ + {0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292}, + {0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276}, + {0xD3515C2831559A83, -954, -268}, {0x9D71AC8FADA6C9B5, -927, -260}, + {0xEA9C227723EE8BCB, -901, -252}, {0xAECC49914078536D, -874, -244}, + {0x823C12795DB6CE57, -847, -236}, {0xC21094364DFB5637, -821, -228}, + {0x9096EA6F3848984F, -794, -220}, {0xD77485CB25823AC7, -768, -212}, + {0xA086CFCD97BF97F4, -741, -204}, {0xEF340A98172AACE5, -715, -196}, + {0xB23867FB2A35B28E, -688, -188}, {0x84C8D4DFD2C63F3B, -661, -180}, + {0xC5DD44271AD3CDBA, -635, -172}, {0x936B9FCEBB25C996, -608, -164}, + {0xDBAC6C247D62A584, -582, -156}, {0xA3AB66580D5FDAF6, -555, -148}, + {0xF3E2F893DEC3F126, -529, -140}, {0xB5B5ADA8AAFF80B8, -502, -132}, + {0x87625F056C7C4A8B, -475, -124}, {0xC9BCFF6034C13053, -449, -116}, + {0x964E858C91BA2655, -422, -108}, {0xDFF9772470297EBD, -396, -100}, + {0xA6DFBD9FB8E5B88F, -369, -92}, {0xF8A95FCF88747D94, -343, -84}, + {0xB94470938FA89BCF, -316, -76}, {0x8A08F0F8BF0F156B, -289, -68}, + {0xCDB02555653131B6, -263, -60}, {0x993FE2C6D07B7FAC, -236, -52}, + {0xE45C10C42A2B3B06, -210, -44}, {0xAA242499697392D3, -183, -36}, + {0xFD87B5F28300CA0E, -157, -28}, {0xBCE5086492111AEB, -130, -20}, + {0x8CBCCC096F5088CC, -103, -12}, {0xD1B71758E219652C, -77, -4}, + {0x9C40000000000000, -50, 4}, {0xE8D4A51000000000, -24, 12}, + {0xAD78EBC5AC620000, 3, 20}, {0x813F3978F8940984, 30, 28}, + {0xC097CE7BC90715B3, 56, 36}, {0x8F7E32CE7BEA5C70, 83, 44}, + {0xD5D238A4ABE98068, 109, 52}, {0x9F4F2726179A2245, 136, 60}, + {0xED63A231D4C4FB27, 162, 68}, {0xB0DE65388CC8ADA8, 189, 76}, + {0x83C7088E1AAB65DB, 216, 84}, {0xC45D1DF942711D9A, 242, 92}, + {0x924D692CA61BE758, 269, 100}, {0xDA01EE641A708DEA, 295, 108}, + {0xA26DA3999AEF774A, 322, 116}, {0xF209787BB47D6B85, 348, 124}, + {0xB454E4A179DD1877, 375, 132}, {0x865B86925B9BC5C2, 402, 140}, + {0xC83553C5C8965D3D, 428, 148}, {0x952AB45CFA97A0B3, 455, 156}, + {0xDE469FBD99A05FE3, 481, 164}, {0xA59BC234DB398C25, 508, 172}, + {0xF6C69A72A3989F5C, 534, 180}, {0xB7DCBF5354E9BECE, 561, 188}, + {0x88FCF317F22241E2, 588, 196}, {0xCC20CE9BD35C78A5, 614, 204}, + {0x98165AF37B2153DF, 641, 212}, {0xE2A0B5DC971F303A, 667, 220}, + {0xA8D9D1535CE3B396, 694, 228}, {0xFB9B7CD9A4A7443C, 720, 236}, + {0xBB764C4CA7A44410, 747, 244}, {0x8BAB8EEFB6409C1A, 774, 252}, + {0xD01FEF10A657842C, 800, 260}, {0x9B10A4E5E9913129, 827, 268}, + {0xE7109BFBA19C0C9D, 853, 276}, {0xAC2820D9623BF429, 880, 284}, + {0x80444B5E7AA7CF85, 907, 292}, {0xBF21E44003ACDD2D, 933, 300}, + {0x8E679C2F5E44FF8F, 960, 308}, {0xD433179D9C8CB841, 986, 316}, + {0x9E19DB92B4E31BA9, 1013, 324}, + }}; + + // This computation gives exactly the same results for k as + // k = ceil((kAlpha - e - 1) * 0.30102999566398114) + // for |e| <= 1500, but doesn't require floating-point operations. + // NB: log_10(2) ~= 78913 / 2^18 + const int f = kAlpha - e - 1; + const int k = (f * 78913) / (1 << 18) + static_cast(f > 0); + + const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / + kCachedPowersDecStep; + + const cached_power cached = kCachedPowers[static_cast(index)]; + + return cached; +} + +/*! +For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k. +For n == 0, returns 1 and sets pow10 := 1. 
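+For example, n = 345 gives k = 3 and pow10 = 100, since 10^2 <= 345 < 10^3.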
+*/ +inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) { + // LCOV_EXCL_START + if (n >= 1000000000) { + pow10 = 1000000000; + return 10; + } + // LCOV_EXCL_STOP + else if (n >= 100000000) { + pow10 = 100000000; + return 9; + } else if (n >= 10000000) { + pow10 = 10000000; + return 8; + } else if (n >= 1000000) { + pow10 = 1000000; + return 7; + } else if (n >= 100000) { + pow10 = 100000; + return 6; + } else if (n >= 10000) { + pow10 = 10000; + return 5; + } else if (n >= 1000) { + pow10 = 1000; + return 4; + } else if (n >= 100) { + pow10 = 100; + return 3; + } else if (n >= 10) { + pow10 = 10; + return 2; + } else { + pow10 = 1; + return 1; + } +} + +inline void grisu2_round(char *buf, int len, std::uint64_t dist, + std::uint64_t delta, std::uint64_t rest, + std::uint64_t ten_k) { + + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // ten_k + // <------> + // <---- rest ----> + // --------------[------------------+----+--------------]-------------- + // w V + // = buf * 10^k + // + // ten_k represents a unit-in-the-last-place in the decimal representation + // stored in buf. + // Decrement buf by ten_k while this takes buf closer to w. + + // The tests are written in this order to avoid overflow in unsigned + // integer arithmetic. + + while (rest < dist && delta - rest >= ten_k && + (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) { + buf[len - 1]--; + rest += ten_k; + } +} + +/*! +Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+. +M- and M+ must be normalized and share the same exponent -60 <= e <= -32. +*/ +inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent, + diyfp M_minus, diyfp w, diyfp M_plus) { + static_assert(kAlpha >= -60, "internal error"); + static_assert(kGamma <= -32, "internal error"); + + // Generates the digits (and the exponent) of a decimal floating-point + // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's + // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= + // gamma. + // + // <--------------------------- delta ----> + // <---- dist ---------> + // --------------[------------------+-------------------]-------------- + // M- w M+ + // + // Grisu2 generates the digits of M+ from left to right and stops as soon as + // V is in [M-,M+]. + + std::uint64_t delta = + diyfp::sub(M_plus, M_minus) + .f; // (significand of (M+ - M-), implicit exponent is e) + std::uint64_t dist = + diyfp::sub(M_plus, w) + .f; // (significand of (M+ - w ), implicit exponent is e) + + // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0): + // + // M+ = f * 2^e + // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e + // = ((p1 ) * 2^-e + (p2 )) * 2^e + // = p1 + p2 * 2^e + + const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e); + + auto p1 = static_cast( + M_plus.f >> + -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.) 
+ std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e + + // 1) + // + // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0] + + std::uint32_t pow10; + const int k = find_largest_pow10(p1, pow10); + + // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1) + // + // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1)) + // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1)) + // + // M+ = p1 + p2 * 2^e + // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e + // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e + // = d[k-1] * 10^(k-1) + ( rest) * 2^e + // + // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0) + // + // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0] + // + // but stop as soon as + // + // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e + + int n = k; + while (n > 0) { + // Invariants: + // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k) + // pow10 = 10^(n-1) <= p1 < 10^n + // + const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1) + const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1) + // + // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e + // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e) + // + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(n-1) + (r + p2 * 2^e) + // + p1 = r; + n--; + // + // M+ = buffer * 10^n + (p1 + p2 * 2^e) + // pow10 = 10^n + // + + // Now check if enough digits have been generated. + // Compute + // + // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e + // + // Note: + // Since rest and delta share the same exponent e, it suffices to + // compare the significands. + const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2; + if (rest <= delta) { + // V = buffer * 10^n, with M- <= V <= M+. + + decimal_exponent += n; + + // We may now just stop. But instead look if the buffer could be + // decremented to bring V closer to w. + // + // pow10 = 10^n is now 1 ulp in the decimal representation V. + // The rounding procedure works with diyfp's with an implicit + // exponent of e. + // + // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e + // + const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e; + grisu2_round(buffer, length, dist, delta, rest, ten_n); + + return; + } + + pow10 /= 10; + // + // pow10 = 10^(n-1) <= p1 < 10^n + // Invariants restored. + } + + // 2) + // + // The digits of the integral part have been generated: + // + // M+ = d[k-1]...d[1]d[0] + p2 * 2^e + // = buffer + p2 * 2^e + // + // Now generate the digits of the fractional part p2 * 2^e. + // + // Note: + // No decimal point is generated: the exponent is adjusted instead. + // + // p2 actually represents the fraction + // + // p2 * 2^e + // = p2 / 2^-e + // = d[-1] / 10^1 + d[-2] / 10^2 + ... + // + // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...) + // + // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m + // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...) + // + // using + // + // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e) + // = ( d) * 2^-e + ( r) + // + // or + // 10^m * p2 * 2^e = d + r * 2^e + // + // i.e. + // + // M+ = buffer + p2 * 2^e + // = buffer + 10^-m * (d + r * 2^e) + // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e + // + // and stop as soon as 10^-m * r * 2^e <= delta * 2^e + + int m = 0; + for (;;) { + // Invariant: + // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) 
+ // * 2^e + // = buffer * 10^-m + 10^-m * (p2 ) + // * 2^e = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e = + // buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + + // (10*p2 mod 2^-e)) * 2^e + // + p2 *= 10; + const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e + const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e + // + // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e + // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e)) + // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + buffer[length++] = static_cast('0' + d); // buffer := buffer * 10 + d + // + // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e + // + p2 = r; + m++; + // + // M+ = buffer * 10^-m + 10^-m * p2 * 2^e + // Invariant restored. + + // Check if enough digits have been generated. + // + // 10^-m * p2 * 2^e <= delta * 2^e + // p2 * 2^e <= 10^m * delta * 2^e + // p2 <= 10^m * delta + delta *= 10; + dist *= 10; + if (p2 <= delta) { + break; + } + } + + // V = buffer * 10^-m, with M- <= V <= M+. + + decimal_exponent -= m; + + // 1 ulp in the decimal representation is now 10^-m. + // Since delta and dist are now scaled by 10^m, we need to do the + // same with ulp in order to keep the units in sync. + // + // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e + // + const std::uint64_t ten_m = one.f; + grisu2_round(buffer, length, dist, delta, p2, ten_m); + + // By construction this algorithm generates the shortest possible decimal + // number (Loitsch, Theorem 6.2) which rounds back to w. + // For an input number of precision p, at least + // + // N = 1 + ceil(p * log_10(2)) + // + // decimal digits are sufficient to identify all binary floating-point + // numbers (Matula, "In-and-Out conversions"). + // This implies that the algorithm does not produce more than N decimal + // digits. + // + // N = 17 for p = 53 (IEEE double precision) + // N = 9 for p = 24 (IEEE single precision) +} + +/*! +v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus, + diyfp v, diyfp m_plus) { + + // --------(-----------------------+-----------------------)-------- (A) + // m- v m+ + // + // --------------------(-----------+-----------------------)-------- (B) + // m- v m+ + // + // First scale v (and m- and m+) such that the exponent is in the range + // [alpha, gamma]. + + const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e); + + const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k + + // The exponent of the products is = v.e + c_minus_k.e + q and is in the range + // [alpha,gamma] + const diyfp w = diyfp::mul(v, c_minus_k); + const diyfp w_minus = diyfp::mul(m_minus, c_minus_k); + const diyfp w_plus = diyfp::mul(m_plus, c_minus_k); + + // ----(---+---)---------------(---+---)---------------(---+---)---- + // w- w w+ + // = c*m- = c*v = c*m+ + // + // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and + // w+ are now off by a small amount. + // In fact: + // + // w - v * 10^k < 1 ulp + // + // To account for this inaccuracy, add resp. subtract 1 ulp. + // + // --------+---[---------------(---+---)---------------]---+-------- + // w- M- w M+ w+ + // + // Now any number in [M-, M+] (bounds included) will round to w when input, + // regardless of how the input rounding algorithm breaks ties. 
+ // + // And digit_gen generates the shortest possible such number in [M-, M+]. + // Note that this does not mean that Grisu2 always generates the shortest + // possible number in the interval (m-, m+). + const diyfp M_minus(w_minus.f + 1, w_minus.e); + const diyfp M_plus(w_plus.f - 1, w_plus.e); + + decimal_exponent = -cached.k; // = -(-k) = k + + grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus); +} + +/*! +v = buf * 10^decimal_exponent +len is the length of the buffer (number of decimal digits) +The buffer must be large enough, i.e. >= max_digits10. +*/ +template +void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) { + static_assert(diyfp::kPrecision >= std::numeric_limits::digits + 3, + "internal error: not enough precision"); + + // If the neighbors (and boundaries) of 'value' are always computed for + // double-precision numbers, all float's can be recovered using strtod (and + // strtof). However, the resulting decimal representations are not exactly + // "short". + // + // The documentation for 'std::to_chars' + // (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is + // converted to a string as if by std::sprintf in the default ("C") locale" + // and since sprintf promotes float's to double's, I think this is exactly + // what 'std::to_chars' does. On the other hand, the documentation for + // 'std::to_chars' requires that "parsing the representation using the + // corresponding std::from_chars function recovers value exactly". That + // indicates that single precision floating-point numbers should be recovered + // using 'std::strtof'. + // + // NB: If the neighbors are computed for single-precision numbers, there is a + // single float + // (7.0385307e-26f) which can't be recovered using strtod. The resulting + // double precision value is off by 1 ulp. +#if 0 + const boundaries w = compute_boundaries(static_cast(value)); +#else + const boundaries w = compute_boundaries(value); +#endif + + grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus); +} + +/*! +@brief appends a decimal representation of e to buf +@return a pointer to the element following the exponent. +@pre -1000 < e < 1000 +*/ +inline char *append_exponent(char *buf, int e) { + + if (e < 0) { + e = -e; + *buf++ = '-'; + } else { + *buf++ = '+'; + } + + auto k = static_cast(e); + if (k < 10) { + // Always print at least two digits in the exponent. + // This is for compatibility with printf("%g"). + *buf++ = '0'; + *buf++ = static_cast('0' + k); + } else if (k < 100) { + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } else { + *buf++ = static_cast('0' + k / 100); + k %= 100; + *buf++ = static_cast('0' + k / 10); + k %= 10; + *buf++ = static_cast('0' + k); + } + + return buf; +} + +/*! +@brief prettify v = buf * 10^decimal_exponent +If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point +notation. Otherwise it will be printed in exponential notation. +@pre min_exp < 0 +@pre max_exp > 0 +*/ +inline char *format_buffer(char *buf, int len, int decimal_exponent, + int min_exp, int max_exp) { + + const int k = len; + const int n = len + decimal_exponent; + + // v = buf * 10^(n-k) + // k is the length of the buffer (number of decimal digits) + // n is the position of the decimal point relative to the start of the buffer. 
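+  // e.g. buf = "1234" with decimal_exponent = -2 gives k = 4, n = 2, i.e. v = 12.34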
+ + if (k <= n && n <= max_exp) { + // digits[000] + // len <= max_exp + 2 + + std::memset(buf + k, '0', static_cast(n) - static_cast(k)); + // Make it look like a floating-point number (#362, #378) + // buf[n + 0] = '.'; + // buf[n + 1] = '0'; + return buf + (static_cast(n)); + } + + if (0 < n && n <= max_exp) { + // dig.its + // len <= max_digits10 + 1 + std::memmove(buf + (static_cast(n) + 1), buf + n, + static_cast(k) - static_cast(n)); + buf[n] = '.'; + return buf + (static_cast(k) + 1U); + } + + if (min_exp < n && n <= 0) { + // 0.[000]digits + // len <= 2 + (-min_exp - 1) + max_digits10 + + std::memmove(buf + (2 + static_cast(-n)), buf, + static_cast(k)); + buf[0] = '0'; + buf[1] = '.'; + std::memset(buf + 2, '0', static_cast(-n)); + return buf + (2U + static_cast(-n) + static_cast(k)); + } + + if (k == 1) { + // dE+123 + // len <= 1 + 5 + + buf += 1; + } else { + // d.igitsE+123 + // len <= max_digits10 + 1 + 5 + + std::memmove(buf + 2, buf + 1, static_cast(k) - 1); + buf[1] = '.'; + buf += 1 + static_cast(k); + } + + *buf++ = 'e'; + return append_exponent(buf, n - 1); +} + +} // namespace dtoa_impl + +/*! +The format of the resulting decimal representation is similar to printf's %g +format. Returns an iterator pointing past-the-end of the decimal representation. +@note The input number must be finite, i.e. NaN's and Inf's are not supported. +@note The buffer must be large enough. +@note The result is NOT null-terminated. +*/ +char *to_chars(char *first, const char *last, double value) { + static_cast(last); // maybe unused - fix warning + bool negative = std::signbit(value); + if (negative) { + value = -value; + *first++ = '-'; + } + + if (value == 0) // +-0 + { + *first++ = '0'; + // Make it look like a floating-point number (#362, #378) + if(negative) { + *first++ = '.'; + *first++ = '0'; + } + return first; + } + // Compute v = buffer * 10^decimal_exponent. + // The decimal digits are stored in the buffer, which needs to be interpreted + // as an unsigned decimal integer. + // len is the length of the buffer, i.e. the number of decimal digits. + int len = 0; + int decimal_exponent = 0; + dtoa_impl::grisu2(first, len, decimal_exponent, value); + // Format the buffer like printf("%.*g", prec, value) + constexpr int kMinExp = -4; + constexpr int kMaxExp = std::numeric_limits::digits10; + + return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, + kMaxExp); +} +} // namespace internal +} // namespace simdjson +/* end file src/to_chars.cpp */ +/* begin file src/from_chars.cpp */ +#include +namespace simdjson { +namespace internal { + +/** + * The code in the internal::from_chars function is meant to handle the floating-point number parsing + * when we have more than 19 digits in the decimal mantissa. This should only be seen + * in adversarial scenarios: we do not expect production systems to even produce + * such floating-point numbers. + * + * The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/) + * who credits Ken Thompson for the design (via a reference to the Go source + * code). See + * https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c + * https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c + * It is probably not very fast but it is a fallback that should almost never be + * called in real life. Google Wuffs is published under APL 2.0. 
+ **/
+
+namespace {
+constexpr uint32_t max_digits = 768;
+constexpr int32_t decimal_point_range = 2047;
+} // namespace
+
+struct adjusted_mantissa {
+  uint64_t mantissa;
+  int power2;
+  adjusted_mantissa() : mantissa(0), power2(0) {}
+};
+
+struct decimal {
+  uint32_t num_digits;
+  int32_t decimal_point;
+  bool negative;
+  bool truncated;
+  uint8_t digits[max_digits];
+};
+
+template <typename T> struct binary_format {
+  static constexpr int mantissa_explicit_bits();
+  static constexpr int minimum_exponent();
+  static constexpr int infinite_power();
+  static constexpr int sign_index();
+};
+
+template <> constexpr int binary_format<double>::mantissa_explicit_bits() {
+  return 52;
+}
+
+template <> constexpr int binary_format<double>::minimum_exponent() {
+  return -1023;
+}
+template <> constexpr int binary_format<double>::infinite_power() {
+  return 0x7FF;
+}
+
+template <> constexpr int binary_format<double>::sign_index() { return 63; }
+
+bool is_integer(char c) noexcept { return (c >= '0' && c <= '9'); }
+
+// This should always succeed since it follows a call to parse_number.
+decimal parse_decimal(const char *&p) noexcept {
+  decimal answer;
+  answer.num_digits = 0;
+  answer.decimal_point = 0;
+  answer.truncated = false;
+  answer.negative = (*p == '-');
+  if ((*p == '-') || (*p == '+')) {
+    ++p;
+  }
+
+  while (*p == '0') {
+    ++p;
+  }
+  while (is_integer(*p)) {
+    if (answer.num_digits < max_digits) {
+      answer.digits[answer.num_digits] = uint8_t(*p - '0');
+    }
+    answer.num_digits++;
+    ++p;
+  }
+  if (*p == '.') {
+    ++p;
+    const char *first_after_period = p;
+    // if we have not yet encountered a zero, we have to skip it as well
+    if (answer.num_digits == 0) {
+      // skip zeros
+      while (*p == '0') {
+        ++p;
+      }
+    }
+    while (is_integer(*p)) {
+      if (answer.num_digits < max_digits) {
+        answer.digits[answer.num_digits] = uint8_t(*p - '0');
+      }
+      answer.num_digits++;
+      ++p;
+    }
+    answer.decimal_point = int32_t(first_after_period - p);
+  }
+  if(answer.num_digits > 0) {
+    const char *preverse = p - 1;
+    int32_t trailing_zeros = 0;
+    while ((*preverse == '0') || (*preverse == '.')) {
+      if(*preverse == '0') { trailing_zeros++; };
+      --preverse;
+    }
+    answer.decimal_point += int32_t(answer.num_digits);
+    answer.num_digits -= uint32_t(trailing_zeros);
+  }
+  if(answer.num_digits > max_digits ) {
+    answer.num_digits = max_digits;
+    answer.truncated = true;
+  }
+  if (('e' == *p) || ('E' == *p)) {
+    ++p;
+    bool neg_exp = false;
+    if ('-' == *p) {
+      neg_exp = true;
+      ++p;
+    } else if ('+' == *p) {
+      ++p;
+    }
+    int32_t exp_number = 0; // exponential part
+    while (is_integer(*p)) {
+      uint8_t digit = uint8_t(*p - '0');
+      if (exp_number < 0x10000) {
+        exp_number = 10 * exp_number + digit;
+      }
+      ++p;
+    }
+    answer.decimal_point += (neg_exp ? -exp_number : exp_number);
+  }
+  return answer;
+}
+
+// This should always succeed since it follows a call to parse_number.
+// Will not read at or beyond the "end" pointer.
+decimal parse_decimal(const char *&p, const char * end) noexcept { + decimal answer; + answer.num_digits = 0; + answer.decimal_point = 0; + answer.truncated = false; + if(p == end) { return answer; } // should never happen + answer.negative = (*p == '-'); + if ((*p == '-') || (*p == '+')) { + ++p; + } + + while ((p != end) && (*p == '0')) { + ++p; + } + while ((p != end) && is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + if ((p != end) && (*p == '.')) { + ++p; + if(p == end) { return answer; } // should never happen + const char *first_after_period = p; + // if we have not yet encountered a zero, we have to skip it as well + if (answer.num_digits == 0) { + // skip zeros + while (*p == '0') { + ++p; + } + } + while ((p != end) && is_integer(*p)) { + if (answer.num_digits < max_digits) { + answer.digits[answer.num_digits] = uint8_t(*p - '0'); + } + answer.num_digits++; + ++p; + } + answer.decimal_point = int32_t(first_after_period - p); + } + if(answer.num_digits > 0) { + const char *preverse = p - 1; + int32_t trailing_zeros = 0; + while ((*preverse == '0') || (*preverse == '.')) { + if(*preverse == '0') { trailing_zeros++; }; + --preverse; + } + answer.decimal_point += int32_t(answer.num_digits); + answer.num_digits -= uint32_t(trailing_zeros); + } + if(answer.num_digits > max_digits ) { + answer.num_digits = max_digits; + answer.truncated = true; + } + if ((p != end) && (('e' == *p) || ('E' == *p))) { + ++p; + if(p == end) { return answer; } // should never happen + bool neg_exp = false; + if ('-' == *p) { + neg_exp = true; + ++p; + } else if ('+' == *p) { + ++p; + } + int32_t exp_number = 0; // exponential part + while ((p != end) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - '0'); + if (exp_number < 0x10000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + answer.decimal_point += (neg_exp ? 
-exp_number : exp_number); + } + return answer; +} + +namespace { + +// remove all final zeroes +inline void trim(decimal &h) { + while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) { + h.num_digits--; + } +} + +uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) { + shift &= 63; + const static uint16_t number_of_digits_decimal_left_shift_table[65] = { + 0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817, + 0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067, + 0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF, + 0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0, + 0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA, + 0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC, + 0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C, + 0x051C, 0x051C, + }; + uint32_t x_a = number_of_digits_decimal_left_shift_table[shift]; + uint32_t x_b = number_of_digits_decimal_left_shift_table[shift + 1]; + uint32_t num_new_digits = x_a >> 11; + uint32_t pow5_a = 0x7FF & x_a; + uint32_t pow5_b = 0x7FF & x_b; + const static uint8_t + number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = { + 5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5, + 3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8, + 2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2, + 5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1, + 5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5, + 3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2, + 8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3, + 7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5, + 6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6, + 0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3, + 8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7, + 6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2, + 5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8, + 6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3, + 2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1, + 2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6, + 4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3, + 2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6, + 6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3, + 8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5, + 5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5, + 7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3, + 1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6, + 6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6, + 4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7, + 2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7, + 3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5, + 2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5, + 9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0, + 2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8, + 8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5, + 2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4, + 9, 7, 0, 7, 0, 3, 
1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2, + 0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5, + 4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7, + 5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9, + 2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5, + 6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9, + 4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3, + 2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8, + 9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2, + 3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1, + 3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1, + 1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3, + 1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2, + 3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1, + 0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3, + 5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1, + 3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3, + 9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3, + 9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6, + 7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3, + 6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7, + 6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9, + 4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2, + 5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9, + 6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5, + }; + const uint8_t *pow5 = + &number_of_digits_decimal_left_shift_table_powers_of_5[pow5_a]; + uint32_t i = 0; + uint32_t n = pow5_b - pow5_a; + for (; i < n; i++) { + if (i >= h.num_digits) { + return num_new_digits - 1; + } else if (h.digits[i] == pow5[i]) { + continue; + } else if (h.digits[i] < pow5[i]) { + return num_new_digits - 1; + } else { + return num_new_digits; + } + } + return num_new_digits; +} + +} // end of anonymous namespace + +uint64_t round(decimal &h) { + if ((h.num_digits == 0) || (h.decimal_point < 0)) { + return 0; + } else if (h.decimal_point > 18) { + return UINT64_MAX; + } + // at this point, we know that h.decimal_point >= 0 + uint32_t dp = uint32_t(h.decimal_point); + uint64_t n = 0; + for (uint32_t i = 0; i < dp; i++) { + n = (10 * n) + ((i < h.num_digits) ? h.digits[i] : 0); + } + bool round_up = false; + if (dp < h.num_digits) { + round_up = h.digits[dp] >= 5; // normally, we round up + // but we may need to round to even! 
+    if ((h.digits[dp] == 5) && (dp + 1 == h.num_digits)) {
+      round_up = h.truncated || ((dp > 0) && (1 & h.digits[dp - 1]));
+    }
+  }
+  if (round_up) {
+    n++;
+  }
+  return n;
+}
+
+// computes h * 2^-shift
+void decimal_left_shift(decimal &h, uint32_t shift) {
+  if (h.num_digits == 0) {
+    return;
+  }
+  uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift);
+  int32_t read_index = int32_t(h.num_digits - 1);
+  uint32_t write_index = h.num_digits - 1 + num_new_digits;
+  uint64_t n = 0;
+
+  while (read_index >= 0) {
+    n += uint64_t(h.digits[read_index]) << shift;
+    uint64_t quotient = n / 10;
+    uint64_t remainder = n - (10 * quotient);
+    if (write_index < max_digits) {
+      h.digits[write_index] = uint8_t(remainder);
+    } else if (remainder > 0) {
+      h.truncated = true;
+    }
+    n = quotient;
+    write_index--;
+    read_index--;
+  }
+  while (n > 0) {
+    uint64_t quotient = n / 10;
+    uint64_t remainder = n - (10 * quotient);
+    if (write_index < max_digits) {
+      h.digits[write_index] = uint8_t(remainder);
+    } else if (remainder > 0) {
+      h.truncated = true;
+    }
+    n = quotient;
+    write_index--;
+  }
+  h.num_digits += num_new_digits;
+  if (h.num_digits > max_digits) {
+    h.num_digits = max_digits;
+  }
+  h.decimal_point += int32_t(num_new_digits);
+  trim(h);
+}
+
+// computes h * 2^shift
+void decimal_right_shift(decimal &h, uint32_t shift) {
+  uint32_t read_index = 0;
+  uint32_t write_index = 0;
+
+  uint64_t n = 0;
+
+  while ((n >> shift) == 0) {
+    if (read_index < h.num_digits) {
+      n = (10 * n) + h.digits[read_index++];
+    } else if (n == 0) {
+      return;
+    } else {
+      while ((n >> shift) == 0) {
+        n = 10 * n;
+        read_index++;
+      }
+      break;
+    }
+  }
+  h.decimal_point -= int32_t(read_index - 1);
+  if (h.decimal_point < -decimal_point_range) { // it is zero
+    h.num_digits = 0;
+    h.decimal_point = 0;
+    h.negative = false;
+    h.truncated = false;
+    return;
+  }
+  uint64_t mask = (uint64_t(1) << shift) - 1;
+  while (read_index < h.num_digits) {
+    uint8_t new_digit = uint8_t(n >> shift);
+    n = (10 * (n & mask)) + h.digits[read_index++];
+    h.digits[write_index++] = new_digit;
+  }
+  while (n > 0) {
+    uint8_t new_digit = uint8_t(n >> shift);
+    n = 10 * (n & mask);
+    if (write_index < max_digits) {
+      h.digits[write_index++] = new_digit;
+    } else if (new_digit > 0) {
+      h.truncated = true;
+    }
+  }
+  h.num_digits = write_index;
+  trim(h);
+}
+
+template <typename binary> adjusted_mantissa compute_float(decimal &d) {
+  adjusted_mantissa answer;
+  if (d.num_digits == 0) {
+    // should be zero
+    answer.power2 = 0;
+    answer.mantissa = 0;
+    return answer;
+  }
+  // At this point, going further, we can assume that d.num_digits > 0.
+  // We want to guard against excessive decimal point values because
+  // they can result in long running times. Indeed, we do
+  // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22
+  // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not
+  // fine (runs for a long time).
+  //
+  if(d.decimal_point < -324) {
+    // We have something smaller than 1e-324 which is always zero
+    // in binary64 and binary32.
+    // It should be zero.
+    answer.power2 = 0;
+    answer.mantissa = 0;
+    return answer;
+  } else if(d.decimal_point >= 310) {
+    // We have something at least as large as 0.1e310 which is
+    // always infinite.
+    answer.power2 = binary::infinite_power();
+    answer.mantissa = 0;
+    return answer;
+  }
+
+  static const uint32_t max_shift = 60;
+  static const uint32_t num_powers = 19;
+  static const uint8_t powers[19] = {
+      0,  3,  6,  9,  13, 16, 19, 23, 26, 29, //
+      33, 36, 39, 43, 46, 49, 53, 56, 59,     //
+  };
+  int32_t exp2 = 0;
+  while (d.decimal_point > 0) {
+    uint32_t n = uint32_t(d.decimal_point);
+    uint32_t shift = (n < num_powers) ? powers[n] : max_shift;
+    decimal_right_shift(d, shift);
+    if (d.decimal_point < -decimal_point_range) {
+      // should be zero
+      answer.power2 = 0;
+      answer.mantissa = 0;
+      return answer;
+    }
+    exp2 += int32_t(shift);
+  }
+  // We shift left toward [1/2 ... 1].
+  while (d.decimal_point <= 0) {
+    uint32_t shift;
+    if (d.decimal_point == 0) {
+      if (d.digits[0] >= 5) {
+        break;
+      }
+      shift = (d.digits[0] < 2) ? 2 : 1;
+    } else {
+      uint32_t n = uint32_t(-d.decimal_point);
+      shift = (n < num_powers) ? powers[n] : max_shift;
+    }
+    decimal_left_shift(d, shift);
+    if (d.decimal_point > decimal_point_range) {
+      // we want to get infinity:
+      answer.power2 = 0xFF;
+      answer.mantissa = 0;
+      return answer;
+    }
+    exp2 -= int32_t(shift);
+  }
+  // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2].
+  exp2--;
+  constexpr int32_t minimum_exponent = binary::minimum_exponent();
+  while ((minimum_exponent + 1) > exp2) {
+    uint32_t n = uint32_t((minimum_exponent + 1) - exp2);
+    if (n > max_shift) {
+      n = max_shift;
+    }
+    decimal_right_shift(d, n);
+    exp2 += int32_t(n);
+  }
+  if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
+    answer.power2 = binary::infinite_power();
+    answer.mantissa = 0;
+    return answer;
+  }
+
+  const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1;
+  decimal_left_shift(d, mantissa_size_in_bits);
+
+  uint64_t mantissa = round(d);
+  // It is possible that we have an overflow, in which case we need
+  // to shift back.
+  if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) {
+    decimal_right_shift(d, 1);
+    exp2 += 1;
+    mantissa = round(d);
+    if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
+      answer.power2 = binary::infinite_power();
+      answer.mantissa = 0;
+      return answer;
+    }
+  }
+  answer.power2 = exp2 - binary::minimum_exponent();
+  if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) {
+    answer.power2--;
+  }
+  answer.mantissa =
+      mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1);
+  return answer;
+}
+
+template <typename binary>
+adjusted_mantissa parse_long_mantissa(const char *first) {
+  decimal d = parse_decimal(first);
+  return compute_float<binary>(d);
+}
+
+template <typename binary>
+adjusted_mantissa parse_long_mantissa(const char *first, const char *end) {
+  decimal d = parse_decimal(first, end);
+  return compute_float<binary>(d);
+}
+
+double from_chars(const char *first) noexcept {
+  bool negative = first[0] == '-';
+  if (negative) {
+    first++;
+  }
+  adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first);
+  uint64_t word = am.mantissa;
+  word |= uint64_t(am.power2)
+          << binary_format<double>::mantissa_explicit_bits();
+  word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
+                  : word;
+  double value;
+  std::memcpy(&value, &word, sizeof(double));
+  return value;
+}
+
+
+double from_chars(const char *first, const char *end) noexcept {
+  bool negative = first[0] == '-';
+  if (negative) {
+    first++;
+  }
+  adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first, end);
+  uint64_t word = am.mantissa;
+  word |= uint64_t(am.power2)
+          << binary_format<double>::mantissa_explicit_bits();
+  word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
+                  : word;
+  double value;
+  std::memcpy(&value, &word, sizeof(double));
+  return value;
+}
+
+} // internal
+} // simdjson
+/* end file src/from_chars.cpp */
+/* begin file src/internal/error_tables.cpp */
+
+namespace simdjson {
+namespace internal {
+
+  SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] {
+    { SUCCESS, "No error" },
+    { CAPACITY, "This parser can't support a document that big" },
+    { MEMALLOC, "Error allocating memory, we're most likely out of memory" },
+    { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." },
+    { DEPTH_ERROR, "The JSON document was too deep (too many nested objects and arrays)" },
+    { STRING_ERROR, "Problem while parsing a string" },
+    { T_ATOM_ERROR, "Problem while parsing an atom starting with the letter 't'" },
+    { F_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'f'" },
+    { N_ATOM_ERROR, "Problem while parsing an atom starting with the letter 'n'" },
+    { NUMBER_ERROR, "Problem while parsing a number" },
+    { UTF8_ERROR, "The input is not valid UTF-8" },
+    { UNINITIALIZED, "Uninitialized" },
+    { EMPTY, "Empty: no JSON found" },
+    { UNESCAPED_CHARS, "Within strings, some characters must be escaped, we found unescaped characters" },
+    { UNCLOSED_STRING, "A string is opened, but never closed." },
+    { UNSUPPORTED_ARCHITECTURE, "simdjson does not have an implementation supported by this CPU architecture (perhaps it's a non-SIMD CPU?)." },
+    { INCORRECT_TYPE, "The JSON element does not have the requested type." },
+    { NUMBER_OUT_OF_RANGE, "The JSON number is too large or too small to fit within the requested type." },
+    { INDEX_OUT_OF_BOUNDS, "Attempted to access an element of a JSON array that is beyond its length." },
+    { NO_SUCH_FIELD, "The JSON field referenced does not exist in this object." },
+    { IO_ERROR, "Error reading the file." },
+    { INVALID_JSON_POINTER, "Invalid JSON pointer syntax." },
+    { INVALID_URI_FRAGMENT, "Invalid URI fragment syntax." },
+    { UNEXPECTED_ERROR, "Unexpected error, consider reporting this problem as you may have found a bug in simdjson" },
+    { PARSER_IN_USE, "Cannot parse a new document while a document is still in use." },
+    { OUT_OF_ORDER_ITERATION, "Objects and arrays can only be iterated when they are first encountered." },
+    { INSUFFICIENT_PADDING, "simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length. Consider using the simdjson::padded_string class if needed." },
+    { INCOMPLETE_ARRAY_OR_OBJECT, "JSON document ended early in the middle of an object or array." },
+    { SCALAR_DOCUMENT_AS_VALUE, "A JSON document made of a scalar (number, Boolean, null or string) is treated as a value. Use get_bool(), get_double(), etc. on the document instead. 
"}, + { OUT_OF_BOUNDS, "Attempted to access location outside of document."}, + { TRAILING_CONTENT, "Unexpected trailing content in the JSON input."} + }; // error_messages[] + +} // namespace internal +} // namespace simdjson +/* end file src/internal/error_tables.cpp */ +/* begin file src/internal/jsoncharutils_tables.cpp */ + +namespace simdjson { +namespace internal { + +// structural chars here are +// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) +// we are also interested in the four whitespace characters +// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d + +SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886] = { + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, + 0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa, + 0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe, + 0xf, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, + 0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0, + 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, + 0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x100, 0x200, 0x300, 0x400, 0x500, + 0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00, + 0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00, + 0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000, + 0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000, + 0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000, + 0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; + +} // namespace internal +} // namespace simdjson +/* end file src/internal/jsoncharutils_tables.cpp */ +/* begin file src/internal/numberparsing_tables.cpp */ + +namespace simdjson { +namespace internal { + +// Precomputed powers of ten from 10^0 to 10^22. These +// can be represented exactly using the double type. +SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + + +// The truncated powers of five from 5^-342 all the way to 5^308 +// The mantissa is truncated to 128 bits, and +// never rounded up. Uses about 10KB. 
+SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[]= { + 0xeef453d6923bd65a,0x113faa2906a13b3f, + 0x9558b4661b6565f8,0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, + 0xe95a99df8ace6f53,0xf4d82c2c107973dc, + 0x91d8a02bb6c10594,0x79071b9b8a4be869, + 0xb64ec836a47146f9,0x9748e2826cdee284, + 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f,0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723,0xad2c788035e61382, + 0x8b16fb203055ac76,0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78,0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b,0x8672648c40e5ad68, + 0xa9c98d8ccb009506,0x680efdaf511f18c2, + 0xd43bf0effdc0ba48,0x212bd1b2566def2, + 0x84a57695fe98746d,0x14bb630f7604b57, + 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, + 0xcf42894a5dce35ea,0x52064cac828675b9, + 0x818995ce7aa0e1b2,0x7343efebd1940993, + 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6,0xd41a26e077774ef6, + 0xfd00b897478238d0,0x8920b098955522b4, + 0x9e20735e8cb16382,0x55b46e5f5d5535b0, + 0xc5a890362fddbc62,0xeb2189f734aa831d, + 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d,0x47b233c92125366e, + 0xc1069cd4eabe89f8,0x999ec0bb696e840a, + 0xf148440a256e2c76,0xc00670ea43ca250d, + 0x96cd2a865764dbca,0x380406926a5e5728, + 0xbc807527ed3e12bc,0xc605083704f5ecf2, + 0xeba09271e88d976b,0xf7864a44c633682e, + 0x93445b8731587ea3,0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c,0x5960ea05bad82964, + 0xe61acf033d1a45df,0x6fb92487298e33bd, + 0x8fd0c16206306bab,0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696,0x8f48a4899877186c, + 0xe0b62e2929aba83c,0x331acdabfe94de87, + 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, + 0x892731ac9faf056e,0xbe311c083a225cd2, + 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, + 0xd64d3d9db981787d,0x92cbbccdad5b108, + 0x85f0468293f0eb4e,0x25bbf56008c58ea5, + 0xa76c582338ed2621,0xaf2af2b80af6f24e, + 0xd1476e2c07286faa,0x1af5af660db4aee1, + 0x82cca4db847945ca,0x50d98d9fc890ed4d, + 0xa37fce126597973c,0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1,0x77b191618c54e9ac, + 0xc795830d75038c1d,0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, + 0x9becce62836ac577,0x4ee367f9430aec32, + 0xc2e801fb244576d5,0x229c41f793cda73f, + 0xf3a20279ed56d48a,0x6b43527578c1110f, + 0x9845418c345644d6,0x830a13896b78aaa9, + 0xbe5691ef416bd60c,0x23cc986bc656d553, + 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, + 0x91376c36d99995be,0x23100809b9c21fa1, + 0xb58547448ffffb2d,0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9,0x16c90c8f323f516c, + 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, + 0xb1442798f49ffb4a,0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d,0x40405643d711d583, + 0x8a7d3eef7f1cfc52,0x482835ea666b2572, + 0xad1c8eab5ee43b66,0xda3243650005eecf, + 0xd863b256369d4a40,0x90bed43e40076a82, + 0x873e4f75e2224e68,0x5a7744a6e804a291, + 0xa90de3535aaae202,0x711515d0a205cb36, + 0xd3515c2831559a83,0xd5a5b44ca873e03, + 0x8412d9991ed58091,0xe858790afe9486c2, + 0xa5178fff668ae0b6,0x626e974dbe39a872, + 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, + 0xa139029f6a239f72,0x1c1fffc1ebc44e80, + 0xc987434744ac874e,0xa327ffb266b56220, + 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, + 0xc4ce17b399107c22,0xcb550fb4384d21d3, + 0xf6019da07f549b2b,0x7e2a53a146606a48, + 
0x99c102844f94e0fb,0x2eda7444cbfc426d, + 0xc0314325637a1939,0xfa911155fefb5308, + 0xf03d93eebc589f88,0x793555ab7eba27ca, + 0x96267c7535b763b5,0x4bc1558b2f3458de, + 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb,0x465e15a979c1cadc, + 0x92a1958a7675175f,0xbfacd89ec191ec9, + 0xb749faed14125d36,0xcef980ec671f667b, + 0xe51c79a85916f484,0x82b7e12780e7401a, + 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9,0x67a791e093e1d49a, + 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d,0x58fae9f773886e18, + 0xda7f5bf590966848,0xaf39a475506a899e, + 0x888f99797a5e012d,0x6d8406c952429603, + 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26,0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, + 0xd0601d8efc57b08b,0xf13b94daf124da26, + 0x823c12795db6ce57,0x76c53d08d6b70858, + 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02,0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a,0x359ab6419ca1091b, + 0xf867241c8cc6d4c0,0xc30163d203c94b62, + 0x9b407691d7fc44f8,0x79e0de63425dcf1d, + 0xc21094364dfb5636,0x985915fc12f542e4, + 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, + 0xbd8430bd08277231,0x50c6ff782a838353, + 0xece53cec4a314ebd,0xa4f8bf5635246428, + 0x940f4613ae5ed136,0x871b7795e136be99, + 0xb913179899f68584,0x28e2557b59846e3f, + 0xe757dd7ec07426e5,0x331aeada2fe589cf, + 0x9096ea6f3848984f,0x3ff0d2c85def7621, + 0xb4bca50b065abe63,0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, + 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, + 0xb080392cc4349dec,0xbd8d794d96aacfb3, + 0xdca04777f541c567,0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60,0xf41686c49db57244, + 0xac5d37d5b79b6239,0x311c2875c522ced5, + 0xd77485cb25823ac7,0x7d633293366b828b, + 0x86a8d39ef77164bc,0xae5dff9c02033197, + 0xa8530886b54dbdeb,0xd9f57f830283fdfc, + 0xd267caa862a12d66,0xd072df63c324fd7b, + 0x8380dea93da4bc60,0x4247cb9e59f71e6d, + 0xa46116538d0deb78,0x52d9be85f074e608, + 0xcd795be870516656,0x67902e276c921f8b, + 0x806bd9714632dff6,0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c,0x796b805720085f81, + 0x9cc3a6eec6311a63,0xcbe3303674053bb0, + 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b,0xee92fb5515482d44, + 0x991711052d8bf3c5,0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6,0xd262d45a78a0635d, + 0xef340a98172aace4,0x86fb897116c87c34, + 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, + 0xbae0a846d2195712,0x8974836059cca109, + 0xe998d258869facd7,0x2bd1a438703fc94b, + 0x91ff83775423cc06,0x7b6306a34627ddcf, + 0xb67f6455292cbf08,0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, + 0x8e938662882af53e,0x547eb47b7282ee9c, + 0xb23867fb2a35b28d,0xe99e619a4f23aa43, + 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, + 0xae0b158b4738705e,0x9624ab50b148d445, + 0xd98ddaee19068c76,0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b,0x7647c3200069671f, + 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, + 0xa5fb0a17c777cf09,0xf468107100525890, + 0xcf79cc9db955c2cc,0x7182148d4066eeb4, + 0x81ac1fe293d599bf,0xc6f14cd848405530, + 0xa21727db38cb002f,0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, + 0xfd442e4688bd304a,0x908f4a166d1da663, + 0x9e4a9cec15763e2e,0x9a598e4e043287fe, + 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, + 
0xf7549530e188c128,0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, + 0xc13a148e3032d6e7,0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, + 0xebdf661791d60f56,0x111b495b3464ad21, + 0x936b9fcebb25c995,0xcab10dd900beec34, + 0xb84687c269ef3bfb,0x3d5d514f40eea742, + 0xe65829b3046b0afa,0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, + 0xb3f4e093db73a093,0x59ed216765690f56, + 0xe0f218b8d25088b8,0x306869c13ec3532c, + 0x8c974f7383725573,0x1e414218c73a13fb, + 0xafbd2350644eeacf,0xe5d1929ef90898fa, + 0xdbac6c247d62a583,0xdf45f746b74abf39, + 0x894bc396ce5da772,0x6b8bba8c328eb783, + 0xab9eb47c81f5114f,0x66ea92f3f326564, + 0xd686619ba27255a2,0xc80a537b0efefebd, + 0x8613fd0145877585,0xbd06742ce95f5f36, + 0xa798fc4196e952e7,0x2c48113823b73704, + 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, + 0x82ef85133de648c4,0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3,0x318df905079926a8, + 0xffbbcfe994e5c61f,0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d,0x6bea10ca65c084e, + 0xc31bfa0fe5698db8,0x486e494fcff30a62, + 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7,0xf89629465a75e01c, + 0xbe89523386091465,0xf6bbb397f1135823, + 0xee2ba6c0678b597f,0x746aa07ded582e2c, + 0x94db483840b717ef,0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb,0x92f34d62616ce413, + 0xe896a0d7e51e1566,0x77b020baf9c81d17, + 0x915e2486ef32cd60,0xace1474dc1d122e, + 0xb5b5ada8aaff80b8,0xd819992132456ba, + 0xe3231912d5bf60e6,0x10e1fff697ed6c69, + 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d,0x86c16c98d2c953c6, + 0xd89d64d57a607744,0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b,0x11471cd764ad4972, + 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, + 0xd389b47879823479,0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb,0xcedf722a585139ba, + 0xa54394fe1eedb8fe,0xc2974eb4ee658828, + 0xce947a3da6a9273e,0x733d226229feea32, + 0x811ccc668829b887,0x806357d5a3f525f, + 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052,0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67,0xbbac2078d443ace2, + 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, + 0xc5029163f384a931,0xa9e795e65d4df11, + 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e,0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, + 0xf07da27a82c37088,0x5d767327bb4e5a4c, + 0x964e858c91ba2655,0x3a6a07f8d510f86f, + 0xbbe226efb628afea,0x890489f70a55368b, + 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb,0x9ce6ebb40173744, + 0xe55990879ddcaabd,0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6,0x9fa946824a12232d, + 0xb32df8e9f3546564,0x47939822dc96abf9, + 0xdff9772470297ebd,0x59787e2b93bc56f7, + 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, + 0xaefae51477a06b03,0xede622920b6b23f1, + 0xdab99e59958885c4,0xe95fab368e45eced, + 0x88b402f7fd75539b,0x11dbcb0218ebb414, + 0xaae103b5fcd2a881,0xd652bdc29f26a119, + 0xd59944a37c0752a2,0x4be76d3346f0495f, + 0x857fcae62d8493a5,0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2,0x7e2000a41346a7a7, + 0x825ecc24c873782f,0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b,0x728900802f0f32fa, + 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc,0xe2f610c84987bfa8, + 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, + 
0xc6ede63fa05d3143,0x91503d1c79720dbb, + 0xf8a95fcf88747d94,0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, + 0xc24452da229b021b,0xfbe85badce996168, + 0xf2d56790ab41c2a2,0xfae27299423fb9c3, + 0x97c560ba6b0919a5,0xdccd879fc967d41a, + 0xbdb6b8e905cb600f,0x5400e987bbc1c920, + 0xed246723473e3813,0x290123e9aab23b68, + 0x9436c0760c86e30b,0xf9a0b6720aaf6521, + 0xb94470938fa89bce,0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2,0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232,0x25c6da63c38de1b0, + 0x8d590723948a535f,0x579c487e5a38ad0e, + 0xb0af48ec79ace837,0x2d835a9df0c6d851, + 0xdcdb1b2798182244,0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5,0xe272467e3d222f3f, + 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea,0x98e947129fc2b4e9, + 0xa87fea27a539e9a5,0x3f2398d747b36224, + 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89,0x1953cf68300424ac, + 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, + 0xcdb02555653131b6,0x3792f412cb06794d, + 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b,0xf245825a5a445275, + 0xfb158592be068d2e,0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d,0x55464dd69685606b, + 0xc428d05aa4751e4c,0xaa97e14c3c26b886, + 0xf53304714d9265df,0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab,0xe546a8038efe4029, + 0xbf8fdb78849a5f96,0xde98520472bdd033, + 0xef73d256a5c0f77c,0x963e66858f6d4440, + 0x95a8637627989aad,0xdde7001379a44aa8, + 0xbb127c53b17ec159,0x5560c018580d5d52, + 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, + 0x9226712162ab070d,0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, + 0xb267ed1940f1c61c,0x55f038b237591ed3, + 0xdf01e85f912e37a3,0x6b6c46dec52f6688, + 0x8b61313bbabce2c6,0x2323ac4b3b3da015, + 0xae397d8aa96c1b77,0xabec975e0a0d081a, + 0xd9c7dced53c72255,0x96e7bd358c904a21, + 0x881cea14545c7575,0x7e50d64177da2e54, + 0xaa242499697392d2,0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, + 0x84ec3c97da624ab4,0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba,0x67de18eda5814af2, + 0x81ceb32c4b43fcf4,0x80eacf948770ced7, + 0xa2425ff75e14fc31,0xa1258379a94d028d, + 0xcad2f7f5359a3b3e,0x96ee45813a04330, + 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, + 0x9e74d1b791e07e48,0x775ea264cf55347e, + 0xc612062576589dda,0x95364afe032a81a0, + 0xf79687aed3eec551,0x3a83ddbd83f52210, + 0x9abe14cd44753b52,0xc4926a9672793580, + 0xc16d9a0095928a27,0x75b7053c0f178400, + 0xf1c90080baf72cb1,0x5324c68b12dd6800, + 0x971da05074da7bee,0xd3f6fc16ebca8000, + 0xbce5086492111aea,0x88f4bb1ca6bd0000, + 0xec1e4a7db69561a5,0x2b31e9e3d0700000, + 0x9392ee8e921d5d07,0x3aff322e62600000, + 0xb877aa3236a4b449,0x9befeb9fad487c3, + 0xe69594bec44de15b,0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9,0xf9d37014bf60a11, + 0xb424dc35095cd80f,0x538484c19ef38c95, + 0xe12e13424bb40e13,0x2865a5f206b06fba, + 0x8cbccc096f5088cb,0xf93f87b7442e45d4, + 0xafebff0bcb24aafe,0xf78f69a51539d749, + 0xdbe6fecebdedd5be,0xb573440e5a884d1c, + 0x89705f4136b4a597,0x31680a88f8953031, + 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc,0x3d32907604691b4d, + 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, + 0xa7c5ac471b478423,0xfcf80dc33721d54, + 0xd1b71758e219652b,0xd3c36113404ea4a9, + 0x83126e978d4fdf3b,0x645a1cac083126ea, + 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, + 0xcccccccccccccccc,0xcccccccccccccccd, + 0x8000000000000000,0x0, + 0xa000000000000000,0x0, + 
0xc800000000000000,0x0, + 0xfa00000000000000,0x0, + 0x9c40000000000000,0x0, + 0xc350000000000000,0x0, + 0xf424000000000000,0x0, + 0x9896800000000000,0x0, + 0xbebc200000000000,0x0, + 0xee6b280000000000,0x0, + 0x9502f90000000000,0x0, + 0xba43b74000000000,0x0, + 0xe8d4a51000000000,0x0, + 0x9184e72a00000000,0x0, + 0xb5e620f480000000,0x0, + 0xe35fa931a0000000,0x0, + 0x8e1bc9bf04000000,0x0, + 0xb1a2bc2ec5000000,0x0, + 0xde0b6b3a76400000,0x0, + 0x8ac7230489e80000,0x0, + 0xad78ebc5ac620000,0x0, + 0xd8d726b7177a8000,0x0, + 0x878678326eac9000,0x0, + 0xa968163f0a57b400,0x0, + 0xd3c21bcecceda100,0x0, + 0x84595161401484a0,0x0, + 0xa56fa5b99019a5c8,0x0, + 0xcecb8f27f4200f3a,0x0, + 0x813f3978f8940984,0x4000000000000000, + 0xa18f07d736b90be5,0x5000000000000000, + 0xc9f2c9cd04674ede,0xa400000000000000, + 0xfc6f7c4045812296,0x4d00000000000000, + 0x9dc5ada82b70b59d,0xf020000000000000, + 0xc5371912364ce305,0x6c28000000000000, + 0xf684df56c3e01bc6,0xc732000000000000, + 0x9a130b963a6c115c,0x3c7f400000000000, + 0xc097ce7bc90715b3,0x4b9f100000000000, + 0xf0bdc21abb48db20,0x1e86d40000000000, + 0x96769950b50d88f4,0x1314448000000000, + 0xbc143fa4e250eb31,0x17d955a000000000, + 0xeb194f8e1ae525fd,0x5dcfab0800000000, + 0x92efd1b8d0cf37be,0x5aa1cae500000000, + 0xb7abc627050305ad,0xf14a3d9e40000000, + 0xe596b7b0c643c719,0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f,0xe4820023a2000000, + 0xb35dbf821ae4f38b,0xdda2802c8a800000, + 0xe0352f62a19e306e,0xd50b2037ad200000, + 0x8c213d9da502de45,0x4526f422cc340000, + 0xaf298d050e4395d6,0x9670b12b7f410000, + 0xdaf3f04651d47b4c,0x3c0cdd765f114000, + 0x88d8762bf324cd0f,0xa5880a69fb6ac800, + 0xab0e93b6efee0053,0x8eea0d047a457a00, + 0xd5d238a4abe98068,0x72a4904598d6d880, + 0x85a36366eb71f041,0x47a6da2b7f864750, + 0xa70c3c40a64e6c51,0x999090b65f67d924, + 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, + 0x82818f1281ed449f,0xbff8f10e7a8921a4, + 0xa321f2d7226895c7,0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, + 0xfee50b7025c36a08,0x2f236d04753d5b4, + 0x9f4f2726179a2245,0x1d762422c946590, + 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, + 0x9b934c3b330c8577,0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a,0x8bef464e3945ef7a, + 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, + 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436,0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44,0x60dbbca87196b616, + 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, + 0xb51d13aea4a488dd,0x6babab6398bdbe41, + 0xe264589a4dcdab14,0xc696963c7eed2dd1, + 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8,0x3b25a55f43294bcb, + 0xdd15fe86affad912,0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab,0x6e3569326c784337, + 0xacb92ed9397bf996,0x49c2c37f07965404, + 0xd7e77a8f87daf7fb,0xdc33745ec97be906, + 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, + 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b,0xf50a3fa490c30190, + 0x83c7088e1aab65db,0x792667c6da79e0fa, + 0xa4b8cab1a1563f52,0x577001b891185938, + 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, + 0x80b05e5ac60b6178,0x544f8158315b05b4, + 0xa0dc75f1778e39d6,0x696361ae3db1c721, + 0xc913936dd571c84c,0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f,0x4ab48a04065c723, + 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, + 0xc45d1df942711d9a,0x3ba5d0bd324f8394, + 0xf5746577930d6500,0xca8f44ec7ee36479, + 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5,0xbba1f1d158724a12, + 
0xbb445da9ca61281f,0x2a8a6e45ae8edc97, + 0xea1575143cf97226,0xf52d09d71a3293bd, + 0x924d692ca61be758,0x593c2626705f9c56, + 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, + 0xe498f455c38b997a,0xb6dfb9c0f956447, + 0x8edf98b59a373fec,0x4724bd4189bd5eac, + 0xb2977ee300c50fe7,0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, + 0x8b865b215899f46c,0xbd79e0d20082ee74, + 0xae67f1e9aec07187,0xecd8590680a3aa11, + 0xda01ee641a708de9,0xe80e6f4820cc9495, + 0x884134fe908658b2,0x3109058d147fdcdd, + 0xaa51823e34a7eede,0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, + 0x850fadc09923329e,0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45,0x84db8346b786151c, + 0xcfe87f7cef46ff16,0xe612641865679a63, + 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749,0xe3be5e330f38f09d, + 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, + 0xc646d63501a1511d,0xb281e1fd541501b8, + 0xf7d88bc24209a565,0x1f225a7ca91a4226, + 0x9ae757596946075f,0x3375788de9b06958, + 0xc1a12d2fc3978937,0x52d6b1641c83ae, + 0xf209787bb47d6b84,0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332,0xf840b7ba963646e0, + 0xbd176620a501fbff,0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf,0xc66f336c36b10137, + 0xb8a8d9bbe123f017,0xb80b0047445d4184, + 0xe6d3102ad96cec1d,0xa60dc059157491e5, + 0x9043ea1ac7e41392,0x87c89837ad68db2f, + 0xb454e4a179dd1877,0x29babe4598c311fb, + 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d,0x1899e4a65f58660c, + 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d,0x76707543f4fa1f73, + 0x899504ae72497eba,0x6a06494a791c53a8, + 0xabfa45da0edbde69,0x487db9d17636892, + 0xd6f8d7509292d603,0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, + 0xa7f26836f282b732,0x8e6cac7768d7141e, + 0xd1ef0244af2364ff,0x3207d795430cd926, + 0x8335616aed761f1f,0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, + 0xcd036837130890a1,0x36dba887c37a8c0f, + 0x802221226be55a64,0xc2494954da2c9789, + 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d,0x6f92829494e5acc7, + 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, + 0x9c69a97284b578d7,0xff2a760414536efb, + 0xc38413cf25e2d70d,0xfef5138519684aba, + 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, + 0x98bf2f79d5993802,0xef2f773ffbd97a61, + 0xbeeefb584aff8603,0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2,0xdd945a747bf26183, + 0xba756174393d88df,0x94f971119aeef9e4, + 0xe912b9d1478ceb17,0x7a37cd5601aab85d, + 0x91abb422ccb812ee,0xac62e055c10ab33a, + 0xb616a12b7fe617aa,0x577b986b314d6009, + 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d,0x14588f13be847307, + 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee,0x25de7bb9480d5854, + 0xada72ccc20054ae9,0xaf561aa79a10ae6a, + 0xd910f7ff28069da4,0x1b2ba1518094da04, + 0x87aa9aff79042286,0x90fb44d2f05d0842, + 0xa99541bf57452b28,0x353a1607ac744a53, + 0xd3fa922f2d1675f2,0x42889b8997915ce8, + 0x847c9b5d7c2e09b7,0x69956135febada11, + 0xa59bc234db398c25,0x43fab9837e699095, + 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, + 0x8161afb94b44f57d,0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc,0x6462d92a69731732, + 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78,0x5cda735244c3d43e, + 0x9defbf01b061adab,0x3a0888136afa64a7, + 0xc56baec21c7a1916,0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b,0x8aad549e57273d45, + 0x9a3c2087a63f6399,0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, + 
0x969eb7c47859e743,0x9f644ae5a4b1b325, + 0xbc4665b596706114,0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8,0x9a7f12442d588f2, + 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, + 0x8fa475791a569d10,0xf96e017d694487bc, + 0xb38d92d760ec4455,0x37c981dcc395a9ac, + 0xe070f78d3927556a,0x85bbe253f47b1417, + 0x8c469ab843b89562,0x93956d7478ccec8e, + 0xaf58416654a6babb,0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, + 0x88fcf317f22241e2,0x441fece3bdf81f03, + 0xab3c2fddeeaad25a,0xd527e81cad7626c3, + 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, + 0x85c7056562757456,0xf6872d5667844e49, + 0xa738c6bebb12d16c,0xb428f8ac016561db, + 0xd106f86e69d785c7,0xe13336d701beba52, + 0x82a45b450226b39c,0xecc0024661173473, + 0xa34d721642b06084,0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, + 0xff290242c83396ce,0x7e67047175a15271, + 0x9f79a169bd203e41,0xf0062c6e984d386, + 0xc75809c42c684dd1,0x52c07b78a3e60868, + 0xf92e0c3537826145,0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb,0x88a66076400bb691, + 0xc2abf989935ddbfe,0x6acff893d00ea435, + 0xf356f7ebf83552fe,0x583f6b8c4124d43, + 0x98165af37b2153de,0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, + 0xeda2ee1c7064130c,0x1162def06f79df73, + 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, + 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0,0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, + 0x8da471a9de737e24,0x5ceaecfed289e5d2, + 0xb10d8e1456105dad,0x7425a83e872c5f47, + 0xdd50f1996b947518,0xd12f124e28f77719, + 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b,0x636cc64d1001550b, + 0xd8210befd30efa5a,0x3c47f7e05401aa4e, + 0x8714a775e3e95c78,0x65acfaec34810a71, + 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, + 0xd31045a8341ca07c,0x1ede48111209a050, + 0x83ea2b892091e44d,0x934aed0aab460432, + 0xa4e4b66b68b65d60,0xf81da84d5617853f, + 0xce1de40642e3f4b9,0x36251260ab9d668e, + 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, + 0xa1075a24e4421730,0xb24cf65b8612f81f, + 0xc94930ae1d529cfc,0xdee033f26797b627, + 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, + 0x9d412e0806e88aa5,0x8e1f289560ee864e, + 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2,0xae10af696774b1db, + 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f,0x17fd090a58d32af3, + 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, + 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513,0x84c86189216dc5ed, + 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515,0xfabaf3feaa5334a, + 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8,0x743e20e9ef511012, + 0xdf78e4b2bd342cf6,0x914da9246b255416, + 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, + 0xae9672aba3d0c320,0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, + 0x8865899617fb1871,0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, + 0xd51ea6fa85785631,0x552a74227f3ea565, + 0x8533285c936b35de,0xd53a88958f87275f, + 0xa67ff273b8460356,0x8a892abaf368f137, + 0xd01fef10a657842c,0x2d2b7569b0432d85, + 0x8213f56a67f6b29b,0x9c3b29620e29fc73, + 0xa298f2c501f45f42,0x8349f3ba91b47b8f, + 0xcb3f2f7642717713,0x241c70a936219a73, + 0xfe0efb53d30dd4d7,0xed238cd383aa0110, + 0x9ec95d1463e8a506,0xf4363804324a40aa, + 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da,0xdd94b7868e94050a, + 0x9b10a4e5e9913128,0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, + 
0xf24a01a73cf2dccf,0xbc633b39673c8cec, + 0x976e41088617ca01,0xd5be0503e085d813, + 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, + 0xec9c459d51852ba2,0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45,0xcabb90e5c942b503, + 0xb8da1662e7b00a17,0x3d6a751f3b936243, + 0xe7109bfba19c0c9d,0xcc512670a783ad4, + 0x906a617d450187e2,0x27fb2b80668b24c5, + 0xb484f9dc9641e9da,0xb1f9f660802dedf6, + 0xe1a63853bbd26451,0x5e7873f8a0396973, + 0x8d07e33455637eb2,0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7,0x7641a140cc7810fb, + 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, + 0xac2820d9623bf429,0x546345fa9fbdcd44, + 0xd732290fbacaf133,0xa97c177947ad4095, + 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, + 0xa81f301449ee8c70,0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c,0x73832eec6fff3111, + 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, + 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, + 0xa0555e361951c366,0xd7e105bcc332621f, + 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, + 0xfa856334878fc150,0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07,0xa862f80ec4700c8, + 0xf4a642e14c6262c8,0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, + 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, + 0xeeea5d5004981478,0x1858ccfce06cac74, + 0x95527a5202df0ccb,0xf37801e0c43ebc8, + 0xbaa718e68396cffd,0xd30560258f54e6ba, + 0xe950df20247c83fd,0x47c6b82ef32a2069, + 0x91d28b7416cdd27e,0x4cdc331d57fa5441, + 0xb6472e511c81471d,0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5,0x58180fddd97723a6, + 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; + +} // namespace internal +} // namespace simdjson +/* end file src/internal/numberparsing_tables.cpp */ +/* begin file src/internal/simdprune_tables.cpp */ +#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_ICELAKE || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64 + +#include + +namespace simdjson { // table modified and copied from +namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable +SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256] = { + 0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4, + 6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6, + 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, + 8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8, + 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, + 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8, + 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4, + 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, + 6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, + 10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, + 12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, + 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10, + 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12, + 14, 10, 12, 12, 14, 12, 14, 14, 16}; + +SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 
0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, + 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + +// 256 * 8 bytes = 2kB, easily fits in cache. +SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256] = { + 0x0706050403020100, 0x0007060504030201, 0x0007060504030200, + 0x0000070605040302, 0x0007060504030100, 0x0000070605040301, + 0x0000070605040300, 0x0000000706050403, 0x0007060504020100, + 0x0000070605040201, 0x0000070605040200, 0x0000000706050402, + 0x0000070605040100, 0x0000000706050401, 0x0000000706050400, + 0x0000000007060504, 0x0007060503020100, 0x0000070605030201, + 0x0000070605030200, 0x0000000706050302, 0x0000070605030100, + 0x0000000706050301, 0x0000000706050300, 0x0000000007060503, + 0x0000070605020100, 0x0000000706050201, 0x0000000706050200, + 0x0000000007060502, 0x0000000706050100, 0x0000000007060501, + 0x0000000007060500, 0x0000000000070605, 0x0007060403020100, + 0x0000070604030201, 0x0000070604030200, 0x0000000706040302, + 0x0000070604030100, 0x0000000706040301, 0x0000000706040300, + 0x0000000007060403, 0x0000070604020100, 0x0000000706040201, + 0x0000000706040200, 0x0000000007060402, 0x0000000706040100, + 0x0000000007060401, 0x0000000007060400, 0x0000000000070604, + 0x0000070603020100, 0x0000000706030201, 0x0000000706030200, + 0x0000000007060302, 0x0000000706030100, 0x0000000007060301, + 0x0000000007060300, 0x0000000000070603, 0x0000000706020100, + 0x0000000007060201, 0x0000000007060200, 0x0000000000070602, + 0x0000000007060100, 0x0000000000070601, 0x0000000000070600, + 0x0000000000000706, 0x0007050403020100, 0x0000070504030201, + 0x0000070504030200, 0x0000000705040302, 0x0000070504030100, + 0x0000000705040301, 0x0000000705040300, 0x0000000007050403, + 0x0000070504020100, 0x0000000705040201, 0x0000000705040200, + 0x0000000007050402, 0x0000000705040100, 0x0000000007050401, + 0x0000000007050400, 0x0000000000070504, 0x0000070503020100, + 0x0000000705030201, 0x0000000705030200, 0x0000000007050302, + 0x0000000705030100, 0x0000000007050301, 0x0000000007050300, + 0x0000000000070503, 0x0000000705020100, 0x0000000007050201, + 0x0000000007050200, 0x0000000000070502, 0x0000000007050100, + 0x0000000000070501, 0x0000000000070500, 0x0000000000000705, + 0x0000070403020100, 0x0000000704030201, 0x0000000704030200, + 0x0000000007040302, 0x0000000704030100, 0x0000000007040301, + 0x0000000007040300, 0x0000000000070403, 0x0000000704020100, + 0x0000000007040201, 0x0000000007040200, 0x0000000000070402, + 0x0000000007040100, 0x0000000000070401, 0x0000000000070400, + 0x0000000000000704, 0x0000000703020100, 0x0000000007030201, + 0x0000000007030200, 0x0000000000070302, 0x0000000007030100, + 0x0000000000070301, 0x0000000000070300, 0x0000000000000703, + 0x0000000007020100, 0x0000000000070201, 0x0000000000070200, + 0x0000000000000702, 0x0000000000070100, 0x0000000000000701, + 0x0000000000000700, 0x0000000000000007, 0x0006050403020100, + 0x0000060504030201, 0x0000060504030200, 0x0000000605040302, + 0x0000060504030100, 0x0000000605040301, 0x0000000605040300, + 0x0000000006050403, 0x0000060504020100, 0x0000000605040201, + 0x0000000605040200, 0x0000000006050402, 0x0000000605040100, + 0x0000000006050401, 0x0000000006050400, 
0x0000000000060504, + 0x0000060503020100, 0x0000000605030201, 0x0000000605030200, + 0x0000000006050302, 0x0000000605030100, 0x0000000006050301, + 0x0000000006050300, 0x0000000000060503, 0x0000000605020100, + 0x0000000006050201, 0x0000000006050200, 0x0000000000060502, + 0x0000000006050100, 0x0000000000060501, 0x0000000000060500, + 0x0000000000000605, 0x0000060403020100, 0x0000000604030201, + 0x0000000604030200, 0x0000000006040302, 0x0000000604030100, + 0x0000000006040301, 0x0000000006040300, 0x0000000000060403, + 0x0000000604020100, 0x0000000006040201, 0x0000000006040200, + 0x0000000000060402, 0x0000000006040100, 0x0000000000060401, + 0x0000000000060400, 0x0000000000000604, 0x0000000603020100, + 0x0000000006030201, 0x0000000006030200, 0x0000000000060302, + 0x0000000006030100, 0x0000000000060301, 0x0000000000060300, + 0x0000000000000603, 0x0000000006020100, 0x0000000000060201, + 0x0000000000060200, 0x0000000000000602, 0x0000000000060100, + 0x0000000000000601, 0x0000000000000600, 0x0000000000000006, + 0x0000050403020100, 0x0000000504030201, 0x0000000504030200, + 0x0000000005040302, 0x0000000504030100, 0x0000000005040301, + 0x0000000005040300, 0x0000000000050403, 0x0000000504020100, + 0x0000000005040201, 0x0000000005040200, 0x0000000000050402, + 0x0000000005040100, 0x0000000000050401, 0x0000000000050400, + 0x0000000000000504, 0x0000000503020100, 0x0000000005030201, + 0x0000000005030200, 0x0000000000050302, 0x0000000005030100, + 0x0000000000050301, 0x0000000000050300, 0x0000000000000503, + 0x0000000005020100, 0x0000000000050201, 0x0000000000050200, + 0x0000000000000502, 0x0000000000050100, 0x0000000000000501, + 0x0000000000000500, 0x0000000000000005, 0x0000000403020100, + 0x0000000004030201, 0x0000000004030200, 0x0000000000040302, + 0x0000000004030100, 0x0000000000040301, 0x0000000000040300, + 0x0000000000000403, 0x0000000004020100, 0x0000000000040201, + 0x0000000000040200, 0x0000000000000402, 0x0000000000040100, + 0x0000000000000401, 0x0000000000000400, 0x0000000000000004, + 0x0000000003020100, 0x0000000000030201, 0x0000000000030200, + 0x0000000000000302, 0x0000000000030100, 0x0000000000000301, + 0x0000000000000300, 0x0000000000000003, 0x0000000000020100, + 0x0000000000000201, 0x0000000000000200, 0x0000000000000002, + 0x0000000000000100, 0x0000000000000001, 0x0000000000000000, + 0x0000000000000000, +}; //static uint64_t thintable_epi8[256] + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_ICELAKE || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64 +/* end file src/internal/simdprune_tables.cpp */ +/* begin file src/implementation.cpp */ +#include + +namespace simdjson { + +bool implementation::supported_by_runtime_system() const { + uint32_t required_instruction_sets = this->required_instruction_sets(); + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets); +} + +namespace internal { + +// Static array of known implementations. We're hoping these get baked into the executable +// without requiring a static initializer. 
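For reference, here is a minimal usage sketch of the implementation-selection API this hunk vendors. It is not part of the diff; it only assumes the public entry points visible above (get_available_implementations(), get_active_implementation(), implementation::name()/description()/supported_by_runtime_system()) and the amalgamated "simdjson.h" header that accompanies this source. Treat it as an illustration of how a consumer such as kram could enumerate or pin a kernel, not as code from this PR.

    #include <cstdio>
    #include "simdjson.h"  // amalgamated header shipped alongside this .cpp (assumed path)

    int main() {
        // Enumerate every kernel compiled into this amalgamation and report whether
        // the running CPU satisfies its required instruction sets (this mirrors
        // implementation::supported_by_runtime_system() defined above).
        for (auto impl : simdjson::get_available_implementations()) {
            std::printf("%-10s %-12s %s\n",
                        impl->name().c_str(),
                        impl->supported_by_runtime_system() ? "supported" : "unsupported",
                        impl->description().c_str());
        }

        // Optionally pin a specific kernel (here the portable fallback, if it was
        // compiled in) instead of letting the lazy best-supported detector choose.
        if (auto fallback = simdjson::get_available_implementations()["fallback"]) {
            simdjson::get_active_implementation() = fallback;
        }

        std::printf("active: %s\n",
                    simdjson::get_active_implementation()->name().c_str());
        return 0;
    }

The same lookup-by-name mechanism is what the SIMDJSON_FORCE_IMPLEMENTATION environment variable uses in set_best() further down in this hunk.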
+ +#if SIMDJSON_IMPLEMENTATION_ICELAKE +static const icelake::implementation* get_icelake_singleton() { + static const icelake::implementation icelake_singleton{}; + return &icelake_singleton; +} +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL +static const haswell::implementation* get_haswell_singleton() { + static const haswell::implementation haswell_singleton{}; + return &haswell_singleton; +} +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE +static const westmere::implementation* get_westmere_singleton() { + static const westmere::implementation westmere_singleton{}; + return &westmere_singleton; +} +#endif // SIMDJSON_IMPLEMENTATION_WESTMERE +#if SIMDJSON_IMPLEMENTATION_ARM64 +static const arm64::implementation* get_arm64_singleton() { + static const arm64::implementation arm64_singleton{}; + return &arm64_singleton; +} +#endif // SIMDJSON_IMPLEMENTATION_ARM64 +#if SIMDJSON_IMPLEMENTATION_PPC64 +static const ppc64::implementation* get_ppc64_singleton() { + static const ppc64::implementation ppc64_singleton{}; + return &ppc64_singleton; +} +#endif // SIMDJSON_IMPLEMENTATION_PPC64 +#if SIMDJSON_IMPLEMENTATION_FALLBACK +static const fallback::implementation* get_fallback_singleton() { + static const fallback::implementation fallback_singleton{}; + return &fallback_singleton; +} +#endif // SIMDJSON_IMPLEMENTATION_FALLBACK + +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final : public implementation { +public: + const std::string &name() const noexcept final { return set_best()->name(); } + const std::string &description() const noexcept final { return set_best()->description(); } + uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final { + return set_best()->create_dom_parser_implementation(capacity, max_length, dst); + } + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { + return set_best()->minify(buf, len, dst, dst_len); + } + simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } + simdjson_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} +private: + const implementation *set_best() const noexcept; +}; + +static const std::initializer_list& get_available_implementation_pointers() { + static const std::initializer_list available_implementation_pointers { +#if SIMDJSON_IMPLEMENTATION_ICELAKE + get_icelake_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL + get_haswell_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE + get_westmere_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_ARM64 + get_arm64_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_PPC64 + get_ppc64_singleton(), +#endif +#if SIMDJSON_IMPLEMENTATION_FALLBACK + get_fallback_singleton(), +#endif + }; // available_implementation_pointers + return available_implementation_pointers; +} + +// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support +class unsupported_implementation final : public implementation { +public: + simdjson_warn_unused error_code create_dom_parser_implementation( + 
size_t, + size_t, + std::unique_ptr& + ) const noexcept final { + return UNSUPPORTED_ARCHITECTURE; + } + simdjson_warn_unused error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override { + return UNSUPPORTED_ARCHITECTURE; + } + simdjson_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. + } + unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; + +const unsupported_implementation* get_unsupported_singleton() { + static const unsupported_implementation unsupported_singleton{}; + return &unsupported_singleton; +} + +size_t available_implementation_list::size() const noexcept { + return internal::get_available_implementation_pointers().size(); +} +const implementation * const *available_implementation_list::begin() const noexcept { + return internal::get_available_implementation_pointers().begin(); +} +const implementation * const *available_implementation_list::end() const noexcept { + return internal::get_available_implementation_pointers().end(); +} +const implementation *available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + for (const implementation *impl : internal::get_available_implementation_pointers()) { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } + } + return get_unsupported_singleton(); // this should never happen? +} + +const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *force_implementation_name = getenv("SIMDJSON_FORCE_IMPLEMENTATION"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (force_implementation_name) { + auto force_implementation = get_available_implementations()[force_implementation_name]; + if (force_implementation) { + return get_active_implementation() = force_implementation; + } else { + // Note: abort() and stderr usage within the library is forbidden. 
+ return get_active_implementation() = get_unsupported_singleton(); + } + } + return get_active_implementation() = get_available_implementations().detect_best_supported(); +} + +} // namespace internal + +SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() { + static const internal::available_implementation_list available_implementations{}; + return available_implementations; +} + +SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation() { + static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + static internal::atomic_ptr active_implementation{&detect_best_supported_implementation_on_first_use_singleton}; + return active_implementation; +} + +simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept { + return get_active_implementation()->minify(reinterpret_cast(buf), len, reinterpret_cast(dst), dst_len); +} +simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return get_active_implementation()->validate_utf8(buf, len); +} +const implementation * builtin_implementation() { + static const implementation * builtin_impl = get_available_implementations()[SIMDJSON_STRINGIFY(SIMDJSON_BUILTIN_IMPLEMENTATION)]; + assert(builtin_impl); + return builtin_impl; +} + + +} // namespace simdjson +/* end file src/implementation.cpp */ + +#if SIMDJSON_IMPLEMENTATION_ARM64 +/* begin file src/arm64/implementation.cpp */ +/* begin file include/simdjson/arm64/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "arm64" +// #define SIMDJSON_IMPLEMENTATION arm64 +/* end file include/simdjson/arm64/begin.h */ + +namespace simdjson { +namespace arm64 { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +} // namespace arm64 +} // namespace simdjson + +/* begin file include/simdjson/arm64/end.h */ +/* end file include/simdjson/arm64/end.h */ +/* end file src/arm64/implementation.cpp */ +/* begin file src/arm64/dom_parser_implementation.cpp */ +/* begin file include/simdjson/arm64/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "arm64" +// #define SIMDJSON_IMPLEMENTATION arm64 +/* end file include/simdjson/arm64/begin.h */ + +// +// Stage 1 +// +namespace simdjson { +namespace arm64 { +namespace { + +using namespace simd; + +struct json_character_block { + static simdjson_inline json_character_block classify(const simd::simd8x64& in); + + simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; } + simdjson_inline uint64_t op() const noexcept { return _op; } + simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); } + + uint64_t _whitespace; + uint64_t _op; +}; + +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // Functional programming causes trouble with Visual Studio. 
+ // Keeping this version in comments since it is much nicer: + // auto v = in.map([&](simd8 chunk) { + // auto nib_lo = chunk & 0xf; + // auto nib_hi = chunk.shr<4>(); + // auto shuf_lo = nib_lo.lookup_16(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + // auto shuf_hi = nib_hi.lookup_16(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + // return shuf_lo & shuf_hi; + // }); + const simd8 table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + const simd8 table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + + simd8x64 v( + (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2), + (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2), + (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2), + (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2) + ); + + + // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). *However* if we only need spaces, + // it is likely that we will still compute 'v' above with two lookup_16: one + // could do it a bit cheaper. This is in contrast with the x64 implementations + // where we can, efficiently, do the white space and structural matching + // separately. One reason for this difference is that on ARM NEON, the table + // lookups either zero or leave unchanged the characters exceeding 0xF whereas + // on x64, the equivalent instruction (pshufb) automatically applies a mask, + // ignoring the 4 most significant bits. Thus the x64 implementation is + // optimized differently. This being said, if you use this code strictly + // just for minification (or just to identify the structural characters), + // there is a small untaken optimization opportunity here. We deliberately + // do not pick it up. + + uint64_t op = simd8x64( + v.chunks[0].any_bits_set(0x7), + v.chunks[1].any_bits_set(0x7), + v.chunks[2].any_bits_set(0x7), + v.chunks[3].any_bits_set(0x7) + ).to_bitmask(); + + uint64_t whitespace = simd8x64( + v.chunks[0].any_bits_set(0x18), + v.chunks[1].any_bits_set(0x18), + v.chunks[2].any_bits_set(0x18), + v.chunks[3].any_bits_set(0x18) + ).to_bitmask(); + + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0x80u; +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0xc0u); + simd8 is_third_byte = prev2 >= uint8_t(0xe0u); + simd8 is_fourth_byte = prev3 >= uint8_t(0xf0u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well. + // This will work fine because we only have to report errors for cases with 0-1 lead bytes. + // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is + // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character. + // The error will be detected there. 
+ return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0xe0u); + simd8 is_fourth_byte = prev3 >= uint8_t(0xf0u); + return is_third_byte ^ is_fourth_byte; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +/* begin file src/generic/stage1/utf8_lookup4_algorithm.h */ +namespace simdjson { +namespace arm64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdjson_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdjson_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdjson_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ +#if SIMDJSON_IMPLEMENTATION_ICELAKE + static const uint8_t max_array[64] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#else + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#endif + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdjson_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdjson_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + + simdjson_inline void check_next_input(const simd8x64& input) { + if(simdjson_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 1) + ||(simd8x64::NUM_CHUNKS == 2) + || (simd8x64::NUM_CHUNKS == 4), + "We support one, two or four chunks per 64-byte block."); + SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 1) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } + // do not forget to call check_eof! + simdjson_inline error_code errors() { + return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS; + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/utf8_lookup4_algorithm.h */ +/* begin file src/generic/stage1/json_structural_indexer.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +/* begin file src/generic/stage1/buf_block_reader.h */ +namespace simdjson { +namespace arm64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdjson_inline size_t block_index(); + simdjson_inline bool has_full_block() const; + simdjson_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. 
+ * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdjson_inline size_t get_remainder(uint8_t *dst) const; + simdjson_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text_64(const uint8_t *text) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text(const simd8x64& in) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_mask(uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdjson_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdjson_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdjson_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdjson_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdjson_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdjson_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/buf_block_reader.h */ +/* begin file src/generic/stage1/json_string_scanner.h */ +namespace simdjson { +namespace arm64 { +namespace { +namespace stage1 { + +struct json_string_block { + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) : + _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {} + + // Escaped characters (characters following an escape() character) + simdjson_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. 
in \\, includes only the first \) + simdjson_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + simdjson_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + simdjson_inline uint64_t string_start() const { return _quote & _in_string; } + // End quotes of strings + simdjson_inline uint64_t string_end() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + simdjson_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + simdjson_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; +}; + +// Scans blocks for string characters, storing the state necessary to do so +class json_string_scanner { +public: + simdjson_inline json_string_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Intended to be defined by the implementation + simdjson_inline uint64_t find_escaped(uint64_t escape); + simdjson_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; +}; + +// +// Finds escaped characters (characters following \). +// +// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). +// +// Does this by: +// - Shift the escape mask to get potentially escaped characters (characters after backslashes). +// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) +// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) +// +// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all +// escape sequences, filters out the ones that start on even bits, and adds that to the mask of +// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since +// the start bit causes a carry), and leaves even-bit sequences alone. 
+// +// Example: +// +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape +// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape +// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later +// invert_mask | | cxxx c xx c| even_seq << 1 +// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit +// escaped | x | x x x x x x x x | +// desired | x | x x x x x x x x | +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// +simdjson_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; +} + +// +// Return a mask of all string characters plus end quotes. +// +// prev_escaped is overflow saying whether the next character is escaped. +// prev_in_string is overflow saying whether we're still in a string. +// +// Backslash sequences outside of quotes will be detected in stage 2. +// +simdjson_inline json_string_block json_string_scanner::next(const simd::simd8x64& in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). + // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_string_block( + backslash, + escaped, + quote, + in_string + ); +} + +simdjson_inline error_code json_string_scanner::finish() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + return SUCCESS; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/json_string_scanner.h */ +/* begin file src/generic/stage1/json_scanner.h */ +namespace simdjson { +namespace arm64 { +namespace { +namespace stage1 { + +/** + * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). 
+ * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. + * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural characters. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that is followed by a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. + */ +struct json_block { +public: + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ + simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. not in a string) */ + simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, structural characters ('operators'), scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_nonquote_scalar; +private: + // Potential structurals (i.e. disregarding strings) + + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. + **/ + simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); } + /** + * The start of non-operator runs, like 123, true and "abc". + * It main reside inside a string. + **/ + simdjson_inline uint64_t potential_scalar_start() const noexcept { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. + * The characters following a quote are not included. 
+ */ + simdjson_inline uint64_t follows_potential_scalar() const noexcept { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } +}; + +/** + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ +class json_scanner { +public: + json_scanner() = default; + simdjson_inline json_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or a structural character/'operator'). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; +}; + + +// +// Check if the current character immediately follows a matching character. +// +// For example, this checks for quotes with backslashes in front of them: +// +// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); +// +simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; +} + +simdjson_inline json_block json_scanner::next(const simd::simd8x64& in) { + json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters + json_character_block characters = json_character_block::classify(in); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_block( + strings,// strings is a function-local object so either it moves or the copy is elided. 
+ characters, + follows_nonquote_scalar + ); +} + +simdjson_inline error_code json_scanner::finish() { + return string_scanner.finish(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/json_scanner.h */ +/* begin file src/generic/stage1/json_minifier.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +namespace simdjson { +namespace arm64 { +namespace { +namespace stage1 { + +class json_minifier { +public: + template + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + +private: + simdjson_inline json_minifier(uint8_t *_dst) + : dst{_dst} + {} + template + simdjson_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block); + simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; +}; + +simdjson_inline void json_minifier::next(const simd::simd8x64& in, const json_block& block) { + uint64_t mask = block.whitespace(); + dst += in.compress(mask, dst); +} + +simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { + error_code error = scanner.finish(); + if (error) { dst_len = 0; return error; } + dst_len = dst - dst_start; + return SUCCESS; +} + +template<> +simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + simd::simd8x64 in_2(block_buf+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); +} + +template<> +simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); +} + +template +error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { + buf_block_reader reader(buf, len); + json_minifier minifier(dst); + + // Index the first n-1 blocks + while (reader.has_full_block()) { + minifier.step(reader.full_block(), reader); + } + + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + size_t remaining_bytes = reader.get_remainder(block); + if (remaining_bytes > 0) { + // We do not want to write directly to the output stream. Rather, we write + // to a local buffer (for safety). + uint8_t out_block[STEP_SIZE]; + uint8_t * const guarded_dst{minifier.dst}; + minifier.dst = out_block; + minifier.step(block, reader); + size_t to_write = minifier.dst - out_block; + // In some cases, we could be enticed to consider the padded spaces + // as part of the string. This is fine as long as we do not write more + // than we consumed. 
+ if(to_write > remaining_bytes) { to_write = remaining_bytes; } + memcpy(guarded_dst, out_block, to_write); + minifier.dst = guarded_dst + to_write; + } + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/json_minifier.h */ +/* begin file src/generic/stage1/find_next_document_index.h */ +namespace simdjson { +namespace arm64 { +namespace { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. 
+ return parser.n_structural_indexes; + } + return 0; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/find_next_document_index.h */ + +namespace simdjson { +namespace arm64 { +namespace { +namespace stage1 { + +class bit_indexer { +public: + uint32_t *tail; + + simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + // + // If the kernel sets SIMDJSON_CUSTOM_BIT_INDEXER, then it will provide its own + // version of the code. +#ifdef SIMDJSON_CUSTOM_BIT_INDEXER + simdjson_inline void write(uint32_t idx, uint64_t bits); +#else + simdjson_inline void write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). 
+ if (simdjson_unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + int i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; +#endif + } +#endif // SIMDJSON_CUSTOM_BIT_INDEXER + +}; + +class json_structural_indexer { +public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; + +private: + simdjson_inline json_structural_indexer(uint32_t *structural_indexes); + template + simdjson_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); + simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); + + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; +}; + +simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + +// Skip the last character if it is partial +simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (simdjson_unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} + +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. 
Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { + if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); + } + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) + uint8_t block[STEP_SIZE]; + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } + indexer.step(block, reader); + return indexer.finish(parser, reader.block_index(), len, partial); +} + +template<> +simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} + +simdjson_inline void json_structural_indexer::next(const simd::simd8x64& in, const json_block& block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = is_streaming(partial) ? 
+ ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * The On Demand API requires special padding. + * + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + * + * This is illustrated with the test array_iterate_unclosed_error() on the following input: + * R"({ "a": [,,)" + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial == stage1_mode::streaming_partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } + // We truncate the input to the end of the last complete document (or zero). + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. 
Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } + } + checker.check_eof(); + return checker.errors(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/generic/stage1/utf8_validator.h */ +namespace simdjson { +namespace arm64 { +namespace { +namespace stage1 { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return c.errors() == error_code::SUCCESS; +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage1/utf8_validator.h */ + +// +// Stage 2 +// + +/* begin file src/generic/stage2/stringparsing.h */ +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace arm64 { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. 
+ 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + return false; + } + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. + // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + return false; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. 
Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/generic/stage2/tape_builder.h */ +/* begin file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace arm64 { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? 
&structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + 
} + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_string(*this, value); + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_number(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // 
namespace stage2 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/tape_writer.h */ +namespace simdjson { +namespace arm64 { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. 
*/ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ + +namespace simdjson { +namespace arm64 { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. */ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+ */ + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + /** Called each time a new field or element in an array or object is found. */ + simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept; + + /** Next location to write to tape */ + tape_writer tape; +private: + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + simdjson_inline tape_builder(dom::document &doc) noexcept; + + simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept; + simdjson_inline void start_container(json_iterator &iter) noexcept; + simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept; + simdjson_inline void on_end_string(uint8_t *dst) noexcept; +}; // class tape_builder + +template +simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept { + dom_parser.doc = &doc; + json_iterator iter(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst); + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + 
+simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file src/generic/stage2/tape_builder.h */ + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace arm64 { +namespace { +namespace stage1 { + +simdjson_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { + // On ARM, we don't short-circuit this if there are no backslashes, because the branch gives us no + // benefit and therefore makes things worse. 
+ // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } + return find_escaped_branchless(backslash); +} + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst) const noexcept { + return arm64::stringparsing::parse_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace arm64 +} // namespace simdjson + +/* begin file include/simdjson/arm64/end.h */ +/* end file include/simdjson/arm64/end.h */ +/* end file src/arm64/dom_parser_implementation.cpp */ +#endif +#if SIMDJSON_IMPLEMENTATION_FALLBACK +/* begin file src/fallback/implementation.cpp */ +/* begin file include/simdjson/fallback/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "fallback" +// #define SIMDJSON_IMPLEMENTATION fallback +/* end file include/simdjson/fallback/begin.h */ + +namespace simdjson { +namespace fallback { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +} // namespace fallback +} // namespace simdjson + +/* begin file include/simdjson/fallback/end.h */ +/* end file include/simdjson/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ +/* begin file src/fallback/dom_parser_implementation.cpp */ +/* begin file include/simdjson/fallback/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "fallback" +// #define SIMDJSON_IMPLEMENTATION fallback +/* end file include/simdjson/fallback/begin.h */ + +// +// Stage 1 +// +/* begin file src/generic/stage1/find_next_document_index.h */ +namespace simdjson { +namespace fallback { +namespace { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). 
If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. 
+ return parser.n_structural_indexes; + } + return 0; +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file src/generic/stage1/find_next_document_index.h */ + +namespace simdjson { +namespace fallback { +namespace { +namespace stage1 { + +class structural_scanner { +public: + +simdjson_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial) + : buf{_parser.buf}, + next_structural_index{_parser.structural_indexes.get()}, + parser{_parser}, + len{static_cast(_parser.len)}, + partial{_partial} { +} + +simdjson_inline void add_structural() { + *next_structural_index = idx; + next_structural_index++; +} + +simdjson_inline bool is_continuation(uint8_t c) { + return (c & 0xc0) == 0x80; +} + +simdjson_inline void validate_utf8_character() { + // Continuation + if (simdjson_unlikely((buf[idx] & 0x40) == 0)) { + // extra continuation + error = UTF8_ERROR; + idx++; + return; + } + + // 2-byte + if ((buf[idx] & 0x20) == 0) { + // missing continuation + if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { + if (idx+1 > len && is_streaming(partial)) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 1100000_ 10______ + if (buf[idx] <= 0xc1) { error = UTF8_ERROR; } + idx += 2; + return; + } + + // 3-byte + if ((buf[idx] & 0x10) == 0) { + // missing continuation + if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { + if (idx+2 > len && is_streaming(partial)) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 11100000 100_____ ________ + if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; } + // surrogates: U+D800-U+DFFF 11101101 101_____ + if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; } + idx += 3; + return; + } + + // 4-byte + // missing continuation + if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { + if (idx+2 > len && is_streaming(partial)) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } + // overlong: 11110000 1000____ ________ ________ + if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; } + // too large: > U+10FFFF: + // 11110100 (1001|101_)____ + // 1111(1___|011_|0101) 10______ + // also includes 5, 6, 7 and 8 byte characters: + // 11111___ + if (buf[idx] == 0xf4 && buf[idx+1] >= 0x90) { error = UTF8_ERROR; } + if (buf[idx] >= 0xf5) { error = UTF8_ERROR; } + idx += 4; +} + +// Returns true if the string is unclosed. +simdjson_inline bool validate_string() { + idx++; // skip first quote + while (idx < len && buf[idx] != '"') { + if (buf[idx] == '\\') { + idx += 2; + } else if (simdjson_unlikely(buf[idx] & 0x80)) { + validate_utf8_character(); + } else { + if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; } + idx++; + } + } + if (idx >= len) { return true; } + return false; +} + +simdjson_inline bool is_whitespace_or_operator(uint8_t c) { + switch (c) { + case '{': case '}': case '[': case ']': case ',': case ':': + case ' ': case '\r': case '\n': case '\t': + return true; + default: + return false; + } +} + +// +// Parse the entire input in STEP_SIZE-byte chunks. +// +simdjson_inline error_code scan() { + bool unclosed_string = false; + for (;idx 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. 
+ return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + parser.n_structural_indexes = new_structural_indexes; + } else if(partial == stage1_mode::streaming_final) { + if(unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (parser.n_structural_indexes == 0) { return EMPTY; } + } else if(unclosed_string) { error = UNCLOSED_STRING; } + return error; +} + +private: + const uint8_t *buf; + uint32_t *next_structural_index; + dom_parser_implementation &parser; + uint32_t len; + uint32_t idx{0}; + error_code error{SUCCESS}; + stage1_mode partial; +}; // structural_scanner + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept { + this->buf = _buf; + this->len = _len; + stage1::structural_scanner scanner(*this, partial); + return scanner.scan(); +} + +// big table for the minifier +static uint8_t jump_table[256 * 3] = { + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, + 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, + 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, +}; + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + size_t i = 0, pos = 0; + uint8_t quote = 0; + uint8_t nonescape = 1; + + while (i < len) { + unsigned char c = buf[i]; + uint8_t *meta = jump_table + 3 * c; + + quote = quote ^ (meta[0] & nonescape); + dst[pos] = c; + pos += meta[2] | quote; + + i += 1; + nonescape = uint8_t(~nonescape) | (meta[1]); + } + dst_len = pos; // we intentionally do not work with a reference + // for fear of aliasing + return quote ? UNCLOSED_STRING : SUCCESS; +} + +// credit: based on code from Google Fuchsia (Apache Licensed) +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 8 bytes are ascii. 
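+    // (More precisely, the fast path below inspects 16 bytes at a time: two 8-byte
+    // words are loaded, OR-ed together and tested against 0x8080808080808080; if no
+    // byte has its high bit set, the whole 16-byte run is ASCII and is skipped.)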
+ uint64_t next_pos = pos + 16; + if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v1; + memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + if (byte < 0x80) { + pos++; + continue; + } else if ((byte & 0xe0) == 0xc0) { + next_pos = pos + 2; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0xc0) != 0x80) { return false; } + // range check + code_point = (byte & 0x1f) << 6 | (data[pos + 1] & 0x3f); + if (code_point < 0x80 || 0x7ff < code_point) { return false; } + } else if ((byte & 0xf0) == 0xe0) { + next_pos = pos + 3; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0xc0) != 0x80) { return false; } + if ((data[pos + 2] & 0xc0) != 0x80) { return false; } + // range check + code_point = (byte & 0x0f) << 12 | + (data[pos + 1] & 0x3f) << 6 | + (data[pos + 2] & 0x3f); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return false; + } + } else if ((byte & 0xf8) == 0xf0) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { return false; } + if ((data[pos + 1] & 0xc0) != 0x80) { return false; } + if ((data[pos + 2] & 0xc0) != 0x80) { return false; } + if ((data[pos + 3] & 0xc0) != 0x80) { return false; } + // range check + code_point = + (byte & 0x07) << 18 | (data[pos + 1] & 0x3f) << 12 | + (data[pos + 2] & 0x3f) << 6 | (data[pos + 3] & 0x3f); + if (code_point <= 0xffff || 0x10ffff < code_point) { return false; } + } else { + // we may have a continuation + return false; + } + pos = next_pos; + } + return true; +} + +} // namespace fallback +} // namespace simdjson + +// +// Stage 2 +// +/* begin file src/generic/stage2/stringparsing.h */ +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace fallback { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. + 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. 
+ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + return false; + } + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. + // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + return false; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. 
Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/generic/stage2/tape_builder.h */ +/* begin file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace fallback { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? 
&structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + 
} + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_string(*this, value); + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_number(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // 
namespace stage2 +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/tape_writer.h */ +namespace simdjson { +namespace fallback { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. 
*/ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ + +namespace simdjson { +namespace fallback { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. */ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+ */ + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + /** Called each time a new field or element in an array or object is found. */ + simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept; + + /** Next location to write to tape */ + tape_writer tape; +private: + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + simdjson_inline tape_builder(dom::document &doc) noexcept; + + simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept; + simdjson_inline void start_container(json_iterator &iter) noexcept; + simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept; + simdjson_inline void on_end_string(uint8_t *dst) noexcept; +}; // class tape_builder + +template +simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept { + dom_parser.doc = &doc; + json_iterator iter(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst); + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + 
+simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file src/generic/stage2/tape_builder.h */ + +namespace simdjson { +namespace fallback { + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst) const noexcept { + return fallback::stringparsing::parse_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace fallback +} // namespace simdjson + +/* begin file include/simdjson/fallback/end.h */ +/* end file include/simdjson/fallback/end.h */ +/* end file src/fallback/dom_parser_implementation.cpp */ +#endif +#if SIMDJSON_IMPLEMENTATION_ICELAKE +/* begin file src/icelake/implementation.cpp */ +/* begin file include/simdjson/icelake/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "icelake" +// #define SIMDJSON_IMPLEMENTATION icelake +SIMDJSON_TARGET_ICELAKE +/* end file include/simdjson/icelake/begin.h */ + +namespace simdjson { +namespace icelake { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +} // namespace icelake +} // namespace simdjson + +/* begin file include/simdjson/icelake/end.h */ +SIMDJSON_UNTARGET_ICELAKE +/* end file include/simdjson/icelake/end.h */ + +/* end file src/icelake/implementation.cpp */ +/* begin file src/icelake/dom_parser_implementation.cpp */ +/* begin file include/simdjson/icelake/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "icelake" +// #define SIMDJSON_IMPLEMENTATION icelake +SIMDJSON_TARGET_ICELAKE +/* end file include/simdjson/icelake/begin.h */ + +// +// Stage 1 +// + +namespace simdjson { +namespace icelake { +namespace { + +using namespace simd; + +struct json_character_block { + static simdjson_inline json_character_block classify(const simd::simd8x64& in); + // ASCII white-space ('\r','\n','\t',' ') + simdjson_inline uint64_t whitespace() const noexcept; + // non-quote structural characters (comma, colon, braces, brackets) + simdjson_inline uint64_t op() const noexcept; + // neither a structural character nor a white-space, so letters, numbers and quotes + simdjson_inline uint64_t scalar() const noexcept; + + uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ') + uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes) +}; + +simdjson_inline uint64_t json_character_block::whitespace() const noexcept { return _whitespace; } +simdjson_inline uint64_t json_character_block::op() const noexcept { return _op; } +simdjson_inline uint64_t json_character_block::scalar() const noexcept { return ~(op() | whitespace()); } + 
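+// Each of these masks is a 64-bit bitset with one bit per byte of the 64-byte
+// input block: bit i is set when byte i is JSON white-space (_whitespace) or a
+// non-quote structural operator (_op); scalar() is simply the complement of their
+// union. A scalar sketch of the same classification (illustrative only, the parser
+// uses the SIMD table lookups below) would be:
+//
+//   uint64_t whitespace = 0, op = 0;
+//   for (int i = 0; i < 64; i++) {
+//     uint8_t c = block[i];
+//     if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { whitespace |= uint64_t(1) << i; }
+//     if (c == ',' || c == ':' || c == '[' || c == ']' ||
+//         c == '{' || c == '}')                          { op |= uint64_t(1) << i; }
+//   }
+//   uint64_t scalar = ~(whitespace | op);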
+// This identifies structural characters (comma, colon, braces, brackets), +// and ASCII white-space ('\r','\n','\t',' '). +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + const auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + + // The 6 operators (:,[]{}) have these values: + // + // , 2C + // : 3A + // [ 5B + // { 7B + // ] 5D + // } 7D + // + // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique. + // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then + // match it (against | 0x20). + // + // To prevent recognizing other characters, everything else gets compared with 0, which cannot + // match due to the | 0x20. + // + // NOTE: Due to the | 0x20, this ALSO treats and (control characters 0C and 1A) like , + // and :. This gets caught in stage 2, which checks the actual character to ensure the right + // operators are in the right places. + const auto op_table = simd8::repeat_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B + ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D + ); + + // We compute whitespace and op separately. If later code only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). + + const uint64_t whitespace = in.eq({ + _mm512_shuffle_epi8(whitespace_table, in.chunks[0]) + }); + // Turn [ and ] into { and } + const simd8x64 curlified{ + in.chunks[0] | 0x20 + }; + const uint64_t op = curlified.eq({ + _mm512_shuffle_epi8(op_table, in.chunks[0]) + }); + + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. 
+ return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +/* begin file src/generic/stage1/utf8_lookup4_algorithm.h */ +namespace simdjson { +namespace icelake { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdjson_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
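+    // The three nibble lookups (byte_1_high above, byte_1_low and byte_2_high
+    // below) are AND-ed together on return, so an error bit survives only when the
+    // high nibble of the lead byte, its low nibble and the high nibble of the
+    // following byte all flag the same problem.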
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdjson_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdjson_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ +#if SIMDJSON_IMPLEMENTATION_ICELAKE + static const uint8_t max_array[64] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#else + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#endif + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdjson_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdjson_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + + simdjson_inline void check_next_input(const simd8x64& input) { + if(simdjson_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 1) + ||(simd8x64::NUM_CHUNKS == 2) + || (simd8x64::NUM_CHUNKS == 4), + "We support one, two or four chunks per 64-byte block."); + SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 1) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } + // do not forget to call check_eof! + simdjson_inline error_code errors() { + return this->error.any_bits_set_anywhere() ? 
error_code::UTF8_ERROR : error_code::SUCCESS; + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/utf8_lookup4_algorithm.h */ +// defining SIMDJSON_CUSTOM_BIT_INDEXER allows us to provide our own bit_indexer::write +#define SIMDJSON_CUSTOM_BIT_INDEXER +/* begin file src/generic/stage1/json_structural_indexer.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +/* begin file src/generic/stage1/buf_block_reader.h */ +namespace simdjson { +namespace icelake { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdjson_inline size_t block_index(); + simdjson_inline bool has_full_block() const; + simdjson_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdjson_inline size_t get_remainder(uint8_t *dst) const; + simdjson_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text_64(const uint8_t *text) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text(const simd8x64& in) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_mask(uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdjson_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdjson_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdjson_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdjson_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdjson_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
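+  // Worked example: with STEP_SIZE == 64, a 70-byte buffer leaves idx == 64 after the
+  // full-block loop; the memset above fills dst with 64 spaces, the memcpy below copies the
+  // remaining 6 input bytes over the front, and the caller gets back 6 as the effective
+  // length of a fully space-padded block.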
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdjson_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/buf_block_reader.h */ +/* begin file src/generic/stage1/json_string_scanner.h */ +namespace simdjson { +namespace icelake { +namespace { +namespace stage1 { + +struct json_string_block { + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) : + _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {} + + // Escaped characters (characters following an escape() character) + simdjson_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. in \\, includes only the first \) + simdjson_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + simdjson_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + simdjson_inline uint64_t string_start() const { return _quote & _in_string; } + // End quotes of strings + simdjson_inline uint64_t string_end() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + simdjson_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + simdjson_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; +}; + +// Scans blocks for string characters, storing the state necessary to do so +class json_string_scanner { +public: + simdjson_inline json_string_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Intended to be defined by the implementation + simdjson_inline uint64_t find_escaped(uint64_t escape); + simdjson_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; +}; + +// +// Finds escaped characters (characters following \). +// +// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). +// +// Does this by: +// - Shift the escape mask to get potentially escaped characters (characters after backslashes). 
+// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) +// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) +// +// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all +// escape sequences, filters out the ones that start on even bits, and adds that to the mask of +// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since +// the start bit causes a carry), and leaves even-bit sequences alone. +// +// Example: +// +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape +// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape +// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later +// invert_mask | | cxxx c xx c| even_seq << 1 +// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit +// escaped | x | x x x x x x x x | +// desired | x | x x x x x x x x | +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// +simdjson_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; +} + +// +// Return a mask of all string characters plus end quotes. +// +// prev_escaped is overflow saying whether the next character is escaped. +// prev_in_string is overflow saying whether we're still in a string. +// +// Backslash sequences outside of quotes will be detected in stage 2. +// +simdjson_inline json_string_block json_string_scanner::next(const simd::simd8x64& in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). + // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. 
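+  // Worked example: for the input {"ab":1} the quote mask has bits 1 and 4 set; prefix_xor
+  // turns that into bits 1..3 (the opening quote plus 'a' and 'b'), the closing quote stays
+  // clear, and with prev_in_string == 0 the final XOR leaves in_string unchanged.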
+ return json_string_block( + backslash, + escaped, + quote, + in_string + ); +} + +simdjson_inline error_code json_string_scanner::finish() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + return SUCCESS; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/json_string_scanner.h */ +/* begin file src/generic/stage1/json_scanner.h */ +namespace simdjson { +namespace icelake { +namespace { +namespace stage1 { + +/** + * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). + * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. + * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural characters. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that is followed by a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. + */ +struct json_block { +public: + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ + simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. not in a string) */ + simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, structural characters ('operators'), scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_nonquote_scalar; +private: + // Potential structurals (i.e. disregarding strings) + + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. 
+ **/ + simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); } + /** + * The start of non-operator runs, like 123, true and "abc". + * It main reside inside a string. + **/ + simdjson_inline uint64_t potential_scalar_start() const noexcept { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. + * The characters following a quote are not included. + */ + simdjson_inline uint64_t follows_potential_scalar() const noexcept { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } +}; + +/** + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ +class json_scanner { +public: + json_scanner() = default; + simdjson_inline json_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or a structural character/'operator'). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; +}; + + +// +// Check if the current character immediately follows a matching character. +// +// For example, this checks for quotes with backslashes in front of them: +// +// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); +// +simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; +} + +simdjson_inline json_block json_scanner::next(const simd::simd8x64& in) { + json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters + json_character_block characters = json_character_block::classify(in); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. 
However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_block( + strings,// strings is a function-local object so either it moves or the copy is elided. + characters, + follows_nonquote_scalar + ); +} + +simdjson_inline error_code json_scanner::finish() { + return string_scanner.finish(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/json_scanner.h */ +/* begin file src/generic/stage1/json_minifier.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +namespace simdjson { +namespace icelake { +namespace { +namespace stage1 { + +class json_minifier { +public: + template + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + +private: + simdjson_inline json_minifier(uint8_t *_dst) + : dst{_dst} + {} + template + simdjson_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block); + simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; +}; + +simdjson_inline void json_minifier::next(const simd::simd8x64& in, const json_block& block) { + uint64_t mask = block.whitespace(); + dst += in.compress(mask, dst); +} + +simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { + error_code error = scanner.finish(); + if (error) { dst_len = 0; return error; } + dst_len = dst - dst_start; + return SUCCESS; +} + +template<> +simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + simd::simd8x64 in_2(block_buf+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); +} + +template<> +simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); +} + +template +error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { + buf_block_reader reader(buf, len); + json_minifier minifier(dst); + + // Index the first n-1 blocks + while (reader.has_full_block()) { + minifier.step(reader.full_block(), reader); + } + + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + size_t remaining_bytes = reader.get_remainder(block); + if (remaining_bytes > 0) { + // We do not want to write directly to the output stream. Rather, we write + // to a local buffer (for safety). 
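+      // Example of why the clamp further down matters: if the buffer ends inside an unclosed
+      // string, the padding spaces produced by get_remainder() count as string content rather
+      // than whitespace, survive compress(), and would push to_write past remaining_bytes.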
+ uint8_t out_block[STEP_SIZE]; + uint8_t * const guarded_dst{minifier.dst}; + minifier.dst = out_block; + minifier.step(block, reader); + size_t to_write = minifier.dst - out_block; + // In some cases, we could be enticed to consider the padded spaces + // as part of the string. This is fine as long as we do not write more + // than we consumed. + if(to_write > remaining_bytes) { to_write = remaining_bytes; } + memcpy(guarded_dst, out_block, to_write); + minifier.dst = guarded_dst + to_write; + } + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/json_minifier.h */ +/* begin file src/generic/stage1/find_next_document_index.h */ +namespace simdjson { +namespace icelake { +namespace { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. 
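+  // Worked example: if the batch is exactly one complete document such as [1,2], the loop
+  // above decrements arr_cnt at the closing ']' but never finds a qualifying boundary pair,
+  // so we land here; counting the opening '[' below restores arr_cnt to 0 and the whole
+  // batch is reported as complete.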
+ switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. + return parser.n_structural_indexes; + } + return 0; +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/find_next_document_index.h */ + +namespace simdjson { +namespace icelake { +namespace { +namespace stage1 { + +class bit_indexer { +public: + uint32_t *tail; + + simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + // + // If the kernel sets SIMDJSON_CUSTOM_BIT_INDEXER, then it will provide its own + // version of the code. +#ifdef SIMDJSON_CUSTOM_BIT_INDEXER + simdjson_inline void write(uint32_t idx, uint64_t bits); +#else + simdjson_inline void write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. 
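+       *
+       * Worked example: for bits == 0b101000 the first unrolled round stores idx+3 and
+       * idx+5 into tail[0] and tail[1]; the remaining iterations of that round still store
+       * values past the last set bit (which is why the index buffer must be oversized), and
+       * tail then advances by cnt == 2 so only the two real indexes are kept.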
+ */ + + int cnt = static_cast(count_ones(bits)); + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + int i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; +#endif + } +#endif // SIMDJSON_CUSTOM_BIT_INDEXER + +}; + +class json_structural_indexer { +public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; + +private: + simdjson_inline json_structural_indexer(uint32_t *structural_indexes); + template + simdjson_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); + simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); + + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; +}; + +simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + +// Skip the last character if it is partial +simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (simdjson_unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} + +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. 
These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { + if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); + } + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) + uint8_t block[STEP_SIZE]; + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } + indexer.step(block, reader); + return indexer.finish(parser, reader.block_index(), len, partial); +} + +template<> +simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} + +simdjson_inline void json_structural_indexer::next(const simd::simd8x64& in, const json_block& block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = is_streaming(partial) ? 
+ ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * The On Demand API requires special padding. + * + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + * + * This is illustrated with the test array_iterate_unclosed_error() on the following input: + * R"({ "a": [,,)" + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial == stage1_mode::streaming_partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } + // We truncate the input to the end of the last complete document (or zero). + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. 
Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } + } + checker.check_eof(); + return checker.errors(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/json_structural_indexer.h */ +// We must not forget to undefine it now: +#undef SIMDJSON_CUSTOM_BIT_INDEXER + +/** + * We provide a custom version of bit_indexer::write using + * naked intrinsics. + * TODO: make this code more elegant. + */ +// Under GCC 12, the intrinsic _mm512_extracti32x4_epi32 may generate 'maybe uninitialized'. +// as a workaround, we disable warnings within the following function. +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS +namespace simdjson { namespace icelake { namespace { namespace stage1 { +simdjson_inline void bit_indexer::write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. 
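+  // How the AVX-512 path below works: _mm512_maskz_compress_epi8 packs the byte values 0..63
+  // whose mask bits are set down to the front of a vector, e.g. set positions {2, 17, 40}
+  // become the leading bytes 2, 17, 40; they are widened to 32 bits, offset by idx, stored
+  // 16 at a time, and this->tail advances by the popcount (3 in that example).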
+ if (bits == 0) { return; } + + const __m512i indexes = _mm512_maskz_compress_epi8(bits, _mm512_set_epi32( + 0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130, + 0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120, + 0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110, + 0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100 + )); + const __m512i start_index = _mm512_set1_epi32(idx); + + const auto count = count_ones(bits); + __m512i t0 = _mm512_cvtepu8_epi32(_mm512_castsi512_si128(indexes)); + _mm512_storeu_si512(this->tail, _mm512_add_epi32(t0, start_index)); + + if(count > 16) { + const __m512i t1 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 1)); + _mm512_storeu_si512(this->tail + 16, _mm512_add_epi32(t1, start_index)); + if(count > 32) { + const __m512i t2 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 2)); + _mm512_storeu_si512(this->tail + 32, _mm512_add_epi32(t2, start_index)); + if(count > 48) { + const __m512i t3 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 3)); + _mm512_storeu_si512(this->tail + 48, _mm512_add_epi32(t3, start_index)); + } + } + } + this->tail += count; +} +}}}} +SIMDJSON_POP_DISABLE_WARNINGS + +/* begin file src/generic/stage1/utf8_validator.h */ +namespace simdjson { +namespace icelake { +namespace { +namespace stage1 { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return c.errors() == error_code::SUCCESS; +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage1/utf8_validator.h */ + +// +// Stage 2 +// +/* begin file src/generic/stage2/stringparsing.h */ +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace icelake { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. + 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. 
+ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + return false; + } + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. + // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + return false; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. 
Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/generic/stage2/tape_builder.h */ +/* begin file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace icelake { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? 
&structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + 
} + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_string(*this, value); + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_number(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // 
namespace stage2 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/tape_writer.h */ +namespace simdjson { +namespace icelake { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. 
*/ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ + +namespace simdjson { +namespace icelake { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. */ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+   */
+  simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
+
+  simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
+
+  simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
+  simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
+
+  /** Called each time a new field or element in an array or object is found. */
+  simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
+
+  /** Next location to write to tape */
+  tape_writer tape;
+private:
+  /** Next write location in the string buf for stage 2 parsing */
+  uint8_t *current_string_buf_loc;
+
+  simdjson_inline tape_builder(dom::document &doc) noexcept;
+
+  simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
+  simdjson_inline void start_container(json_iterator &iter) noexcept;
+  simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
+  simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
+  simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
+  simdjson_inline void on_end_string(uint8_t *dst) noexcept;
+}; // class tape_builder
+
+template<bool STREAMING>
+simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
+    dom_parser_implementation &dom_parser,
+    dom::document &doc) noexcept {
+  dom_parser.doc = &doc;
+  json_iterator iter(dom_parser, STREAMING ?
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst); + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + 
+simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file src/generic/stage2/tape_builder.h */ + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace icelake { +namespace { +namespace stage1 { + +simdjson_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { + if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } + return find_escaped_branchless(backslash); +} + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return icelake::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst) const noexcept { + return icelake::stringparsing::parse_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace icelake +} // namespace simdjson + +/* begin file include/simdjson/icelake/end.h */ +SIMDJSON_UNTARGET_ICELAKE +/* end file include/simdjson/icelake/end.h */ +/* end file src/icelake/dom_parser_implementation.cpp */ +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL +/* begin file src/haswell/implementation.cpp */ +/* begin file include/simdjson/haswell/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "haswell" +// #define SIMDJSON_IMPLEMENTATION haswell +SIMDJSON_TARGET_HASWELL +/* end file include/simdjson/haswell/begin.h */ + +namespace simdjson { +namespace haswell { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +} // namespace haswell +} // namespace simdjson + +/* begin file include/simdjson/haswell/end.h */ +SIMDJSON_UNTARGET_HASWELL +/* end file include/simdjson/haswell/end.h */ + +/* end file src/haswell/implementation.cpp */ +/* begin file src/haswell/dom_parser_implementation.cpp */ +/* begin file include/simdjson/haswell/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "haswell" +// #define SIMDJSON_IMPLEMENTATION haswell +SIMDJSON_TARGET_HASWELL +/* end file include/simdjson/haswell/begin.h */ + +// 
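+// For orientation: dom_parser_implementation::parse above is the whole pipeline.
+// Stage 1 records the byte offset of every structural character (and runs the
+// UTF-8 checker), then stage 2 walks those offsets with json_iterator and
+// tape_builder to emit the tape. Roughly (sketch only, error handling elided):
+//
+//   parser.stage1(buf, len, stage1_mode::regular);
+//   parser.stage2(doc);   // fills the dom::document
+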
+// Stage 1 +// + +namespace simdjson { +namespace haswell { +namespace { + +using namespace simd; + +struct json_character_block { + static simdjson_inline json_character_block classify(const simd::simd8x64& in); + // ASCII white-space ('\r','\n','\t',' ') + simdjson_inline uint64_t whitespace() const noexcept; + // non-quote structural characters (comma, colon, braces, brackets) + simdjson_inline uint64_t op() const noexcept; + // neither a structural character nor a white-space, so letters, numbers and quotes + simdjson_inline uint64_t scalar() const noexcept; + + uint64_t _whitespace; // ASCII white-space ('\r','\n','\t',' ') + uint64_t _op; // structural characters (comma, colon, braces, brackets but not quotes) +}; + +simdjson_inline uint64_t json_character_block::whitespace() const noexcept { return _whitespace; } +simdjson_inline uint64_t json_character_block::op() const noexcept { return _op; } +simdjson_inline uint64_t json_character_block::scalar() const noexcept { return ~(op() | whitespace()); } + +// This identifies structural characters (comma, colon, braces, brackets), +// and ASCII white-space ('\r','\n','\t',' '). +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + const auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + + // The 6 operators (:,[]{}) have these values: + // + // , 2C + // : 3A + // [ 5B + // { 7B + // ] 5D + // } 7D + // + // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique. + // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then + // match it (against | 0x20). + // + // To prevent recognizing other characters, everything else gets compared with 0, which cannot + // match due to the | 0x20. + // + // NOTE: Due to the | 0x20, this ALSO treats and (control characters 0C and 1A) like , + // and :. This gets caught in stage 2, which checks the actual character to ensure the right + // operators are in the right places. + const auto op_table = simd8::repeat_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B + ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D + ); + + // We compute whitespace and op separately. If later code only uses one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). 
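+  // Worked example of the nibble trick: '[' is 0x5B and ']' is 0x5D, so OR'ing
+  // with 0x20 folds them onto '{' (0x7B) and '}' (0x7D). After that fold the low
+  // nibbles of the structural characters are all distinct (',' -> 0xC, ':' -> 0xA,
+  // '{'/'[' -> 0xB, '}'/']' -> 0xD), which is why one 4-bit shuffle through
+  // op_table, compared against (input | 0x20), identifies all six at once.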
+ + const uint64_t whitespace = in.eq({ + _mm256_shuffle_epi8(whitespace_table, in.chunks[0]), + _mm256_shuffle_epi8(whitespace_table, in.chunks[1]) + }); + // Turn [ and ] into { and } + const simd8x64 curlified{ + in.chunks[0] | 0x20, + in.chunks[1] | 0x20 + }; + const uint64_t op = curlified.eq({ + _mm256_shuffle_epi8(op_table, in.chunks[0]), + _mm256_shuffle_epi8(op_table, in.chunks[1]) + }); + + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +/* begin file src/generic/stage1/utf8_lookup4_algorithm.h */ +namespace simdjson { +namespace haswell { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdjson_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
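+    // Worked example: the overlong 2-byte sequence 0xC0 0x80 is rejected because
+    // OVERLONG_2 survives all three lookups in this function: byte_1_high[0xC]
+    // and byte_1_low[0x0] both carry it for the 1100000_ lead byte, and
+    // byte_2_high[0x8] carries it for the 10______ continuation, so the AND of
+    // the three tables is non-zero and the error bit sticks.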
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdjson_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdjson_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ +#if SIMDJSON_IMPLEMENTATION_ICELAKE + static const uint8_t max_array[64] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#else + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#endif + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. 
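+    // check_utf8_bytes compares every byte with the bytes 1, 2 and 3 positions
+    // before it; input.prev<N>(prev_input) shifts the block right by N bytes and
+    // pulls the last N bytes of the previous block in, so multibyte sequences
+    // that straddle a block boundary are still seen in full.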
+ // + simdjson_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdjson_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + + simdjson_inline void check_next_input(const simd8x64& input) { + if(simdjson_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 1) + ||(simd8x64::NUM_CHUNKS == 2) + || (simd8x64::NUM_CHUNKS == 4), + "We support one, two or four chunks per 64-byte block."); + SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 1) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } + // do not forget to call check_eof! + simdjson_inline error_code errors() { + return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS; + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/utf8_lookup4_algorithm.h */ +/* begin file src/generic/stage1/json_structural_indexer.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +/* begin file src/generic/stage1/buf_block_reader.h */ +namespace simdjson { +namespace haswell { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdjson_inline size_t block_index(); + simdjson_inline bool has_full_block() const; + simdjson_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. 
+ * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdjson_inline size_t get_remainder(uint8_t *dst) const; + simdjson_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text_64(const uint8_t *text) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text(const simd8x64& in) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_mask(uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdjson_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdjson_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdjson_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdjson_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdjson_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdjson_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/buf_block_reader.h */ +/* begin file src/generic/stage1/json_string_scanner.h */ +namespace simdjson { +namespace haswell { +namespace { +namespace stage1 { + +struct json_string_block { + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) : + _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {} + + // Escaped characters (characters following an escape() character) + simdjson_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. 
in \\, includes only the first \) + simdjson_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + simdjson_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + simdjson_inline uint64_t string_start() const { return _quote & _in_string; } + // End quotes of strings + simdjson_inline uint64_t string_end() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + simdjson_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + simdjson_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; +}; + +// Scans blocks for string characters, storing the state necessary to do so +class json_string_scanner { +public: + simdjson_inline json_string_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Intended to be defined by the implementation + simdjson_inline uint64_t find_escaped(uint64_t escape); + simdjson_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; +}; + +// +// Finds escaped characters (characters following \). +// +// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). +// +// Does this by: +// - Shift the escape mask to get potentially escaped characters (characters after backslashes). +// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) +// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) +// +// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all +// escape sequences, filters out the ones that start on even bits, and adds that to the mask of +// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since +// the start bit causes a carry), and leaves even-bit sequences alone. 
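+//
+// Equivalent scalar logic, spelled out for reference (illustrative only; no such
+// helper exists in this file):
+//
+//   uint64_t find_escaped_scalar(uint64_t backslash, uint64_t &prev_escaped) {
+//     uint64_t escaped = 0;
+//     bool pending = (prev_escaped != 0);   // did the previous block end in an unescaped backslash?
+//     for (int i = 0; i < 64; i++) {
+//       uint64_t bit = uint64_t(1) << i;
+//       if (pending) { escaped |= bit; pending = false; }    // this char is escaped (even if it is itself a backslash)
+//       else if (backslash & bit) { pending = true; }        // unescaped backslash escapes the next char
+//     }
+//     prev_escaped = pending ? 1ULL : 0ULL;
+//     return escaped;
+//   }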
+// +// Example: +// +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape +// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape +// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later +// invert_mask | | cxxx c xx c| even_seq << 1 +// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit +// escaped | x | x x x x x x x x | +// desired | x | x x x x x x x x | +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// +simdjson_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; +} + +// +// Return a mask of all string characters plus end quotes. +// +// prev_escaped is overflow saying whether the next character is escaped. +// prev_in_string is overflow saying whether we're still in a string. +// +// Backslash sequences outside of quotes will be detected in stage 2. +// +simdjson_inline json_string_block json_string_scanner::next(const simd::simd8x64& in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). + // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_string_block( + backslash, + escaped, + quote, + in_string + ); +} + +simdjson_inline error_code json_string_scanner::finish() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + return SUCCESS; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/json_string_scanner.h */ +/* begin file src/generic/stage1/json_scanner.h */ +namespace simdjson { +namespace haswell { +namespace { +namespace stage1 { + +/** + * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). 
+ * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. + * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural characters. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that is followed by a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. + */ +struct json_block { +public: + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ + simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. not in a string) */ + simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, structural characters ('operators'), scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_nonquote_scalar; +private: + // Potential structurals (i.e. disregarding strings) + + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. + **/ + simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); } + /** + * The start of non-operator runs, like 123, true and "abc". + * It main reside inside a string. + **/ + simdjson_inline uint64_t potential_scalar_start() const noexcept { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. + * The characters following a quote are not included. 
+ */ + simdjson_inline uint64_t follows_potential_scalar() const noexcept { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } +}; + +/** + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ +class json_scanner { +public: + json_scanner() = default; + simdjson_inline json_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or a structural character/'operator'). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; +}; + + +// +// Check if the current character immediately follows a matching character. +// +// For example, this checks for quotes with backslashes in front of them: +// +// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); +// +simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; +} + +simdjson_inline json_block json_scanner::next(const simd::simd8x64& in) { + json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters + json_character_block characters = json_character_block::classify(in); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_block( + strings,// strings is a function-local object so either it moves or the copy is elided. 
+ characters, + follows_nonquote_scalar + ); +} + +simdjson_inline error_code json_scanner::finish() { + return string_scanner.finish(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/json_scanner.h */ +/* begin file src/generic/stage1/json_minifier.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +namespace simdjson { +namespace haswell { +namespace { +namespace stage1 { + +class json_minifier { +public: + template + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + +private: + simdjson_inline json_minifier(uint8_t *_dst) + : dst{_dst} + {} + template + simdjson_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block); + simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; +}; + +simdjson_inline void json_minifier::next(const simd::simd8x64& in, const json_block& block) { + uint64_t mask = block.whitespace(); + dst += in.compress(mask, dst); +} + +simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { + error_code error = scanner.finish(); + if (error) { dst_len = 0; return error; } + dst_len = dst - dst_start; + return SUCCESS; +} + +template<> +simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + simd::simd8x64 in_2(block_buf+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); +} + +template<> +simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); +} + +template +error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { + buf_block_reader reader(buf, len); + json_minifier minifier(dst); + + // Index the first n-1 blocks + while (reader.has_full_block()) { + minifier.step(reader.full_block(), reader); + } + + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + size_t remaining_bytes = reader.get_remainder(block); + if (remaining_bytes > 0) { + // We do not want to write directly to the output stream. Rather, we write + // to a local buffer (for safety). + uint8_t out_block[STEP_SIZE]; + uint8_t * const guarded_dst{minifier.dst}; + minifier.dst = out_block; + minifier.step(block, reader); + size_t to_write = minifier.dst - out_block; + // In some cases, we could be enticed to consider the padded spaces + // as part of the string. This is fine as long as we do not write more + // than we consumed. 
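+    // Worked example: if the remainder is the three bytes `"ab` (an opening
+    // quote with no closing quote), the padding spaces added above land *inside*
+    // that string, so the scanner keeps them and to_write can reach STEP_SIZE;
+    // the clamp below guarantees we never emit bytes that were not in the input.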
+ if(to_write > remaining_bytes) { to_write = remaining_bytes; } + memcpy(guarded_dst, out_block, to_write); + minifier.dst = guarded_dst + to_write; + } + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/json_minifier.h */ +/* begin file src/generic/stage1/find_next_document_index.h */ +namespace simdjson { +namespace haswell { +namespace { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. 
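+    // For instance, a batch that is exactly {"k":1} reaches this point with
+    // arr_cnt and obj_cnt back at zero: the closing '}' was counted down in the
+    // loop and the opening '{' counted back up just above, so the whole batch is
+    // reported as one complete document.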
+ return parser.n_structural_indexes; + } + return 0; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/find_next_document_index.h */ + +namespace simdjson { +namespace haswell { +namespace { +namespace stage1 { + +class bit_indexer { +public: + uint32_t *tail; + + simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + // + // If the kernel sets SIMDJSON_CUSTOM_BIT_INDEXER, then it will provide its own + // version of the code. +#ifdef SIMDJSON_CUSTOM_BIT_INDEXER + simdjson_inline void write(uint32_t idx, uint64_t bits); +#else + simdjson_inline void write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). 
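+    // Small worked example (illustrative): with idx == 640 and set bits at offsets
+    // 0, 3, 11 and 14, cnt == 4; the loop above fills tail[0..7] but only tail[0..3]
+    // are meaningful (640, 643, 651, 654), and tail later advances by cnt, so the
+    // extra writes are harmless as long as the index buffer is large enough.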
+ if (simdjson_unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + int i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; +#endif + } +#endif // SIMDJSON_CUSTOM_BIT_INDEXER + +}; + +class json_structural_indexer { +public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; + +private: + simdjson_inline json_structural_indexer(uint32_t *structural_indexes); + template + simdjson_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); + simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); + + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; +}; + +simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + +// Skip the last character if it is partial +simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (simdjson_unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} + +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. 
Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { + if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); + } + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) + uint8_t block[STEP_SIZE]; + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } + indexer.step(block, reader); + return indexer.finish(parser, reader.block_index(), len, partial); +} + +template<> +simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} + +simdjson_inline void json_structural_indexer::next(const simd::simd8x64& in, const json_block& block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = is_streaming(partial) ? 
+ ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * The On Demand API requires special padding. + * + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + * + * This is illustrated with the test array_iterate_unclosed_error() on the following input: + * R"({ "a": [,,)" + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial == stage1_mode::streaming_partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } + // We truncate the input to the end of the last complete document (or zero). + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. 
Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } + } + checker.check_eof(); + return checker.errors(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/generic/stage1/utf8_validator.h */ +namespace simdjson { +namespace haswell { +namespace { +namespace stage1 { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return c.errors() == error_code::SUCCESS; +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage1/utf8_validator.h */ + +// +// Stage 2 +// +/* begin file src/generic/stage2/stringparsing.h */ +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace haswell { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. 
+ 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + return false; + } + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. + // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + return false; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. 
Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/generic/stage2/tape_builder.h */ +/* begin file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace haswell { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? 
&structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + 
} + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_string(*this, value); + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_number(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // 
namespace stage2 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/tape_writer.h */ +namespace simdjson { +namespace haswell { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. 
*/ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ + +namespace simdjson { +namespace haswell { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. */ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+ */ + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + /** Called each time a new field or element in an array or object is found. */ + simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept; + + /** Next location to write to tape */ + tape_writer tape; +private: + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + simdjson_inline tape_builder(dom::document &doc) noexcept; + + simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept; + simdjson_inline void start_container(json_iterator &iter) noexcept; + simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept; + simdjson_inline void on_end_string(uint8_t *dst) noexcept; +}; // class tape_builder + +template +simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept { + dom_parser.doc = &doc; + json_iterator iter(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst); + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + 
+simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file src/generic/stage2/tape_builder.h */ + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace haswell { +namespace { +namespace stage1 { + +simdjson_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { + if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } + return find_escaped_branchless(backslash); +} + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst) const noexcept { + return haswell::stringparsing::parse_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace haswell +} // namespace simdjson + +/* begin file include/simdjson/haswell/end.h */ +SIMDJSON_UNTARGET_HASWELL +/* end file include/simdjson/haswell/end.h */ +/* end file src/haswell/dom_parser_implementation.cpp */ +#endif +#if SIMDJSON_IMPLEMENTATION_PPC64 +/* begin file src/ppc64/implementation.cpp */ +/* begin file include/simdjson/ppc64/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "ppc64" +// #define SIMDJSON_IMPLEMENTATION ppc64 +/* end file include/simdjson/ppc64/begin.h */ + +namespace simdjson { +namespace ppc64 { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +} // namespace ppc64 +} // namespace simdjson + +/* begin file include/simdjson/ppc64/end.h */ +/* end file include/simdjson/ppc64/end.h */ +/* end file src/ppc64/implementation.cpp */ +/* begin file src/ppc64/dom_parser_implementation.cpp */ +/* begin file include/simdjson/ppc64/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "ppc64" +// #define SIMDJSON_IMPLEMENTATION ppc64 +/* end file include/simdjson/ppc64/begin.h */ + +// +// Stage 1 +// +namespace simdjson { +namespace ppc64 { +namespace { + +using namespace simd; + +struct 
json_character_block {
+  static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
+
+  simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
+  simdjson_inline uint64_t op() const noexcept { return _op; }
+  simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
+
+  uint64_t _whitespace;
+  uint64_t _op;
+};
+
+simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
+  const simd8<uint8_t> table1(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
+  const simd8<uint8_t> table2(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
+
+  simd8x64<uint8_t> v(
+    (in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
+    (in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
+    (in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
+    (in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
+  );
+
+  uint64_t op = simd8x64<bool>(
+    v.chunks[0].any_bits_set(0x7),
+    v.chunks[1].any_bits_set(0x7),
+    v.chunks[2].any_bits_set(0x7),
+    v.chunks[3].any_bits_set(0x7)
+  ).to_bitmask();
+
+  uint64_t whitespace = simd8x64<bool>(
+    v.chunks[0].any_bits_set(0x18),
+    v.chunks[1].any_bits_set(0x18),
+    v.chunks[2].any_bits_set(0x18),
+    v.chunks[3].any_bits_set(0x18)
+  ).to_bitmask();
+
+  return { whitespace, op };
+}
+
+simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+  // careful: 0x80 is not ascii.
+  return input.reduce_or().saturating_sub(0x7fu).bits_not_set_anywhere();
+}
+
+simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+}
+
+simdjson_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+ return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +/* begin file src/generic/stage1/utf8_lookup4_algorithm.h */ +namespace simdjson { +namespace ppc64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdjson_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
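A concrete walk-through (editor's illustration, not upstream text) of how the three nibble lookups in this function combine (byte_1_high above, byte_1_low and byte_2_high below), using the constants defined above:

//   "\xC0\x80" (overlong 2-byte encoding of U+0000):
//     byte_1_high[0xC] = TOO_SHORT | OVERLONG_2
//     byte_1_low [0x0] = CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4
//     byte_2_high[0x8] = TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4
//     AND of the three = OVERLONG_2, which is nonzero, so the pair is flagged as invalid UTF-8.
//   "\xC3\xA9" (a valid 2-byte encoding of U+00E9):
//     byte_1_high[0xC] = TOO_SHORT | OVERLONG_2
//     byte_1_low [0x3] = CARRY
//     byte_2_high[0xA] = TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE
//     AND of the three = 0, so no error bit is raised.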
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdjson_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdjson_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ +#if SIMDJSON_IMPLEMENTATION_ICELAKE + static const uint8_t max_array[64] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#else + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#endif + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdjson_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdjson_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + + simdjson_inline void check_next_input(const simd8x64& input) { + if(simdjson_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 1) + ||(simd8x64::NUM_CHUNKS == 2) + || (simd8x64::NUM_CHUNKS == 4), + "We support one, two or four chunks per 64-byte block."); + SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 1) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } + // do not forget to call check_eof! + simdjson_inline error_code errors() { + return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS; + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/utf8_lookup4_algorithm.h */ +/* begin file src/generic/stage1/json_structural_indexer.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +/* begin file src/generic/stage1/buf_block_reader.h */ +namespace simdjson { +namespace ppc64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdjson_inline size_t block_index(); + simdjson_inline bool has_full_block() const; + simdjson_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. 
+ * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdjson_inline size_t get_remainder(uint8_t *dst) const; + simdjson_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text_64(const uint8_t *text) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text(const simd8x64& in) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_mask(uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdjson_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdjson_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdjson_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdjson_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdjson_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdjson_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/buf_block_reader.h */ +/* begin file src/generic/stage1/json_string_scanner.h */ +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage1 { + +struct json_string_block { + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) : + _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {} + + // Escaped characters (characters following an escape() character) + simdjson_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. 
in \\, includes only the first \) + simdjson_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + simdjson_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + simdjson_inline uint64_t string_start() const { return _quote & _in_string; } + // End quotes of strings + simdjson_inline uint64_t string_end() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + simdjson_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + simdjson_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; +}; + +// Scans blocks for string characters, storing the state necessary to do so +class json_string_scanner { +public: + simdjson_inline json_string_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Intended to be defined by the implementation + simdjson_inline uint64_t find_escaped(uint64_t escape); + simdjson_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; +}; + +// +// Finds escaped characters (characters following \). +// +// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). +// +// Does this by: +// - Shift the escape mask to get potentially escaped characters (characters after backslashes). +// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) +// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) +// +// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all +// escape sequences, filters out the ones that start on even bits, and adds that to the mask of +// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since +// the start bit causes a carry), and leaves even-bit sequences alone. 
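Before the worked bit-pattern example that follows, here is the same scheme transcribed as a standalone scalar function (editor's sketch; the portable add_overflow helper used by the implementation below is replaced here by the GCC/Clang builtin __builtin_add_overflow):

#include <cstdint>

inline uint64_t find_escaped_scalar(uint64_t backslash, uint64_t &prev_escaped) {
    const uint64_t even_bits = 0x5555555555555555ULL;
    backslash &= ~prev_escaped;                            // drop a backslash escaped by the previous block
    uint64_t follows_escape = (backslash << 1) | prev_escaped;
    uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape;
    uint64_t sequences_starting_on_even_bits;
    prev_escaped = __builtin_add_overflow(odd_sequence_starts, backslash,
                                          &sequences_starting_on_even_bits) ? 1 : 0;
    uint64_t invert_mask = sequences_starting_on_even_bits << 1; // the escaped bits, not the escapes
    return (even_bits ^ invert_mask) & follows_escape;
}

Running it over the backslash mask of the sample text in the table below should reproduce the 'escaped' and 'desired' rows.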
+// +// Example: +// +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape +// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape +// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later +// invert_mask | | cxxx c xx c| even_seq << 1 +// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit +// escaped | x | x x x x x x x x | +// desired | x | x x x x x x x x | +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// +simdjson_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; +} + +// +// Return a mask of all string characters plus end quotes. +// +// prev_escaped is overflow saying whether the next character is escaped. +// prev_in_string is overflow saying whether we're still in a string. +// +// Backslash sequences outside of quotes will be detected in stage 2. +// +simdjson_inline json_string_block json_string_scanner::next(const simd::simd8x64& in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). + // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_string_block( + backslash, + escaped, + quote, + in_string + ); +} + +simdjson_inline error_code json_string_scanner::finish() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + return SUCCESS; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/json_string_scanner.h */ +/* begin file src/generic/stage1/json_scanner.h */ +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage1 { + +/** + * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). 
+ * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. + * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural characters. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that is followed by a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. + */ +struct json_block { +public: + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ + simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. not in a string) */ + simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, structural characters ('operators'), scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_nonquote_scalar; +private: + // Potential structurals (i.e. disregarding strings) + + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. + **/ + simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); } + /** + * The start of non-operator runs, like 123, true and "abc". + * It main reside inside a string. + **/ + simdjson_inline uint64_t potential_scalar_start() const noexcept { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. + * The characters following a quote are not included. 
+ */ + simdjson_inline uint64_t follows_potential_scalar() const noexcept { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } +}; + +/** + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ +class json_scanner { +public: + json_scanner() = default; + simdjson_inline json_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or a structural character/'operator'). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; +}; + + +// +// Check if the current character immediately follows a matching character. +// +// For example, this checks for quotes with backslashes in front of them: +// +// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); +// +simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; +} + +simdjson_inline json_block json_scanner::next(const simd::simd8x64& in) { + json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters + json_character_block characters = json_character_block::classify(in); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_block( + strings,// strings is a function-local object so either it moves or the copy is elided. 
+ characters, + follows_nonquote_scalar + ); +} + +simdjson_inline error_code json_scanner::finish() { + return string_scanner.finish(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/json_scanner.h */ +/* begin file src/generic/stage1/json_minifier.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage1 { + +class json_minifier { +public: + template + static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept; + +private: + simdjson_inline json_minifier(uint8_t *_dst) + : dst{_dst} + {} + template + simdjson_inline void step(const uint8_t *block_buf, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block); + simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len); + json_scanner scanner{}; + uint8_t *dst; +}; + +simdjson_inline void json_minifier::next(const simd::simd8x64& in, const json_block& block) { + uint64_t mask = block.whitespace(); + dst += in.compress(mask, dst); +} + +simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) { + error_code error = scanner.finish(); + if (error) { dst_len = 0; return error; } + dst_len = dst - dst_start; + return SUCCESS; +} + +template<> +simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + simd::simd8x64 in_2(block_buf+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1); + this->next(in_2, block_2); + reader.advance(); +} + +template<> +simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block_buf); + json_block block_1 = scanner.next(in_1); + this->next(block_buf, block_1); + reader.advance(); +} + +template +error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { + buf_block_reader reader(buf, len); + json_minifier minifier(dst); + + // Index the first n-1 blocks + while (reader.has_full_block()) { + minifier.step(reader.full_block(), reader); + } + + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + size_t remaining_bytes = reader.get_remainder(block); + if (remaining_bytes > 0) { + // We do not want to write directly to the output stream. Rather, we write + // to a local buffer (for safety). + uint8_t out_block[STEP_SIZE]; + uint8_t * const guarded_dst{minifier.dst}; + minifier.dst = out_block; + minifier.step(block, reader); + size_t to_write = minifier.dst - out_block; + // In some cases, we could be enticed to consider the padded spaces + // as part of the string. This is fine as long as we do not write more + // than we consumed. 
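For context (editor's note, not part of the upstream file), the minifier defined here is normally reached through the public simdjson::minify entry point; a minimal usage sketch, assuming the vendored simdjson.h header is on the include path:

#include "simdjson.h"
#include <cstdio>
#include <memory>
#include <string>

int minify_demo() {
    std::string json = R"({ "x" : 1 , "y" : [ 1, 2, 3 ] })";
    std::unique_ptr<char[]> out(new char[json.size()]); // minified output is never longer than the input
    size_t out_len = 0;
    auto error = simdjson::minify(json.data(), json.size(), out.get(), out_len);
    if (error) { return 1; }
    std::printf("%.*s\n", int(out_len), out.get());     // {"x":1,"y":[1,2,3]}
    return 0;
}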
+ if(to_write > remaining_bytes) { to_write = remaining_bytes; } + memcpy(guarded_dst, out_block, to_write); + minifier.dst = guarded_dst + to_write; + } + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/json_minifier.h */ +/* begin file src/generic/stage1/find_next_document_index.h */ +namespace simdjson { +namespace ppc64 { +namespace { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. 
+ return parser.n_structural_indexes; + } + return 0; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/find_next_document_index.h */ + +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage1 { + +class bit_indexer { +public: + uint32_t *tail; + + simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + // + // If the kernel sets SIMDJSON_CUSTOM_BIT_INDEXER, then it will provide its own + // version of the code. +#ifdef SIMDJSON_CUSTOM_BIT_INDEXER + simdjson_inline void write(uint32_t idx, uint64_t bits); +#else + simdjson_inline void write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). 
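As an editor's aside, the essence of both branches of write() is the scalar loop below, which peels set bits off one at a time; the builtins are the GCC/Clang spellings of trailing_zeroes and clear_lowest_bit and are an assumption of this sketch, not what the kernel itself calls:

#include <cstdint>

inline uint32_t *flatten_bits_scalar(uint32_t *tail, uint32_t idx, uint64_t bits) {
    while (bits != 0) {
        *tail++ = idx + uint32_t(__builtin_ctzll(bits)); // index of the lowest set bit
        bits &= bits - 1;                                // clear that bit and continue
    }
    return tail;
}

The unrolled groups of 8 in the real code exist to avoid a hard-to-predict loop branch in the common case of only a few structurals per 64-byte block.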
+ if (simdjson_unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + int i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; +#endif + } +#endif // SIMDJSON_CUSTOM_BIT_INDEXER + +}; + +class json_structural_indexer { +public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; + +private: + simdjson_inline json_structural_indexer(uint32_t *structural_indexes); + template + simdjson_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); + simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); + + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; +}; + +simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + +// Skip the last character if it is partial +simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (simdjson_unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} + +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. 
Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { + if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); + } + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) + uint8_t block[STEP_SIZE]; + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } + indexer.step(block, reader); + return indexer.finish(parser, reader.block_index(), len, partial); +} + +template<> +simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} + +simdjson_inline void json_structural_indexer::next(const simd::simd8x64& in, const json_block& block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = is_streaming(partial) ? 
+ ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * The On Demand API requires special padding. + * + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + * + * This is illustrated with the test array_iterate_unclosed_error() on the following input: + * R"({ "a": [,,)" + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial == stage1_mode::streaming_partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } + // We truncate the input to the end of the last complete document (or zero). + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. 
Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } + } + checker.check_eof(); + return checker.errors(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/generic/stage1/utf8_validator.h */ +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage1 { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return c.errors() == error_code::SUCCESS; +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage1/utf8_validator.h */ + +// +// Stage 2 +// +/* begin file src/generic/stage2/stringparsing.h */ +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace ppc64 { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. 
+ 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + return false; + } + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. + // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + return false; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. 
Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/generic/stage2/tape_builder.h */ +/* begin file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace ppc64 { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? 
&structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + 
} + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_string(*this, value); + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_number(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // 
namespace stage2 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/tape_writer.h */ +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. 
*/ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ + +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. */ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+ */ + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + /** Called each time a new field or element in an array or object is found. */ + simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept; + + /** Next location to write to tape */ + tape_writer tape; +private: + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + simdjson_inline tape_builder(dom::document &doc) noexcept; + + simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept; + simdjson_inline void start_container(json_iterator &iter) noexcept; + simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept; + simdjson_inline void on_end_string(uint8_t *dst) noexcept; +}; // class tape_builder + +template +simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept { + dom_parser.doc = &doc; + json_iterator iter(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst); + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + 
+simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file src/generic/stage2/tape_builder.h */ + +// +// Implementation-specific overrides +// +namespace simdjson { +namespace ppc64 { +namespace { +namespace stage1 { + +simdjson_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { + // On PPC, we don't short-circuit this if there are no backslashes, because the branch gives us no + // benefit and therefore makes things worse. 
+ // if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } + return find_escaped_branchless(backslash); +} + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return ppc64::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst) const noexcept { + return ppc64::stringparsing::parse_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace ppc64 +} // namespace simdjson + +/* begin file include/simdjson/ppc64/end.h */ +/* end file include/simdjson/ppc64/end.h */ +/* end file src/ppc64/dom_parser_implementation.cpp */ +#endif +#if SIMDJSON_IMPLEMENTATION_WESTMERE +/* begin file src/westmere/implementation.cpp */ +/* begin file include/simdjson/westmere/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "westmere" +// #define SIMDJSON_IMPLEMENTATION westmere +SIMDJSON_TARGET_WESTMERE +/* end file include/simdjson/westmere/begin.h */ + +namespace simdjson { +namespace westmere { + +simdjson_warn_unused error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + if (auto err = dst->set_capacity(capacity)) + return err; + if (auto err = dst->set_max_depth(max_depth)) + return err; + return SUCCESS; +} + +} // namespace westmere +} // namespace simdjson + +/* begin file include/simdjson/westmere/end.h */ +SIMDJSON_UNTARGET_WESTMERE +/* end file include/simdjson/westmere/end.h */ +/* end file src/westmere/implementation.cpp */ +/* begin file src/westmere/dom_parser_implementation.cpp */ +/* begin file include/simdjson/westmere/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "westmere" +// #define SIMDJSON_IMPLEMENTATION westmere +SIMDJSON_TARGET_WESTMERE +/* end file include/simdjson/westmere/begin.h */ + +// +// Stage 1 +// + +namespace simdjson { +namespace westmere { +namespace { + +using namespace simd; + +struct json_character_block { + static simdjson_inline json_character_block classify(const simd::simd8x64& in); + + simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; } + simdjson_inline uint64_t op() const noexcept { return _op; } + simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); } + + uint64_t 
_whitespace; + uint64_t _op; +}; + +simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64& in) { + // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why + // we can't use the generic lookup_16. + auto whitespace_table = simd8::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100); + + // The 6 operators (:,[]{}) have these values: + // + // , 2C + // : 3A + // [ 5B + // { 7B + // ] 5D + // } 7D + // + // If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique. + // We exploit this, using a simd 4-bit lookup to tell us which character match against, and then + // match it (against | 0x20). + // + // To prevent recognizing other characters, everything else gets compared with 0, which cannot + // match due to the | 0x20. + // + // NOTE: Due to the | 0x20, this ALSO treats and (control characters 0C and 1A) like , + // and :. This gets caught in stage 2, which checks the actual character to ensure the right + // operators are in the right places. + const auto op_table = simd8::repeat_16( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B + ',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D + ); + + // We compute whitespace and op separately. If the code later only use one or the + // other, given the fact that all functions are aggressively inlined, we can + // hope that useless computations will be omitted. This is namely case when + // minifying (we only need whitespace). + + + const uint64_t whitespace = in.eq({ + _mm_shuffle_epi8(whitespace_table, in.chunks[0]), + _mm_shuffle_epi8(whitespace_table, in.chunks[1]), + _mm_shuffle_epi8(whitespace_table, in.chunks[2]), + _mm_shuffle_epi8(whitespace_table, in.chunks[3]) + }); + // Turn [ and ] into { and } + const simd8x64 curlified{ + in.chunks[0] | 0x20, + in.chunks[1] | 0x20, + in.chunks[2] | 0x20, + in.chunks[3] | 0x20 + }; + const uint64_t op = curlified.eq({ + _mm_shuffle_epi8(op_table, in.chunks[0]), + _mm_shuffle_epi8(op_table, in.chunks[1]), + _mm_shuffle_epi8(op_table, in.chunks[2]), + _mm_shuffle_epi8(op_table, in.chunks[3]) + }); + return { whitespace, op }; +} + +simdjson_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdjson_unused simdjson_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdjson_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. 
+ return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +/* begin file src/generic/stage1/utf8_lookup4_algorithm.h */ +namespace simdjson { +namespace westmere { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdjson_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdjson_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdjson_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ +#if SIMDJSON_IMPLEMENTATION_ICELAKE + static const uint8_t max_array[64] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#else + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1 + }; +#endif + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdjson_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. + simdjson_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + + simdjson_inline void check_next_input(const simd8x64& input) { + if(simdjson_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 1) + ||(simd8x64::NUM_CHUNKS == 2) + || (simd8x64::NUM_CHUNKS == 4), + "We support one, two or four chunks per 64-byte block."); + SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 1) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else SIMDJSON_IF_CONSTEXPR (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + } + } + // do not forget to call check_eof! + simdjson_inline error_code errors() { + return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS; + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/utf8_lookup4_algorithm.h */ +/* begin file src/generic/stage1/json_structural_indexer.h */ +// This file contains the common code every implementation uses in stage1 +// It is intended to be included multiple times and compiled multiple times +// We assume the file in which it is included already includes +// "simdjson/stage1.h" (this simplifies amalgation) + +/* begin file src/generic/stage1/buf_block_reader.h */ +namespace simdjson { +namespace westmere { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdjson_inline size_t block_index(); + simdjson_inline bool has_full_block() const; + simdjson_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. 
+ * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdjson_inline size_t get_remainder(uint8_t *dst) const; + simdjson_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text_64(const uint8_t *text) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdjson_unused static char * format_input_text(const simd8x64& in) { + static char buf[sizeof(simd8x64) + 1]; + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdjson_unused static char * format_mask(uint64_t mask) { + static char buf[sizeof(simd8x64) + 1]; + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdjson_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdjson_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdjson_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdjson_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdjson_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdjson_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/buf_block_reader.h */ +/* begin file src/generic/stage1/json_string_scanner.h */ +namespace simdjson { +namespace westmere { +namespace { +namespace stage1 { + +struct json_string_block { + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_string_block(uint64_t backslash, uint64_t escaped, uint64_t quote, uint64_t in_string) : + _backslash(backslash), _escaped(escaped), _quote(quote), _in_string(in_string) {} + + // Escaped characters (characters following an escape() character) + simdjson_inline uint64_t escaped() const { return _escaped; } + // Escape characters (backslashes that are not escaped--i.e. 
in \\, includes only the first \) + simdjson_inline uint64_t escape() const { return _backslash & ~_escaped; } + // Real (non-backslashed) quotes + simdjson_inline uint64_t quote() const { return _quote; } + // Start quotes of strings + simdjson_inline uint64_t string_start() const { return _quote & _in_string; } + // End quotes of strings + simdjson_inline uint64_t string_end() const { return _quote & ~_in_string; } + // Only characters inside the string (not including the quotes) + simdjson_inline uint64_t string_content() const { return _in_string & ~_quote; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; } + // Return a mask of whether the given characters are inside a string (only works on non-quotes) + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; } + // Tail of string (everything except the start quote) + simdjson_inline uint64_t string_tail() const { return _in_string ^ _quote; } + + // backslash characters + uint64_t _backslash; + // escaped characters (backslashed--does not include the hex characters after \u) + uint64_t _escaped; + // real quotes (non-backslashed ones) + uint64_t _quote; + // string characters (includes start quote but not end quote) + uint64_t _in_string; +}; + +// Scans blocks for string characters, storing the state necessary to do so +class json_string_scanner { +public: + simdjson_inline json_string_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Intended to be defined by the implementation + simdjson_inline uint64_t find_escaped(uint64_t escape); + simdjson_inline uint64_t find_escaped_branchless(uint64_t escape); + + // Whether the last iteration was still inside a string (all 1's = true, all 0's = false). + uint64_t prev_in_string = 0ULL; + // Whether the first character of the next iteration is escaped. + uint64_t prev_escaped = 0ULL; +}; + +// +// Finds escaped characters (characters following \). +// +// Handles runs of backslashes like \\\" and \\\\" correctly (yielding 0101 and 01010, respectively). +// +// Does this by: +// - Shift the escape mask to get potentially escaped characters (characters after backslashes). +// - Mask escaped sequences that start on *even* bits with 1010101010 (odd bits are escaped, even bits are not) +// - Mask escaped sequences that start on *odd* bits with 0101010101 (even bits are escaped, odd bits are not) +// +// To distinguish between escaped sequences starting on even/odd bits, it finds the start of all +// escape sequences, filters out the ones that start on even bits, and adds that to the mask of +// escape sequences. This causes the addition to clear out the sequences starting on odd bits (since +// the start bit causes a carry), and leaves even-bit sequences alone. 
+// +// Example: +// +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// escape | xxx | xx xxx xxx xx xx | Removed overflow backslash; will | it into follows_escape +// odd_starts | x | x x x | escape & ~even_bits & ~follows_escape +// even_seq | c| cxxx c xx c | c = carry bit -- will be masked out later +// invert_mask | | cxxx c xx c| even_seq << 1 +// follows_escape | xx | x xx xxx xxx xx xx | Includes overflow bit +// escaped | x | x x x x x x x x | +// desired | x | x x x x x x x x | +// text | \\\ | \\\"\\\" \\\" \\"\\" | +// +simdjson_inline uint64_t json_string_scanner::find_escaped_branchless(uint64_t backslash) { + // If there was overflow, pretend the first character isn't a backslash + backslash &= ~prev_escaped; + uint64_t follows_escape = backslash << 1 | prev_escaped; + + // Get sequences starting on even bits by clearing out the odd series using + + const uint64_t even_bits = 0x5555555555555555ULL; + uint64_t odd_sequence_starts = backslash & ~even_bits & ~follows_escape; + uint64_t sequences_starting_on_even_bits; + prev_escaped = add_overflow(odd_sequence_starts, backslash, &sequences_starting_on_even_bits); + uint64_t invert_mask = sequences_starting_on_even_bits << 1; // The mask we want to return is the *escaped* bits, not escapes. + + // Mask every other backslashed character as an escaped character + // Flip the mask for sequences that start on even bits, to correct them + return (even_bits ^ invert_mask) & follows_escape; +} + +// +// Return a mask of all string characters plus end quotes. +// +// prev_escaped is overflow saying whether the next character is escaped. +// prev_in_string is overflow saying whether we're still in a string. +// +// Backslash sequences outside of quotes will be detected in stage 2. +// +simdjson_inline json_string_block json_string_scanner::next(const simd::simd8x64& in) { + const uint64_t backslash = in.eq('\\'); + const uint64_t escaped = find_escaped(backslash); + const uint64_t quote = in.eq('"') & ~escaped; + + // + // prefix_xor flips on bits inside the string (and flips off the end quote). + // + // Then we xor with prev_in_string: if we were in a string already, its effect is flipped + // (characters inside strings are outside, and characters outside strings are inside). + // + const uint64_t in_string = prefix_xor(quote) ^ prev_in_string; + + // + // Check if we're still in a string at the end of the box so the next block will know + // + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, John Regher from Utah U. says this is fine code + // + prev_in_string = uint64_t(static_cast(in_string) >> 63); + + // Use ^ to turn the beginning quote off, and the end quote on. + + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_string_block( + backslash, + escaped, + quote, + in_string + ); +} + +simdjson_inline error_code json_string_scanner::finish() { + if (prev_in_string) { + return UNCLOSED_STRING; + } + return SUCCESS; +} + +} // namespace stage1 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/json_string_scanner.h */ +/* begin file src/generic/stage1/json_scanner.h */ +namespace simdjson { +namespace westmere { +namespace { +namespace stage1 { + +/** + * A block of scanned json, with information on operators and scalars. + * + * We seek to identify pseudo-structural characters. 
Anything that is inside + * a string must be omitted (hence & ~_string.string_tail()). + * Otherwise, pseudo-structural characters come in two forms. + * 1. We have the structural characters ([,],{,},:, comma). The + * term 'structural character' is from the JSON RFC. + * 2. We have the 'scalar pseudo-structural characters'. + * Scalars are quotes, and any character except structural characters and white space. + * + * To identify the scalar pseudo-structural characters, we must look at what comes + * before them: it must be a space, a quote or a structural characters. + * Starting with simdjson v0.3, we identify them by + * negation: we identify everything that is followed by a non-quote scalar, + * and we negate that. Whatever remains must be a 'scalar pseudo-structural character'. + */ +struct json_block { +public: + // We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017 + simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) : + _string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {} + + /** + * The start of structurals. + * In simdjson prior to v0.3, these were called the pseudo-structural characters. + **/ + simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); } + /** All JSON whitespace (i.e. not in a string) */ + simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); } + + // Helpers + + /** Whether the given characters are inside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); } + /** Whether the given characters are outside a string (only works on non-quotes) */ + simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); } + + // string and escape characters + json_string_block _string; + // whitespace, structural characters ('operators'), scalars + json_character_block _characters; + // whether the previous character was a scalar + uint64_t _follows_potential_nonquote_scalar; +private: + // Potential structurals (i.e. disregarding strings) + + /** + * structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc". + * They may reside inside a string. + **/ + simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); } + /** + * The start of non-operator runs, like 123, true and "abc". + * It main reside inside a string. + **/ + simdjson_inline uint64_t potential_scalar_start() const noexcept { + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space + // then we know that it is irrelevant structurally. + return _characters.scalar() & ~follows_potential_scalar(); + } + /** + * Whether the given character is immediately after a non-operator like 123, true. 
+ * The characters following a quote are not included. + */ + simdjson_inline uint64_t follows_potential_scalar() const noexcept { + // _follows_potential_nonquote_scalar: is defined as marking any character that follows a character + // that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a + // white space. + // It is understood that within quoted region, anything at all could be marked (irrelevant). + return _follows_potential_nonquote_scalar; + } +}; + +/** + * Scans JSON for important bits: structural characters or 'operators', strings, and scalars. + * + * The scanner starts by calculating two distinct things: + * - string characters (taking \" into account) + * - structural characters or 'operators' ([]{},:, comma) + * and scalars (runs of non-operators like 123, true and "abc") + * + * To minimize data dependency (a key component of the scanner's speed), it finds these in parallel: + * in particular, the operator/scalar bit will find plenty of things that are actually part of + * strings. When we're done, json_block will fuse the two together by masking out tokens that are + * part of a string. + */ +class json_scanner { +public: + json_scanner() = default; + simdjson_inline json_block next(const simd::simd8x64& in); + // Returns either UNCLOSED_STRING or SUCCESS + simdjson_inline error_code finish(); + +private: + // Whether the last character of the previous iteration is part of a scalar token + // (anything except whitespace or a structural character/'operator'). + uint64_t prev_scalar = 0ULL; + json_string_scanner string_scanner{}; +}; + + +// +// Check if the current character immediately follows a matching character. +// +// For example, this checks for quotes with backslashes in front of them: +// +// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash); +// +simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) { + const uint64_t result = match << 1 | overflow; + overflow = match >> 63; + return result; +} + +simdjson_inline json_block json_scanner::next(const simd::simd8x64& in) { + json_string_block strings = string_scanner.next(in); + // identifies the white-space and the structural characters + json_character_block characters = json_character_block::classify(in); + // The term "scalar" refers to anything except structural characters and white space + // (so letters, numbers, quotes). + // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers). + // + // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon) + // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential + // pseudo-structural character just like we would if we had ' "a string" true '; otherwise we + // may need to add an extra check when parsing strings. + // + // Performance: there are many ways to skin this cat. + const uint64_t nonquote_scalar = characters.scalar() & ~strings.quote(); + uint64_t follows_nonquote_scalar = follows(nonquote_scalar, prev_scalar); + // We are returning a function-local object so either we get a move constructor + // or we get copy elision. + return json_block( + strings,// strings is a function-local object so either it moves or the copy is elided. 
+    characters,
+    follows_nonquote_scalar
+  );
+}
+
+simdjson_inline error_code json_scanner::finish() {
+  return string_scanner.finish();
+}
+
+} // namespace stage1
+} // unnamed namespace
+} // namespace westmere
+} // namespace simdjson
+/* end file src/generic/stage1/json_scanner.h */
+/* begin file src/generic/stage1/json_minifier.h */
+// This file contains the common code every implementation uses in stage1
+// It is intended to be included multiple times and compiled multiple times
+// We assume the file in which it is included already includes
+// "simdjson/stage1.h" (this simplifies amalgamation)
+
+namespace simdjson {
+namespace westmere {
+namespace {
+namespace stage1 {
+
+class json_minifier {
+public:
+  template<size_t STEP_SIZE>
+  static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
+
+private:
+  simdjson_inline json_minifier(uint8_t *_dst)
+  : dst{_dst}
+  {}
+  template<size_t STEP_SIZE>
+  simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
+  simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
+  simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
+  json_scanner scanner{};
+  uint8_t *dst;
+};
+
+simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
+  uint64_t mask = block.whitespace();
+  dst += in.compress(mask, dst);
+}
+
+simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
+  error_code error = scanner.finish();
+  if (error) { dst_len = 0; return error; }
+  dst_len = dst - dst_start;
+  return SUCCESS;
+}
+
+template<>
+simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
+  simd::simd8x64<uint8_t> in_1(block_buf);
+  simd::simd8x64<uint8_t> in_2(block_buf+64);
+  json_block block_1 = scanner.next(in_1);
+  json_block block_2 = scanner.next(in_2);
+  this->next(in_1, block_1);
+  this->next(in_2, block_2);
+  reader.advance();
+}
+
+template<>
+simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
+  simd::simd8x64<uint8_t> in_1(block_buf);
+  json_block block_1 = scanner.next(in_1);
+  this->next(block_buf, block_1);
+  reader.advance();
+}
+
+template<size_t STEP_SIZE>
+error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
+  buf_block_reader<STEP_SIZE> reader(buf, len);
+  json_minifier minifier(dst);
+
+  // Index the first n-1 blocks
+  while (reader.has_full_block()) {
+    minifier.step<STEP_SIZE>(reader.full_block(), reader);
+  }
+
+  // Index the last (remainder) block, padded with spaces
+  uint8_t block[STEP_SIZE];
+  size_t remaining_bytes = reader.get_remainder(block);
+  if (remaining_bytes > 0) {
+    // We do not want to write directly to the output stream. Rather, we write
+    // to a local buffer (for safety).
+    uint8_t out_block[STEP_SIZE];
+    uint8_t * const guarded_dst{minifier.dst};
+    minifier.dst = out_block;
+    minifier.step<STEP_SIZE>(block, reader);
+    size_t to_write = minifier.dst - out_block;
+    // In some cases, we could be enticed to consider the padded spaces
+    // as part of the string. This is fine as long as we do not write more
+    // than we consumed.
+ if(to_write > remaining_bytes) { to_write = remaining_bytes; } + memcpy(guarded_dst, out_block, to_write); + minifier.dst = guarded_dst + to_write; + } + return minifier.finish(dst, dst_len); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/json_minifier.h */ +/* begin file src/generic/stage1/find_next_document_index.h */ +namespace simdjson { +namespace westmere { +namespace { + +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ':' ',' + * and when the second element is NOT one of these characters: '}' ']' ':' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) { + // Variant: do not count separately, just figure out depth + if(parser.n_structural_indexes == 0) { return 0; } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + // If we made it to the end, we want to finish counting to see if we have a full document. + switch (parser.buf[parser.structural_indexes[0]]) { + case '}': + obj_cnt--; + break; + case ']': + arr_cnt--; + break; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + if (!arr_cnt && !obj_cnt) { + // We have a complete document. 
+ return parser.n_structural_indexes; + } + return 0; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/find_next_document_index.h */ + +namespace simdjson { +namespace westmere { +namespace { +namespace stage1 { + +class bit_indexer { +public: + uint32_t *tail; + + simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {} + + // flatten out values in 'bits' assuming that they are are to have values of idx + // plus their position in the bitvector, and store these indexes at + // base_ptr[base] incrementing base as we go + // will potentially store extra values beyond end of valid bits, so base_ptr + // needs to be large enough to handle this + // + // If the kernel sets SIMDJSON_CUSTOM_BIT_INDEXER, then it will provide its own + // version of the code. +#ifdef SIMDJSON_CUSTOM_BIT_INDEXER + simdjson_inline void write(uint32_t idx, uint64_t bits); +#else + simdjson_inline void write(uint32_t idx, uint64_t bits) { + // In some instances, the next branch is expensive because it is mispredicted. + // Unfortunately, in other cases, + // it helps tremendously. + if (bits == 0) + return; +#if defined(SIMDJSON_PREFER_REVERSE_BITS) + /** + * ARM lacks a fast trailing zero instruction, but it has a fast + * bit reversal instruction and a fast leading zero instruction. + * Thus it may be profitable to reverse the bits (once) and then + * to rely on a sequence of instructions that call the leading + * zero instruction. + * + * Performance notes: + * The chosen routine is not optimal in terms of data dependency + * since zero_leading_bit might require two instructions. However, + * it tends to minimize the total number of instructions which is + * beneficial. + */ + + uint64_t rev_bits = reverse_bits(bits); + int cnt = static_cast(count_ones(bits)); + int i = 0; + // Do the first 8 all together + for (; i<8; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). + if (simdjson_unlikely(cnt > 8)) { + i = 8; + for (; i<16; i++) { + int lz = leading_zeroes(rev_bits); + this->tail[i] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + i = 16; + while (rev_bits != 0) { + int lz = leading_zeroes(rev_bits); + this->tail[i++] = static_cast(idx) + lz; + rev_bits = zero_leading_bit(rev_bits, lz); + } + } + } + this->tail += cnt; +#else // SIMDJSON_PREFER_REVERSE_BITS + /** + * Under recent x64 systems, we often have both a fast trailing zero + * instruction and a fast 'clear-lower-bit' instruction so the following + * algorithm can be competitive. + */ + + int cnt = static_cast(count_ones(bits)); + // Do the first 8 all together + for (int i=0; i<8; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Do the next 8 all together (we hope in most cases it won't happen at all + // and the branch is easily predicted). 
+ if (simdjson_unlikely(cnt > 8)) { + for (int i=8; i<16; i++) { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + } + + // Most files don't have 16+ structurals per block, so we take several basically guaranteed + // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :) + // or the start of a value ("abc" true 123) every four characters. + if (simdjson_unlikely(cnt > 16)) { + int i = 16; + do { + this->tail[i] = idx + trailing_zeroes(bits); + bits = clear_lowest_bit(bits); + i++; + } while (i < cnt); + } + } + + this->tail += cnt; +#endif + } +#endif // SIMDJSON_CUSTOM_BIT_INDEXER + +}; + +class json_structural_indexer { +public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ + template + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept; + +private: + simdjson_inline json_structural_indexer(uint32_t *structural_indexes); + template + simdjson_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; + simdjson_inline void next(const simd::simd8x64& in, const json_block& block, size_t idx); + simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial); + + json_scanner scanner{}; + utf8_checker checker{}; + bit_indexer indexer; + uint64_t prev_structurals = 0; + uint64_t unescaped_chars_error = 0; +}; + +simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} + +// Skip the last character if it is partial +simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (simdjson_unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} + +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. 
Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept { + if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; } + // We guard the rest of the code so that we can assume that len > 0 throughout. + if (len == 0) { return EMPTY; } + if (is_streaming(partial)) { + len = trim_partial_utf8(buf, len); + // If you end up with an empty window after trimming + // the partial UTF-8 bytes, then chances are good that you + // have an UTF-8 formatting error. + if(len == 0) { return UTF8_ERROR; } + } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); + } + // Take care of the last block (will always be there unless file is empty which is + // not supposed to happen.) + uint8_t block[STEP_SIZE]; + if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; } + indexer.step(block, reader); + return indexer.finish(parser, reader.block_index(), len, partial); +} + +template<> +simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} + +simdjson_inline void json_structural_indexer::next(const simd::simd8x64& in, const json_block& block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + error_code error = scanner.finish(); + // We deliberately break down the next expression so that it is + // human readable. + const bool should_we_exit = is_streaming(partial) ? 
+ ((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING + : (error != SUCCESS); // if partial is false, we must have SUCCESS + const bool have_unclosed_string = (error == UNCLOSED_STRING); + if (simdjson_unlikely(should_we_exit)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * The On Demand API requires special padding. + * + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + * + * This is illustrated with the test array_iterate_unclosed_error() on the following input: + * R"({ "a": [,,)" + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial == stage1_mode::streaming_partial) { + // If we have an unclosed string, then the last structural + // will be the quote and we want to make sure to omit it. + if(have_unclosed_string) { + parser.n_structural_indexes--; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; } + } + // We truncate the input to the end of the last complete document (or zero). + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + if(parser.structural_indexes[0] == 0) { + // If the buffer is partial and we started at index 0 but the document is + // incomplete, it's too big to parse. + return CAPACITY; + } else { + // It is possible that the document could be parsed, we just had a lot + // of white space. + parser.n_structural_indexes = 0; + return EMPTY; + } + } + + parser.n_structural_indexes = new_structural_indexes; + } else if (partial == stage1_mode::streaming_final) { + if(have_unclosed_string) { parser.n_structural_indexes--; } + // We truncate the input to the end of the last complete document (or zero). + // Because partial == stage1_mode::streaming_final, it means that we may + // silently ignore trailing garbage. 
Though it sounds bad, we do it + // deliberately because many people who have streams of JSON documents + // will truncate them for processing. E.g., imagine that you are uncompressing + // the data from a size file or receiving it in chunks from the network. You + // may not know where exactly the last document will be. Meanwhile the + // document_stream instances allow people to know the JSON documents they are + // parsing (see the iterator.source() method). + parser.n_structural_indexes = find_next_document_index(parser); + // We store the initial n_structural_indexes so that the client can see + // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes, + // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len, + // otherwise, it will copy some prior index. + parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes]; + // This next line is critical, do not change it unless you understand what you are + // doing. + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { + // We tolerate an unclosed string at the very end of the stream. Indeed, users + // often load their data in bulk without being careful and they want us to ignore + // the trailing garbage. + return EMPTY; + } + } + checker.check_eof(); + return checker.errors(); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/generic/stage1/utf8_validator.h */ +namespace simdjson { +namespace westmere { +namespace { +namespace stage1 { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return c.errors() == error_code::SUCCESS; +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace stage1 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage1/utf8_validator.h */ + +// +// Stage 2 +// +/* begin file src/generic/stage2/stringparsing.h */ +// This file contains the common code every implementation uses +// It is intended to be included multiple times and compiled multiple times + +namespace simdjson { +namespace westmere { +namespace { +/// @private +namespace stringparsing { + +// begin copypasta +// These chars yield themselves: " \ / +// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab +// u not handled in this table as it's complex +static const uint8_t escape_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x4. + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5. + 0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6. 
+ 0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7. + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +// handle a unicode codepoint +// write appropriate values into dest +// src will advance 6 bytes or 12 bytes +// dest will advance a variable amount (return via pointer) +// return true if the unicode codepoint was valid +// We work in little-endian then swap at write time +simdjson_warn_unused +simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + // jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the + // conversion isn't valid; we defer the check for this to inside the + // multilingual plane check + uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2); + *src_ptr += 6; + + // If we found a high surrogate, we must + // check for low surrogate for characters + // outside the Basic + // Multilingual Plane. + if (code_point >= 0xd800 && code_point < 0xdc00) { + const uint8_t *src_data = *src_ptr; + /* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */ + if (((src_data[0] << 8) | src_data[1]) != ((static_cast ('\\') << 8) | static_cast ('u'))) { + return false; + } + uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2); + + // We have already checked that the high surrogate is valid and + // (code_point - 0xd800) < 1024. + // + // Check that code_point_2 is in the range 0xdc00..0xdfff + // and that code_point_2 was parsed from valid hex. + uint32_t low_bit = code_point_2 - 0xdc00; + if (low_bit >> 10) { + return false; + } + + code_point = + (((code_point - 0xd800) << 10) | low_bit) + 0x10000; + *src_ptr += 6; + } else if (code_point >= 0xdc00 && code_point <= 0xdfff) { + // If we encounter a low surrogate (not preceded by a high surrogate) + // then we have an error. + return false; + } + size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr); + *dst_ptr += offset; + return offset > 0; +} + +/** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + */ +simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst) { + while (1) { + // Copy the next n bytes, and find the backslash and quote in them. + auto bs_quote = backslash_and_quote::copy_and_find(src, dst); + // If the next thing is the end quote, copy and return + if (bs_quote.has_quote_first()) { + // we encountered quotes first. Move dst to point to quotes and exit + return dst + bs_quote.quote_index(); + } + if (bs_quote.has_backslash()) { + /* find out where the backspace is */ + auto bs_dist = bs_quote.backslash_index(); + uint8_t escape_char = src[bs_dist + 1]; + /* we encountered backslash first. 
Handle backslash */ + if (escape_char == 'u') { + /* move src/dst up to the start; they will be further adjusted + within the unicode codepoint handling code. */ + src += bs_dist; + dst += bs_dist; + if (!handle_unicode_codepoint(&src, &dst)) { + return nullptr; + } + } else { + /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and + * write bs_dist+1 characters to output + * note this may reach beyond the part of the buffer we've actually + * seen. I think this is ok */ + uint8_t escape_result = escape_map[escape_char]; + if (escape_result == 0u) { + return nullptr; /* bogus escape value is an error */ + } + dst[bs_dist] = escape_result; + src += bs_dist + 2; + dst += bs_dist + 1; + } + } else { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + src += backslash_and_quote::BYTES_PROCESSED; + dst += backslash_and_quote::BYTES_PROCESSED; + } + } + /* can't be reached */ + return nullptr; +} + +} // namespace stringparsing +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage2/stringparsing.h */ +/* begin file src/generic/stage2/tape_builder.h */ +/* begin file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace simdjson { +namespace westmere { +namespace { +namespace logger { + + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + static constexpr const int LOG_EVENT_LEN = 20; + static constexpr const int LOG_BUFFER_LEN = 30; + static constexpr const int LOG_SMALL_BUFFER_LEN = 10; + static constexpr const int LOG_INDEX_LEN = 5; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static simdjson_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static simdjson_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#"); + printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES); + } + } + + simdjson_unused static simdjson_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line from the stage 2 DOM parser + template + static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1; + auto next_index = structurals.next_structural; + auto current = current_index ? 
&structurals.buf[*current_index] : reinterpret_cast(" "); + auto next = &structurals.buf[*next_index]; + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i + simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept; + + /** + * Create an iterator capable of walking a JSON document. + * + * The document must have already passed through stage 1. + */ + simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index); + + /** + * Look at the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *peek() const noexcept; + /** + * Advance to the next token. + * + * Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)). + * + * They may include invalid JSON as well (such as `1.2.3` or `ture`). + */ + simdjson_inline const uint8_t *advance() noexcept; + /** + * Get the remaining length of the document, from the start of the current token. + */ + simdjson_inline size_t remaining_len() const noexcept; + /** + * Check if we are at the end of the document. + * + * If this is true, there are no more tokens. + */ + simdjson_inline bool at_eof() const noexcept; + /** + * Check if we are at the beginning of the document. + */ + simdjson_inline bool at_beginning() const noexcept; + simdjson_inline uint8_t last_structural() const noexcept; + + /** + * Log that a value has been found. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_value(const char *type) const noexcept; + /** + * Log the start of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_start_value(const char *type) const noexcept; + /** + * Log the end of a multipart value. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_end_value(const char *type) const noexcept; + /** + * Log an error. + * + * Set LOG_ENABLED=true in logger.h to see logging. + */ + simdjson_inline void log_error(const char *error) const noexcept; + + template + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept; + template + simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept; +}; + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept { + logger::log_start(); + + // + // Start the document + // + if (at_eof()) { return EMPTY; } + log_start_value("document"); + SIMDJSON_TRY( visitor.visit_document_start(*this) ); + + // + // Read first value + // + { + auto value = advance(); + + // Make sure the outer object or array is closed before continuing; otherwise, there are ways we + // could get into memory corruption. 
See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + switch (*value) { + case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break; + case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break; + } + } + + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break; + } + } + goto document_end; + +// +// Object parser states +// +object_begin: + log_start_value("object"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = false; + SIMDJSON_TRY( visitor.visit_object_start(*this) ); + + { + auto key = advance(); + if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.increment_count(*this) ); + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + +object_field: + if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; } + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +object_continue: + switch (*advance()) { + case ',': + SIMDJSON_TRY( visitor.increment_count(*this) ); + { + auto key = advance(); + if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; } + SIMDJSON_TRY( visitor.visit_key(*this, key) ); + } + goto object_field; + case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end; + default: log_error("No comma between object fields"); return TAPE_ERROR; + } + +scope_end: + depth--; + if (depth == 0) { goto document_end; } + if (dom_parser.is_array[depth]) { goto array_continue; } + goto object_continue; + +// +// Array parser states +// +array_begin: + log_start_value("array"); + depth++; + if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } + dom_parser.is_array[depth] = true; + SIMDJSON_TRY( visitor.visit_array_start(*this) ); + SIMDJSON_TRY( visitor.increment_count(*this) ); + +array_value: + { + auto value = advance(); + switch (*value) { + case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin; + case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin; + default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break; + } + } + +array_continue: + switch (*advance()) { + case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value; + case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end; + default: log_error("Missing comma between array values"); return TAPE_ERROR; + 
} + +document_end: + log_end_value("document"); + SIMDJSON_TRY( visitor.visit_document_end(*this) ); + + dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]); + + // If we didn't make it to the end, it's an error + if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) { + log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return TAPE_ERROR; + } + + return SUCCESS; + +} // walk_document() + +simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index) + : buf{_dom_parser.buf}, + next_structural{&_dom_parser.structural_indexes[start_structural_index]}, + dom_parser{_dom_parser} { +} + +simdjson_inline const uint8_t *json_iterator::peek() const noexcept { + return &buf[*(next_structural)]; +} +simdjson_inline const uint8_t *json_iterator::advance() noexcept { + return &buf[*(next_structural++)]; +} +simdjson_inline size_t json_iterator::remaining_len() const noexcept { + return dom_parser.len - *(next_structural-1); +} + +simdjson_inline bool json_iterator::at_eof() const noexcept { + return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes]; +} +simdjson_inline bool json_iterator::at_beginning() const noexcept { + return next_structural == dom_parser.structural_indexes.get(); +} +simdjson_inline uint8_t json_iterator::last_structural() const noexcept { + return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]]; +} + +simdjson_inline void json_iterator::log_value(const char *type) const noexcept { + logger::log_line(*this, "", type, ""); +} + +simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } +} + +simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); +} + +simdjson_inline void json_iterator::log_error(const char *error) const noexcept { + logger::log_line(*this, "", "ERROR", error); +} + +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_root_string(*this, value); + case 't': return visitor.visit_root_true_atom(*this, value); + case 'f': return visitor.visit_root_false_atom(*this, value); + case 'n': return visitor.visit_root_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_root_number(*this, value); + default: + log_error("Document starts with a non-value character"); + return TAPE_ERROR; + } +} +template +simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept { + switch (*value) { + case '"': return visitor.visit_string(*this, value); + case 't': return visitor.visit_true_atom(*this, value); + case 'f': return visitor.visit_false_atom(*this, value); + case 'n': return visitor.visit_null_atom(*this, value); + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return visitor.visit_number(*this, value); + default: + log_error("Non-value found when value was expected!"); + return TAPE_ERROR; + } +} + +} // 
namespace stage2 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage2/json_iterator.h */ +/* begin file src/generic/stage2/tape_writer.h */ +namespace simdjson { +namespace westmere { +namespace { +namespace stage2 { + +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + simdjson_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + simdjson_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + simdjson_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + simdjson_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + simdjson_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + simdjson_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +simdjson_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. 
*/ +simdjson_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +simdjson_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +simdjson_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} + +} // namespace stage2 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ + +namespace simdjson { +namespace westmere { +namespace { +namespace stage2 { + +struct tape_builder { + template + simdjson_warn_unused static simdjson_inline error_code parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept; + + /** Called when a non-empty document starts. */ + simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept; + /** Called when a non-empty document ends without error. */ + simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept; + + /** Called when a non-empty array starts. */ + simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept; + /** Called when a non-empty array ends. */ + simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept; + /** Called when an empty array is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept; + + /** Called when a non-empty object starts. */ + simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept; + /** + * Called when a key in a field is encountered. + * + * primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array + * will be called after this with the field value. + */ + simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept; + /** Called when a non-empty object ends. */ + simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept; + /** Called when an empty object is found. */ + simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept; + + /** + * Called when a string, number, boolean or null is found. + */ + simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept; + /** + * Called when a string, number, boolean or null is found at the top level of a document (i.e. + * when there is no array or object and the entire document is a single string, number, boolean or + * null. + * + * This is separate from primitive() because simdjson's normal primitive parsing routines assume + * there is at least one more token after the value, which is only true in an array or object. 
+ */ + simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept; + simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept; + + /** Called each time a new field or element in an array or object is found. */ + simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept; + + /** Next location to write to tape */ + tape_writer tape; +private: + /** Next write location in the string buf for stage 2 parsing */ + uint8_t *current_string_buf_loc; + + simdjson_inline tape_builder(dom::document &doc) noexcept; + + simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept; + simdjson_inline void start_container(json_iterator &iter) noexcept; + simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept; + simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept; + simdjson_inline void on_end_string(uint8_t *dst) noexcept; +}; // class tape_builder + +template +simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document( + dom_parser_implementation &dom_parser, + dom::document &doc) noexcept { + dom_parser.doc = &doc; + json_iterator iter(dom_parser, STREAMING ? 
dom_parser.next_structural_index : 0); + tape_builder builder(doc); + return iter.walk_document(builder); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_root_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept { + return iter.visit_primitive(*this, value); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept { + return empty_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept { + start_container(iter); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept { + return end_container(iter, internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept { + constexpr uint32_t start_tape_index = 0; + tape.append(start_tape_index, internal::tape_type::ROOT); + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter), internal::tape_type::ROOT); + return SUCCESS; +} +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept { + return visit_string(iter, key, true); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].count++; // we have a key value pair in the object at parser.dom_parser.depth - 1 + return SUCCESS; +} + +simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept : tape{doc.tape.get()}, current_string_buf_loc{doc.string_buf.get()} {} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept { + iter.log_value(key ? 
"key" : "string"); + uint8_t *dst = on_start_string(iter); + dst = stringparsing::parse_string(value+1, dst); + if (dst == nullptr) { + iter.log_error("Invalid escape in string"); + return STRING_ERROR; + } + on_end_string(dst); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept { + return visit_string(iter, value); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("number"); + return numberparsing::parse_number(value, tape); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept { + // + // We need to make a copy to make sure that the string is space terminated. + // This is not about padding the input, which should already padded up + // to len + SIMDJSON_PADDING. However, we have no control at this stage + // on how the padding was done. What if the input string was padded with nulls? + // It is quite common for an input string to have an extra null character (C string). + // We do not want to allow 9\0 (where \0 is the null character) inside a JSON + // document, but the string "9\0" by itself is fine. So we make a copy and + // pad the input with spaces when we know that there is just one input element. + // This copy is relatively expensive, but it will almost never be called in + // practice unless you are in the strange scenario where you have many JSON + // documents made of single atoms. + // + std::unique_ptrcopy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]); + if (copy.get() == nullptr) { return MEMALLOC; } + std::memcpy(copy.get(), value, iter.remaining_len()); + std::memset(copy.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING); + error_code error = visit_number(iter, copy.get()); + return error; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("true"); + if (!atomparsing::is_valid_true_atom(value, iter.remaining_len())) { return T_ATOM_ERROR; } + tape.append(0, internal::tape_type::TRUE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("false"); + if (!atomparsing::is_valid_false_atom(value, iter.remaining_len())) { return F_ATOM_ERROR; } + tape.append(0, internal::tape_type::FALSE_VALUE); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + 
+simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept { + iter.log_value("null"); + if (!atomparsing::is_valid_null_atom(value, iter.remaining_len())) { return N_ATOM_ERROR; } + tape.append(0, internal::tape_type::NULL_VALUE); + return SUCCESS; +} + +// private: + +simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept { + return uint32_t(tape.next_tape_loc - iter.dom_parser.doc->tape.get()); +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + auto start_index = next_tape_index(iter); + tape.append(start_index+2, start); + tape.append(start_index, end); + return SUCCESS; +} + +simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept { + iter.dom_parser.open_containers[iter.depth].tape_index = next_tape_index(iter); + iter.dom_parser.open_containers[iter.depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. +} + +simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept { + // Write the ending tape element, pointing at the start location + const uint32_t start_tape_index = iter.dom_parser.open_containers[iter.depth].tape_index; + tape.append(start_tape_index, end); + // Write the start tape element, pointing at the end location (and including count) + // count can overflow if it exceeds 24 bits... so we saturate + // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). + const uint32_t count = iter.dom_parser.open_containers[iter.depth].count; + const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; + tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], next_tape_index(iter) | (uint64_t(cntsat) << 32), start); + return SUCCESS; +} + +simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept { + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - iter.dom_parser.doc->string_buf.get(), internal::tape_type::STRING); + return current_string_buf_loc + sizeof(uint32_t); +} + +simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept { + uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); + // TODO check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); + // NULL termination is still handy if you expect all your strings to + // be NULL terminated? 
It comes at a small cost + *dst = 0; + current_string_buf_loc = dst + 1; +} + +} // namespace stage2 +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file src/generic/stage2/tape_builder.h */ + +// +// Implementation-specific overrides +// + +namespace simdjson { +namespace westmere { +namespace { +namespace stage1 { + +simdjson_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) { + if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; } + return find_escaped_branchless(backslash); +} + +} // namespace stage1 +} // unnamed namespace + +simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept { + return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept { + this->buf = _buf; + this->len = _len; + return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); +} + +simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return westmere::stage1::generic_validate_utf8(buf,len); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::tape_builder::parse_document(*this, _doc); +} + +simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst) const noexcept { + return westmere::stringparsing::parse_string(src, dst); +} + +simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + auto error = stage1(_buf, _len, stage1_mode::regular); + if (error) { return error; } + return stage2(_doc); +} + +} // namespace westmere +} // namespace simdjson + +/* begin file include/simdjson/westmere/end.h */ +SIMDJSON_UNTARGET_WESTMERE +/* end file include/simdjson/westmere/end.h */ +/* end file src/westmere/dom_parser_implementation.cpp */ +#endif + +SIMDJSON_POP_DISABLE_WARNINGS +/* end file src/simdjson.cpp */ diff --git a/kram-profile/CBA/simdjson.h b/kram-profile/CBA/simdjson.h new file mode 100644 index 00000000..4ad510d3 --- /dev/null +++ b/kram-profile/CBA/simdjson.h @@ -0,0 +1,31622 @@ +/* auto-generated on 2022-10-16 16:59:15 +0000. Do not edit! */ +/* begin file include/simdjson.h */ +#ifndef SIMDJSON_H +#define SIMDJSON_H + +/** + * @mainpage + * + * Check the [README.md](https://github.com/simdjson/simdjson/blob/master/README.md#simdjson--parsing-gigabytes-of-json-per-second). + * + * Sample code. See https://github.com/simdjson/simdjson/blob/master/doc/basics.md for more examples. + + #include "simdjson.h" + + int main(void) { + // load from `twitter.json` file: + simdjson::dom::parser parser; + simdjson::dom::element tweets = parser.load("twitter.json"); + std::cout << tweets["search_metadata"]["count"] << " results." 
<< std::endl; + + // Parse and iterate through an array of objects + auto abstract_json = R"( [ + { "12345" : {"a":12.34, "b":56.78, "c": 9998877} }, + { "12545" : {"a":11.44, "b":12.78, "c": 11111111} } + ] )"_padded; + + for (simdjson::dom::object obj : parser.parse(abstract_json)) { + for(const auto key_value : obj) { + cout << "key: " << key_value.key << " : "; + simdjson::dom::object innerobj = key_value.value; + cout << "a: " << double(innerobj["a"]) << ", "; + cout << "b: " << double(innerobj["b"]) << ", "; + cout << "c: " << int64_t(innerobj["c"]) << endl; + } + } + } + */ + +/* begin file include/simdjson/simdjson_version.h */ +// /include/simdjson/simdjson_version.h automatically generated by release.py, +// do not change by hand +#ifndef SIMDJSON_SIMDJSON_VERSION_H +#define SIMDJSON_SIMDJSON_VERSION_H + +/** The version of simdjson being used (major.minor.revision) */ +#define SIMDJSON_VERSION 3.0.0 + +namespace simdjson { +enum { + /** + * The major version (MAJOR.minor.revision) of simdjson being used. + */ + SIMDJSON_VERSION_MAJOR = 3, + /** + * The minor version (major.MINOR.revision) of simdjson being used. + */ + SIMDJSON_VERSION_MINOR = 0, + /** + * The revision (major.minor.REVISION) of simdjson being used. + */ + SIMDJSON_VERSION_REVISION = 0 +}; +} // namespace simdjson + +#endif // SIMDJSON_SIMDJSON_VERSION_H +/* end file include/simdjson/simdjson_version.h */ +/* begin file include/simdjson/dom.h */ +#ifndef SIMDJSON_DOM_H +#define SIMDJSON_DOM_H + +/* begin file include/simdjson/base.h */ +#ifndef SIMDJSON_BASE_H +#define SIMDJSON_BASE_H + +/* begin file include/simdjson/compiler_check.h */ +#ifndef SIMDJSON_COMPILER_CHECK_H +#define SIMDJSON_COMPILER_CHECK_H + +#ifndef __cplusplus +#error simdjson requires a C++ compiler +#endif + +#ifndef SIMDJSON_CPLUSPLUS +#if defined(_MSVC_LANG) && !defined(__clang__) +#define SIMDJSON_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG) +#else +#define SIMDJSON_CPLUSPLUS __cplusplus +#endif +#endif + +// C++ 17 +#if !defined(SIMDJSON_CPLUSPLUS17) && (SIMDJSON_CPLUSPLUS >= 201703L) +#define SIMDJSON_CPLUSPLUS17 1 +#endif + +// C++ 14 +#if !defined(SIMDJSON_CPLUSPLUS14) && (SIMDJSON_CPLUSPLUS >= 201402L) +#define SIMDJSON_CPLUSPLUS14 1 +#endif + +// C++ 11 +#if !defined(SIMDJSON_CPLUSPLUS11) && (SIMDJSON_CPLUSPLUS >= 201103L) +#define SIMDJSON_CPLUSPLUS11 1 +#endif + +#ifndef SIMDJSON_CPLUSPLUS11 +#error simdjson requires a compiler compliant with the C++11 standard +#endif + +#endif // SIMDJSON_COMPILER_CHECK_H +/* end file include/simdjson/compiler_check.h */ +/* begin file include/simdjson/common_defs.h */ +#ifndef SIMDJSON_COMMON_DEFS_H +#define SIMDJSON_COMMON_DEFS_H + +#include +/* begin file include/simdjson/portability.h */ +#ifndef SIMDJSON_PORTABILITY_H +#define SIMDJSON_PORTABILITY_H + +#include +#include +#include +#include +#include +#ifndef _WIN32 +// strcasecmp, strncasecmp +#include +#endif + +#ifdef _MSC_VER +#define SIMDJSON_VISUAL_STUDIO 1 +/** + * We want to differentiate carefully between + * clang under visual studio and regular visual + * studio. + * + * Under clang for Windows, we enable: + * * target pragmas so that part and only part of the + * code gets compiled for advanced instructions. 
+ * + */ +#ifdef __clang__ +// clang under visual studio +#define SIMDJSON_CLANG_VISUAL_STUDIO 1 +#else +// just regular visual studio (best guess) +#define SIMDJSON_REGULAR_VISUAL_STUDIO 1 +#endif // __clang__ +#endif // _MSC_VER + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +// https://en.wikipedia.org/wiki/C_alternative_tokens +// This header should have no effect, except maybe +// under Visual Studio. +#include +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) +#define SIMDJSON_IS_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define SIMDJSON_IS_ARM64 1 +#elif defined(__PPC64__) || defined(_M_PPC64) +#define SIMDJSON_IS_PPC64 1 +#else +#define SIMDJSON_IS_32BITS 1 + +// We do not support 32-bit platforms, but it can be +// handy to identify them. +#if defined(_M_IX86) || defined(__i386__) +#define SIMDJSON_IS_X86_32BITS 1 +#elif defined(__arm__) || defined(_M_ARM) +#define SIMDJSON_IS_ARM_32BITS 1 +#elif defined(__PPC__) || defined(_M_PPC) +#define SIMDJSON_IS_PPC_32BITS 1 +#endif + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +#ifdef SIMDJSON_IS_32BITS +#ifndef SIMDJSON_NO_PORTABILITY_WARNING +#pragma message("The simdjson library is designed \ +for 64-bit processors and it seems that you are not \ +compiling for a known 64-bit platform. All fast kernels \ +will be disabled and performance may be poor. Please \ +use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.") +#endif // SIMDJSON_NO_PORTABILITY_WARNING +#endif // SIMDJSON_IS_32BITS + +// this is almost standard? +#undef SIMDJSON_STRINGIFY_IMPLEMENTATION_ +#undef SIMDJSON_STRINGIFY +#define SIMDJSON_STRINGIFY_IMPLEMENTATION_(a) #a +#define SIMDJSON_STRINGIFY(a) SIMDJSON_STRINGIFY_IMPLEMENTATION_(a) + +// Our fast kernels require 64-bit systems. +// +// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. +// Furthermore, the number of SIMD registers is reduced. +// +// On 32-bit ARM, we would have smaller registers. +// +// The simdjson users should still have the fallback kernel. It is +// slower, but it should run everywhere. + +// +// Enable valid runtime implementations, and select SIMDJSON_BUILTIN_IMPLEMENTATION +// + +// We are going to use runtime dispatch. +#ifdef SIMDJSON_IS_X86_64 +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so SIMDJSON_TARGET_REGION and SIMDJSON_UNTARGET_REGION must be *outside* of a +// namespace. +#define SIMDJSON_TARGET_REGION(T) \ + _Pragma(SIMDJSON_STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define SIMDJSON_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define SIMDJSON_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(SIMDJSON_STRINGIFY(GCC target(T))) +#define SIMDJSON_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + +#endif // x86 + +// Default target region macros don't do anything. +#ifndef SIMDJSON_TARGET_REGION +#define SIMDJSON_TARGET_REGION(T) +#define SIMDJSON_UNTARGET_REGION +#endif + +// Is threading enabled? +#if defined(_REENTRANT) || defined(_MT) +#ifndef SIMDJSON_THREADS_ENABLED +#define SIMDJSON_THREADS_ENABLED +#endif +#endif + +// workaround for large stack sizes under -O0. +// https://github.com/simdjson/simdjson/issues/691 +#ifdef __APPLE__ +#ifndef __OPTIMIZE__ +// Apple systems have small stack sizes in secondary threads. +// Lack of compiler optimization may generate high stack usage. 
+// Users may want to disable threads for safety, but only when +// in debug mode which we detect by the fact that the __OPTIMIZE__ +// macro is not defined. +#undef SIMDJSON_THREADS_ENABLED +#endif +#endif + + +#if defined(__clang__) +#define SIMDJSON_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) +#elif defined(__GNUC__) +#define SIMDJSON_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) +#else +#define SIMDJSON_NO_SANITIZE_UNDEFINED +#endif + +#ifdef SIMDJSON_VISUAL_STUDIO +// This is one case where we do not distinguish between +// regular visual studio and clang under visual studio. +// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has) +#define simdjson_strcasecmp _stricmp +#define simdjson_strncasecmp _strnicmp +#else +// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8). +// So they are only useful for ASCII in our context. +// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings +#define simdjson_strcasecmp strcasecmp +#define simdjson_strncasecmp strncasecmp +#endif + +#ifdef NDEBUG + +#ifdef SIMDJSON_VISUAL_STUDIO +#define SIMDJSON_UNREACHABLE() __assume(0) +#define SIMDJSON_ASSUME(COND) __assume(COND) +#else +#define SIMDJSON_UNREACHABLE() __builtin_unreachable(); +#define SIMDJSON_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0) +#endif + +#else // NDEBUG + +#define SIMDJSON_UNREACHABLE() assert(0); +#define SIMDJSON_ASSUME(COND) assert(COND) + +#endif + +#endif // SIMDJSON_PORTABILITY_H +/* end file include/simdjson/portability.h */ + +namespace simdjson { + +namespace internal { +/** + * @private + * Our own implementation of the C++17 to_chars function. + * Defined in src/to_chars + */ +char *to_chars(char *first, const char *last, double value); +/** + * @private + * A number parsing routine. + * Defined in src/from_chars + */ +double from_chars(const char *first) noexcept; +double from_chars(const char *first, const char* end) noexcept; + +} + +#ifndef SIMDJSON_EXCEPTIONS +#if __cpp_exceptions +#define SIMDJSON_EXCEPTIONS 1 +#else +#define SIMDJSON_EXCEPTIONS 0 +#endif +#endif + +/** The maximum document size supported by simdjson. */ +constexpr size_t SIMDJSON_MAXSIZE_BYTES = 0xFFFFFFFF; + +/** + * The amount of padding needed in a buffer to parse JSON. + * + * The input buf should be readable up to buf + SIMDJSON_PADDING + * this is a stopgap; there should be a better description of the + * main loop and its behavior that abstracts over this + * See https://github.com/simdjson/simdjson/issues/174 + */ +constexpr size_t SIMDJSON_PADDING = 64; + +/** + * By default, simdjson supports this many nested objects and arrays. + * + * This is the default for parser::max_depth(). + */ +constexpr size_t DEFAULT_MAX_DEPTH = 1024; + +} // namespace simdjson + +#if defined(__GNUC__) + // Marks a block with a name so that MCA analysis can see it. 
+ #define SIMDJSON_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); + #define SIMDJSON_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); + #define SIMDJSON_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); +#else + #define SIMDJSON_BEGIN_DEBUG_BLOCK(name) + #define SIMDJSON_END_DEBUG_BLOCK(name) + #define SIMDJSON_DEBUG_BLOCK(name, block) +#endif + +// Align to N-byte boundary +#define SIMDJSON_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define SIMDJSON_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +#define SIMDJSON_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) + +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) + + #define simdjson_really_inline __forceinline + #define simdjson_never_inline __declspec(noinline) + + #define simdjson_unused + #define simdjson_warn_unused + + #ifndef simdjson_likely + #define simdjson_likely(x) x + #endif + #ifndef simdjson_unlikely + #define simdjson_unlikely(x) x + #endif + + #define SIMDJSON_PUSH_DISABLE_WARNINGS __pragma(warning( push )) + #define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) + #define SIMDJSON_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) + // Get rid of Intellisense-only warnings (Code Analysis) + // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). + #ifdef __has_include + #if __has_include() + #include + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS SIMDJSON_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS) + #endif + #endif + + #ifndef SIMDJSON_DISABLE_UNDESIRED_WARNINGS + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS + #endif + + #define SIMDJSON_DISABLE_DEPRECATED_WARNING SIMDJSON_DISABLE_VS_WARNING(4996) + #define SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING + #define SIMDJSON_POP_DISABLE_WARNINGS __pragma(warning( pop )) + +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + + #define simdjson_really_inline inline __attribute__((always_inline)) + #define simdjson_never_inline inline __attribute__((noinline)) + + #define simdjson_unused __attribute__((unused)) + #define simdjson_warn_unused __attribute__((warn_unused_result)) + + #ifndef simdjson_likely + #define simdjson_likely(x) __builtin_expect(!!(x), 1) + #endif + #ifndef simdjson_unlikely + #define simdjson_unlikely(x) __builtin_expect(!!(x), 0) + #endif + + #define SIMDJSON_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push") + // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary + // We do it separately for clang since it has different warnings. + #ifdef __clang__ + // clang is missing -Wmaybe-uninitialized. 
+ #define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \ + SIMDJSON_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wall) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wextra) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-parameter) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-variable) + #else // __clang__ + #define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \ + SIMDJSON_DISABLE_GCC_WARNING(-Weffc++) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wall) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wconversion) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wextra) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wattributes) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wreturn-type) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wshadow) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-parameter) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wunused-variable) \ + SIMDJSON_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) + #endif // __clang__ + + #define SIMDJSON_PRAGMA(P) _Pragma(#P) + #define SIMDJSON_DISABLE_GCC_WARNING(WARNING) SIMDJSON_PRAGMA(GCC diagnostic ignored #WARNING) + #if defined(SIMDJSON_CLANG_VISUAL_STUDIO) + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS SIMDJSON_DISABLE_GCC_WARNING(-Wmicrosoft-include) + #else + #define SIMDJSON_DISABLE_UNDESIRED_WARNINGS + #endif + #define SIMDJSON_DISABLE_DEPRECATED_WARNING SIMDJSON_DISABLE_GCC_WARNING(-Wdeprecated-declarations) + #define SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING SIMDJSON_DISABLE_GCC_WARNING(-Wstrict-overflow) + #define SIMDJSON_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop") + + + +#endif // MSC_VER + +#if defined(simdjson_inline) + // Prefer the user's definition of simdjson_inline; don't define it ourselves. +#elif defined(__GNUC__) && !defined(__OPTIMIZE__) + // If optimizations are disabled, forcing inlining can lead to significant + // code bloat and high compile times. Don't use simdjson_really_inline for + // unoptimized builds. + #define simdjson_inline inline +#else + // Force inlining for most simdjson functions. + #define simdjson_inline simdjson_really_inline +#endif + +#if defined(SIMDJSON_VISUAL_STUDIO) + /** + * Windows users need to do some extra work when building + * or using a dynamic library (DLL). When building, we need + * to set SIMDJSON_DLLIMPORTEXPORT to __declspec(dllexport). + * When *using* the DLL, the user needs to set + * SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport). + * + * Static libraries not need require such work. + * + * It does not matter here whether you are using + * the regular visual studio or clang under visual + * studio, you still need to handle these issues. + * + * Non-Windows systems do not have this complexity. + */ + #if SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY + // We set SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY when we build a DLL under Windows. + // It should never happen that both SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY and + // SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY are set. + #define SIMDJSON_DLLIMPORTEXPORT __declspec(dllexport) + #elif SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY + // Windows user who call a dynamic library should set SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY to 1. 
+ #define SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport) + #else + // We assume by default static linkage + #define SIMDJSON_DLLIMPORTEXPORT + #endif + +/** + * Workaround for the vcpkg package manager. Only vcpkg should + * ever touch the next line. The SIMDJSON_USING_LIBRARY macro is otherwise unused. + */ +#if SIMDJSON_USING_LIBRARY +#define SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport) +#endif +/** + * End of workaround for the vcpkg package manager. + */ +#else + #define SIMDJSON_DLLIMPORTEXPORT +#endif + +// C++17 requires string_view. +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_HAS_STRING_VIEW +#include // by the standard, this has to be safe. +#endif + +// This macro (__cpp_lib_string_view) has to be defined +// for C++17 and better, but if it is otherwise defined, +// we are going to assume that string_view is available +// even if we do not have C++17 support. +#ifdef __cpp_lib_string_view +#define SIMDJSON_HAS_STRING_VIEW +#endif + +// Some systems have string_view even if we do not have C++17 support, +// and even if __cpp_lib_string_view is undefined, it is the case +// with Apple clang version 11. +// We must handle it. *This is important.* +#ifndef SIMDJSON_HAS_STRING_VIEW +#if defined __has_include +// do not combine the next #if with the previous one (unsafe) +#if __has_include () +// now it is safe to trigger the include +#include // though the file is there, it does not follow that we got the implementation +#if defined(_LIBCPP_STRING_VIEW) +// Ah! So we under libc++ which under its Library Fundamentals Technical Specification, which preceded C++17, +// included string_view. +// This means that we have string_view *even though* we may not have C++17. +#define SIMDJSON_HAS_STRING_VIEW +#endif // _LIBCPP_STRING_VIEW +#endif // __has_include () +#endif // defined __has_include +#endif // def SIMDJSON_HAS_STRING_VIEW +// end of complicated but important routine to try to detect string_view. + +// +// Backfill std::string_view using nonstd::string_view on systems where +// we expect that string_view is missing. Important: if we get this wrong, +// we will end up with two string_view definitions and potential trouble. +// That is why we work so hard above to avoid it. +// +#ifndef SIMDJSON_HAS_STRING_VIEW +SIMDJSON_PUSH_DISABLE_ALL_WARNINGS +/* begin file include/simdjson/nonstd/string_view.hpp */ +// Copyright 2017-2020 by Martin Moene +// +// string-view lite, a C++17-like string_view for C++98 and later. +// For more information see https://github.com/martinmoene/string-view-lite +// +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#ifndef NONSTD_SV_LITE_H_INCLUDED +#define NONSTD_SV_LITE_H_INCLUDED + +#define string_view_lite_MAJOR 1 +#define string_view_lite_MINOR 6 +#define string_view_lite_PATCH 0 + +#define string_view_lite_VERSION nssv_STRINGIFY(string_view_lite_MAJOR) "." nssv_STRINGIFY(string_view_lite_MINOR) "." 
nssv_STRINGIFY(string_view_lite_PATCH) + +#define nssv_STRINGIFY( x ) nssv_STRINGIFY_( x ) +#define nssv_STRINGIFY_( x ) #x + +// string-view lite configuration: + +#define nssv_STRING_VIEW_DEFAULT 0 +#define nssv_STRING_VIEW_NONSTD 1 +#define nssv_STRING_VIEW_STD 2 + +// tweak header support: + +#ifdef __has_include +# if __has_include() +# include +# endif +#define nssv_HAVE_TWEAK_HEADER 1 +#else +#define nssv_HAVE_TWEAK_HEADER 0 +//# pragma message("string_view.hpp: Note: Tweak header not supported.") +#endif + +// string_view selection and configuration: + +#if !defined( nssv_CONFIG_SELECT_STRING_VIEW ) +# define nssv_CONFIG_SELECT_STRING_VIEW ( nssv_HAVE_STD_STRING_VIEW ? nssv_STRING_VIEW_STD : nssv_STRING_VIEW_NONSTD ) +#endif + +#ifndef nssv_CONFIG_STD_SV_OPERATOR +# define nssv_CONFIG_STD_SV_OPERATOR 0 +#endif + +#ifndef nssv_CONFIG_USR_SV_OPERATOR +# define nssv_CONFIG_USR_SV_OPERATOR 1 +#endif + +#ifdef nssv_CONFIG_CONVERSION_STD_STRING +# define nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS nssv_CONFIG_CONVERSION_STD_STRING +# define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS nssv_CONFIG_CONVERSION_STD_STRING +#endif + +#ifndef nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS +# define nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS 1 +#endif + +#ifndef nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS +# define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS 1 +#endif + +#ifndef nssv_CONFIG_NO_STREAM_INSERTION +# define nssv_CONFIG_NO_STREAM_INSERTION 0 +#endif + +// Control presence of exception handling (try and auto discover): + +#ifndef nssv_CONFIG_NO_EXCEPTIONS +# if _MSC_VER +# include // for _HAS_EXCEPTIONS +# endif +# if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || (_HAS_EXCEPTIONS) +# define nssv_CONFIG_NO_EXCEPTIONS 0 +# else +# define nssv_CONFIG_NO_EXCEPTIONS 1 +# endif +#endif + +// C++ language version detection (C++20 is speculative): +// Note: VC14.0/1900 (VS2015) lacks too much from C++14. + +#ifndef nssv_CPLUSPLUS +# if defined(_MSVC_LANG ) && !defined(__clang__) +# define nssv_CPLUSPLUS (_MSC_VER == 1900 ? 
201103L : _MSVC_LANG ) +# else +# define nssv_CPLUSPLUS __cplusplus +# endif +#endif + +#define nssv_CPP98_OR_GREATER ( nssv_CPLUSPLUS >= 199711L ) +#define nssv_CPP11_OR_GREATER ( nssv_CPLUSPLUS >= 201103L ) +#define nssv_CPP11_OR_GREATER_ ( nssv_CPLUSPLUS >= 201103L ) +#define nssv_CPP14_OR_GREATER ( nssv_CPLUSPLUS >= 201402L ) +#define nssv_CPP17_OR_GREATER ( nssv_CPLUSPLUS >= 201703L ) +#define nssv_CPP20_OR_GREATER ( nssv_CPLUSPLUS >= 202000L ) + +// use C++17 std::string_view if available and requested: + +#if nssv_CPP17_OR_GREATER && defined(__has_include ) +# if __has_include( ) +# define nssv_HAVE_STD_STRING_VIEW 1 +# else +# define nssv_HAVE_STD_STRING_VIEW 0 +# endif +#else +# define nssv_HAVE_STD_STRING_VIEW 0 +#endif + +#define nssv_USES_STD_STRING_VIEW ( (nssv_CONFIG_SELECT_STRING_VIEW == nssv_STRING_VIEW_STD) || ((nssv_CONFIG_SELECT_STRING_VIEW == nssv_STRING_VIEW_DEFAULT) && nssv_HAVE_STD_STRING_VIEW) ) + +#define nssv_HAVE_STARTS_WITH ( nssv_CPP20_OR_GREATER || !nssv_USES_STD_STRING_VIEW ) +#define nssv_HAVE_ENDS_WITH nssv_HAVE_STARTS_WITH + +// +// Use C++17 std::string_view: +// + +#if nssv_USES_STD_STRING_VIEW + +#include + +// Extensions for std::string: + +#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS + +namespace nonstd { + +template< class CharT, class Traits, class Allocator = std::allocator > +std::basic_string +to_string( std::basic_string_view v, Allocator const & a = Allocator() ) +{ + return std::basic_string( v.begin(), v.end(), a ); +} + +template< class CharT, class Traits, class Allocator > +std::basic_string_view +to_string_view( std::basic_string const & s ) +{ + return std::basic_string_view( s.data(), s.size() ); +} + +// Literal operators sv and _sv: + +#if nssv_CONFIG_STD_SV_OPERATOR + +using namespace std::literals::string_view_literals; + +#endif + +#if nssv_CONFIG_USR_SV_OPERATOR + +inline namespace literals { +inline namespace string_view_literals { + + +constexpr std::string_view operator "" _sv( const char* str, size_t len ) noexcept // (1) +{ + return std::string_view{ str, len }; +} + +constexpr std::u16string_view operator "" _sv( const char16_t* str, size_t len ) noexcept // (2) +{ + return std::u16string_view{ str, len }; +} + +constexpr std::u32string_view operator "" _sv( const char32_t* str, size_t len ) noexcept // (3) +{ + return std::u32string_view{ str, len }; +} + +constexpr std::wstring_view operator "" _sv( const wchar_t* str, size_t len ) noexcept // (4) +{ + return std::wstring_view{ str, len }; +} + +}} // namespace literals::string_view_literals + +#endif // nssv_CONFIG_USR_SV_OPERATOR + +} // namespace nonstd + +#endif // nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS + +namespace nonstd { + +using std::string_view; +using std::wstring_view; +using std::u16string_view; +using std::u32string_view; +using std::basic_string_view; + +// literal "sv" and "_sv", see above + +using std::operator==; +using std::operator!=; +using std::operator<; +using std::operator<=; +using std::operator>; +using std::operator>=; + +using std::operator<<; + +} // namespace nonstd + +#else // nssv_HAVE_STD_STRING_VIEW + +// +// Before C++17: use string_view lite: +// + +// Compiler versions: +// +// MSVC++ 6.0 _MSC_VER == 1200 nssv_COMPILER_MSVC_VERSION == 60 (Visual Studio 6.0) +// MSVC++ 7.0 _MSC_VER == 1300 nssv_COMPILER_MSVC_VERSION == 70 (Visual Studio .NET 2002) +// MSVC++ 7.1 _MSC_VER == 1310 nssv_COMPILER_MSVC_VERSION == 71 (Visual Studio .NET 2003) +// MSVC++ 8.0 _MSC_VER == 1400 nssv_COMPILER_MSVC_VERSION == 80 (Visual Studio 
2005) +// MSVC++ 9.0 _MSC_VER == 1500 nssv_COMPILER_MSVC_VERSION == 90 (Visual Studio 2008) +// MSVC++ 10.0 _MSC_VER == 1600 nssv_COMPILER_MSVC_VERSION == 100 (Visual Studio 2010) +// MSVC++ 11.0 _MSC_VER == 1700 nssv_COMPILER_MSVC_VERSION == 110 (Visual Studio 2012) +// MSVC++ 12.0 _MSC_VER == 1800 nssv_COMPILER_MSVC_VERSION == 120 (Visual Studio 2013) +// MSVC++ 14.0 _MSC_VER == 1900 nssv_COMPILER_MSVC_VERSION == 140 (Visual Studio 2015) +// MSVC++ 14.1 _MSC_VER >= 1910 nssv_COMPILER_MSVC_VERSION == 141 (Visual Studio 2017) +// MSVC++ 14.2 _MSC_VER >= 1920 nssv_COMPILER_MSVC_VERSION == 142 (Visual Studio 2019) + +#if defined(_MSC_VER ) && !defined(__clang__) +# define nssv_COMPILER_MSVC_VER (_MSC_VER ) +# define nssv_COMPILER_MSVC_VERSION (_MSC_VER / 10 - 10 * ( 5 + (_MSC_VER < 1900 ) ) ) +#else +# define nssv_COMPILER_MSVC_VER 0 +# define nssv_COMPILER_MSVC_VERSION 0 +#endif + +#define nssv_COMPILER_VERSION( major, minor, patch ) ( 10 * ( 10 * (major) + (minor) ) + (patch) ) + +#if defined( __apple_build_version__ ) +# define nssv_COMPILER_APPLECLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__) +# define nssv_COMPILER_CLANG_VERSION 0 +#elif defined( __clang__ ) +# define nssv_COMPILER_APPLECLANG_VERSION 0 +# define nssv_COMPILER_CLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__) +#else +# define nssv_COMPILER_APPLECLANG_VERSION 0 +# define nssv_COMPILER_CLANG_VERSION 0 +#endif + +#if defined(__GNUC__) && !defined(__clang__) +# define nssv_COMPILER_GNUC_VERSION nssv_COMPILER_VERSION(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#else +# define nssv_COMPILER_GNUC_VERSION 0 +#endif + +// half-open range [lo..hi): +#define nssv_BETWEEN( v, lo, hi ) ( (lo) <= (v) && (v) < (hi) ) + +// Presence of language and library features: + +#ifdef _HAS_CPP0X +# define nssv_HAS_CPP0X _HAS_CPP0X +#else +# define nssv_HAS_CPP0X 0 +#endif + +// Unless defined otherwise below, consider VC14 as C++11 for variant-lite: + +#if nssv_COMPILER_MSVC_VER >= 1900 +# undef nssv_CPP11_OR_GREATER +# define nssv_CPP11_OR_GREATER 1 +#endif + +#define nssv_CPP11_90 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1500) +#define nssv_CPP11_100 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1600) +#define nssv_CPP11_110 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1700) +#define nssv_CPP11_120 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1800) +#define nssv_CPP11_140 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1900) +#define nssv_CPP11_141 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1910) + +#define nssv_CPP14_000 (nssv_CPP14_OR_GREATER) +#define nssv_CPP17_000 (nssv_CPP17_OR_GREATER) + +// Presence of C++11 language features: + +#define nssv_HAVE_CONSTEXPR_11 nssv_CPP11_140 +#define nssv_HAVE_EXPLICIT_CONVERSION nssv_CPP11_140 +#define nssv_HAVE_INLINE_NAMESPACE nssv_CPP11_140 +#define nssv_HAVE_NOEXCEPT nssv_CPP11_140 +#define nssv_HAVE_NULLPTR nssv_CPP11_100 +#define nssv_HAVE_REF_QUALIFIER nssv_CPP11_140 +#define nssv_HAVE_UNICODE_LITERALS nssv_CPP11_140 +#define nssv_HAVE_USER_DEFINED_LITERALS nssv_CPP11_140 +#define nssv_HAVE_WCHAR16_T nssv_CPP11_100 +#define nssv_HAVE_WCHAR32_T nssv_CPP11_100 + +#if ! 
( ( nssv_CPP11_OR_GREATER && nssv_COMPILER_CLANG_VERSION ) || nssv_BETWEEN( nssv_COMPILER_CLANG_VERSION, 300, 400 ) ) +# define nssv_HAVE_STD_DEFINED_LITERALS nssv_CPP11_140 +#else +# define nssv_HAVE_STD_DEFINED_LITERALS 0 +#endif + +// Presence of C++14 language features: + +#define nssv_HAVE_CONSTEXPR_14 nssv_CPP14_000 + +// Presence of C++17 language features: + +#define nssv_HAVE_NODISCARD nssv_CPP17_000 + +// Presence of C++ library features: + +#define nssv_HAVE_STD_HASH nssv_CPP11_120 + +// Presence of compiler intrinsics: + +// Providing char-type specializations for compare() and length() that +// use compiler intrinsics can improve compile- and run-time performance. +// +// The challenge is in using the right combinations of builtin availability +// and its constexpr-ness. +// +// | compiler | __builtin_memcmp (constexpr) | memcmp (constexpr) | +// |----------|------------------------------|---------------------| +// | clang | 4.0 (>= 4.0 ) | any (? ) | +// | clang-a | 9.0 (>= 9.0 ) | any (? ) | +// | gcc | any (constexpr) | any (? ) | +// | msvc | >= 14.2 C++17 (>= 14.2 ) | any (? ) | + +#define nssv_HAVE_BUILTIN_VER ( (nssv_CPP17_000 && nssv_COMPILER_MSVC_VERSION >= 142) || nssv_COMPILER_GNUC_VERSION > 0 || nssv_COMPILER_CLANG_VERSION >= 400 || nssv_COMPILER_APPLECLANG_VERSION >= 900 ) +#define nssv_HAVE_BUILTIN_CE ( nssv_HAVE_BUILTIN_VER ) + +#define nssv_HAVE_BUILTIN_MEMCMP ( (nssv_HAVE_CONSTEXPR_14 && nssv_HAVE_BUILTIN_CE) || !nssv_HAVE_CONSTEXPR_14 ) +#define nssv_HAVE_BUILTIN_STRLEN ( (nssv_HAVE_CONSTEXPR_11 && nssv_HAVE_BUILTIN_CE) || !nssv_HAVE_CONSTEXPR_11 ) + +#ifdef __has_builtin +# define nssv_HAVE_BUILTIN( x ) __has_builtin( x ) +#else +# define nssv_HAVE_BUILTIN( x ) 0 +#endif + +#if nssv_HAVE_BUILTIN(__builtin_memcmp) || nssv_HAVE_BUILTIN_VER +# define nssv_BUILTIN_MEMCMP __builtin_memcmp +#else +# define nssv_BUILTIN_MEMCMP memcmp +#endif + +#if nssv_HAVE_BUILTIN(__builtin_strlen) || nssv_HAVE_BUILTIN_VER +# define nssv_BUILTIN_STRLEN __builtin_strlen +#else +# define nssv_BUILTIN_STRLEN strlen +#endif + +// C++ feature usage: + +#if nssv_HAVE_CONSTEXPR_11 +# define nssv_constexpr constexpr +#else +# define nssv_constexpr /*constexpr*/ +#endif + +#if nssv_HAVE_CONSTEXPR_14 +# define nssv_constexpr14 constexpr +#else +# define nssv_constexpr14 /*constexpr*/ +#endif + +#if nssv_HAVE_EXPLICIT_CONVERSION +# define nssv_explicit explicit +#else +# define nssv_explicit /*explicit*/ +#endif + +#if nssv_HAVE_INLINE_NAMESPACE +# define nssv_inline_ns inline +#else +# define nssv_inline_ns /*inline*/ +#endif + +#if nssv_HAVE_NOEXCEPT +# define nssv_noexcept noexcept +#else +# define nssv_noexcept /*noexcept*/ +#endif + +//#if nssv_HAVE_REF_QUALIFIER +//# define nssv_ref_qual & +//# define nssv_refref_qual && +//#else +//# define nssv_ref_qual /*&*/ +//# define nssv_refref_qual /*&&*/ +//#endif + +#if nssv_HAVE_NULLPTR +# define nssv_nullptr nullptr +#else +# define nssv_nullptr NULL +#endif + +#if nssv_HAVE_NODISCARD +# define nssv_nodiscard [[nodiscard]] +#else +# define nssv_nodiscard /*[[nodiscard]]*/ +#endif + +// Additional includes: + +#include +#include +#include +#include +#include // std::char_traits<> + +#if ! nssv_CONFIG_NO_STREAM_INSERTION +# include +#endif + +#if ! 
nssv_CONFIG_NO_EXCEPTIONS +# include +#endif + +#if nssv_CPP11_OR_GREATER +# include +#endif + +// Clang, GNUC, MSVC warning suppression macros: + +#if defined(__clang__) +# pragma clang diagnostic ignored "-Wreserved-user-defined-literal" +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wuser-defined-literals" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wliteral-suffix" +#endif // __clang__ + +#if nssv_COMPILER_MSVC_VERSION >= 140 +# define nssv_SUPPRESS_MSGSL_WARNING(expr) [[gsl::suppress(expr)]] +# define nssv_SUPPRESS_MSVC_WARNING(code, descr) __pragma(warning(suppress: code) ) +# define nssv_DISABLE_MSVC_WARNINGS(codes) __pragma(warning(push)) __pragma(warning(disable: codes)) +#else +# define nssv_SUPPRESS_MSGSL_WARNING(expr) +# define nssv_SUPPRESS_MSVC_WARNING(code, descr) +# define nssv_DISABLE_MSVC_WARNINGS(codes) +#endif + +#if defined(__clang__) +# define nssv_RESTORE_WARNINGS() _Pragma("clang diagnostic pop") +#elif defined(__GNUC__) +# define nssv_RESTORE_WARNINGS() _Pragma("GCC diagnostic pop") +#elif nssv_COMPILER_MSVC_VERSION >= 140 +# define nssv_RESTORE_WARNINGS() __pragma(warning(pop )) +#else +# define nssv_RESTORE_WARNINGS() +#endif + +// Suppress the following MSVC (GSL) warnings: +// - C4455, non-gsl : 'operator ""sv': literal suffix identifiers that do not +// start with an underscore are reserved +// - C26472, gsl::t.1 : don't use a static_cast for arithmetic conversions; +// use brace initialization, gsl::narrow_cast or gsl::narow +// - C26481: gsl::b.1 : don't use pointer arithmetic. Use span instead + +nssv_DISABLE_MSVC_WARNINGS( 4455 26481 26472 ) +//nssv_DISABLE_CLANG_WARNINGS( "-Wuser-defined-literals" ) +//nssv_DISABLE_GNUC_WARNINGS( -Wliteral-suffix ) + +namespace nonstd { namespace sv_lite { + +namespace detail { + +// support constexpr comparison in C++14; +// for C++17 and later, use provided traits: + +template< typename CharT > +inline nssv_constexpr14 int compare( CharT const * s1, CharT const * s2, std::size_t count ) +{ + while ( count-- != 0 ) + { + if ( *s1 < *s2 ) return -1; + if ( *s1 > *s2 ) return +1; + ++s1; ++s2; + } + return 0; +} + +#if nssv_HAVE_BUILTIN_MEMCMP + +// specialization of compare() for char, see also generic compare() above: + +inline nssv_constexpr14 int compare( char const * s1, char const * s2, std::size_t count ) +{ + return nssv_BUILTIN_MEMCMP( s1, s2, count ); +} + +#endif + +#if nssv_HAVE_BUILTIN_STRLEN + +// specialization of length() for char, see also generic length() further below: + +inline nssv_constexpr std::size_t length( char const * s ) +{ + return nssv_BUILTIN_STRLEN( s ); +} + +#endif + +#if defined(__OPTIMIZE__) + +// gcc, clang provide __OPTIMIZE__ +// Expect tail call optimization to make length() non-recursive: + +template< typename CharT > +inline nssv_constexpr std::size_t length( CharT * s, std::size_t result = 0 ) +{ + return *s == '\0' ? 
result : length( s + 1, result + 1 ); +} + +#else // OPTIMIZE + +// non-recursive: + +template< typename CharT > +inline nssv_constexpr14 std::size_t length( CharT * s ) +{ + std::size_t result = 0; + while ( *s++ != '\0' ) + { + ++result; + } + return result; +} + +#endif // OPTIMIZE + +} // namespace detail + +template +< + class CharT, + class Traits = std::char_traits +> +class basic_string_view; + +// +// basic_string_view: +// + +template +< + class CharT, + class Traits /* = std::char_traits */ +> +class basic_string_view +{ +public: + // Member types: + + typedef Traits traits_type; + typedef CharT value_type; + + typedef CharT * pointer; + typedef CharT const * const_pointer; + typedef CharT & reference; + typedef CharT const & const_reference; + + typedef const_pointer iterator; + typedef const_pointer const_iterator; + typedef std::reverse_iterator< const_iterator > reverse_iterator; + typedef std::reverse_iterator< const_iterator > const_reverse_iterator; + + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // 24.4.2.1 Construction and assignment: + + nssv_constexpr basic_string_view() nssv_noexcept + : data_( nssv_nullptr ) + , size_( 0 ) + {} + +#if nssv_CPP11_OR_GREATER + nssv_constexpr basic_string_view( basic_string_view const & other ) nssv_noexcept = default; +#else + nssv_constexpr basic_string_view( basic_string_view const & other ) nssv_noexcept + : data_( other.data_) + , size_( other.size_) + {} +#endif + + nssv_constexpr basic_string_view( CharT const * s, size_type count ) nssv_noexcept // non-standard noexcept + : data_( s ) + , size_( count ) + {} + + nssv_constexpr basic_string_view( CharT const * s) nssv_noexcept // non-standard noexcept + : data_( s ) +#if nssv_CPP17_OR_GREATER + , size_( Traits::length(s) ) +#elif nssv_CPP11_OR_GREATER + , size_( detail::length(s) ) +#else + , size_( Traits::length(s) ) +#endif + {} + + // Assignment: + +#if nssv_CPP11_OR_GREATER + nssv_constexpr14 basic_string_view & operator=( basic_string_view const & other ) nssv_noexcept = default; +#else + nssv_constexpr14 basic_string_view & operator=( basic_string_view const & other ) nssv_noexcept + { + data_ = other.data_; + size_ = other.size_; + return *this; + } +#endif + + // 24.4.2.2 Iterator support: + + nssv_constexpr const_iterator begin() const nssv_noexcept { return data_; } + nssv_constexpr const_iterator end() const nssv_noexcept { return data_ + size_; } + + nssv_constexpr const_iterator cbegin() const nssv_noexcept { return begin(); } + nssv_constexpr const_iterator cend() const nssv_noexcept { return end(); } + + nssv_constexpr const_reverse_iterator rbegin() const nssv_noexcept { return const_reverse_iterator( end() ); } + nssv_constexpr const_reverse_iterator rend() const nssv_noexcept { return const_reverse_iterator( begin() ); } + + nssv_constexpr const_reverse_iterator crbegin() const nssv_noexcept { return rbegin(); } + nssv_constexpr const_reverse_iterator crend() const nssv_noexcept { return rend(); } + + // 24.4.2.3 Capacity: + + nssv_constexpr size_type size() const nssv_noexcept { return size_; } + nssv_constexpr size_type length() const nssv_noexcept { return size_; } + nssv_constexpr size_type max_size() const nssv_noexcept { return (std::numeric_limits< size_type >::max)(); } + + // since C++20 + nssv_nodiscard nssv_constexpr bool empty() const nssv_noexcept + { + return 0 == size_; + } + + // 24.4.2.4 Element access: + + nssv_constexpr const_reference operator[]( size_type pos ) const + { + return data_at( pos ); + } + + 
nssv_constexpr14 const_reference at( size_type pos ) const + { +#if nssv_CONFIG_NO_EXCEPTIONS + assert( pos < size() ); +#else + if ( pos >= size() ) + { + throw std::out_of_range("nonstd::string_view::at()"); + } +#endif + return data_at( pos ); + } + + nssv_constexpr const_reference front() const { return data_at( 0 ); } + nssv_constexpr const_reference back() const { return data_at( size() - 1 ); } + + nssv_constexpr const_pointer data() const nssv_noexcept { return data_; } + + // 24.4.2.5 Modifiers: + + nssv_constexpr14 void remove_prefix( size_type n ) + { + assert( n <= size() ); + data_ += n; + size_ -= n; + } + + nssv_constexpr14 void remove_suffix( size_type n ) + { + assert( n <= size() ); + size_ -= n; + } + + nssv_constexpr14 void swap( basic_string_view & other ) nssv_noexcept + { + const basic_string_view tmp(other); + other = *this; + *this = tmp; + } + + // 24.4.2.6 String operations: + + size_type copy( CharT * dest, size_type n, size_type pos = 0 ) const + { +#if nssv_CONFIG_NO_EXCEPTIONS + assert( pos <= size() ); +#else + if ( pos > size() ) + { + throw std::out_of_range("nonstd::string_view::copy()"); + } +#endif + const size_type rlen = (std::min)( n, size() - pos ); + + (void) Traits::copy( dest, data() + pos, rlen ); + + return rlen; + } + + nssv_constexpr14 basic_string_view substr( size_type pos = 0, size_type n = npos ) const + { +#if nssv_CONFIG_NO_EXCEPTIONS + assert( pos <= size() ); +#else + if ( pos > size() ) + { + throw std::out_of_range("nonstd::string_view::substr()"); + } +#endif + return basic_string_view( data() + pos, (std::min)( n, size() - pos ) ); + } + + // compare(), 6x: + + nssv_constexpr14 int compare( basic_string_view other ) const nssv_noexcept // (1) + { +#if nssv_CPP17_OR_GREATER + if ( const int result = Traits::compare( data(), other.data(), (std::min)( size(), other.size() ) ) ) +#else + if ( const int result = detail::compare( data(), other.data(), (std::min)( size(), other.size() ) ) ) +#endif + { + return result; + } + + return size() == other.size() ? 0 : size() < other.size() ? 
-1 : 1; + } + + nssv_constexpr int compare( size_type pos1, size_type n1, basic_string_view other ) const // (2) + { + return substr( pos1, n1 ).compare( other ); + } + + nssv_constexpr int compare( size_type pos1, size_type n1, basic_string_view other, size_type pos2, size_type n2 ) const // (3) + { + return substr( pos1, n1 ).compare( other.substr( pos2, n2 ) ); + } + + nssv_constexpr int compare( CharT const * s ) const // (4) + { + return compare( basic_string_view( s ) ); + } + + nssv_constexpr int compare( size_type pos1, size_type n1, CharT const * s ) const // (5) + { + return substr( pos1, n1 ).compare( basic_string_view( s ) ); + } + + nssv_constexpr int compare( size_type pos1, size_type n1, CharT const * s, size_type n2 ) const // (6) + { + return substr( pos1, n1 ).compare( basic_string_view( s, n2 ) ); + } + + // 24.4.2.7 Searching: + + // starts_with(), 3x, since C++20: + + nssv_constexpr bool starts_with( basic_string_view v ) const nssv_noexcept // (1) + { + return size() >= v.size() && compare( 0, v.size(), v ) == 0; + } + + nssv_constexpr bool starts_with( CharT c ) const nssv_noexcept // (2) + { + return starts_with( basic_string_view( &c, 1 ) ); + } + + nssv_constexpr bool starts_with( CharT const * s ) const // (3) + { + return starts_with( basic_string_view( s ) ); + } + + // ends_with(), 3x, since C++20: + + nssv_constexpr bool ends_with( basic_string_view v ) const nssv_noexcept // (1) + { + return size() >= v.size() && compare( size() - v.size(), npos, v ) == 0; + } + + nssv_constexpr bool ends_with( CharT c ) const nssv_noexcept // (2) + { + return ends_with( basic_string_view( &c, 1 ) ); + } + + nssv_constexpr bool ends_with( CharT const * s ) const // (3) + { + return ends_with( basic_string_view( s ) ); + } + + // find(), 4x: + + nssv_constexpr14 size_type find( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1) + { + return assert( v.size() == 0 || v.data() != nssv_nullptr ) + , pos >= size() + ? npos + : to_pos( std::search( cbegin() + pos, cend(), v.cbegin(), v.cend(), Traits::eq ) ); + } + + nssv_constexpr14 size_type find( CharT c, size_type pos = 0 ) const nssv_noexcept // (2) + { + return find( basic_string_view( &c, 1 ), pos ); + } + + nssv_constexpr14 size_type find( CharT const * s, size_type pos, size_type n ) const // (3) + { + return find( basic_string_view( s, n ), pos ); + } + + nssv_constexpr14 size_type find( CharT const * s, size_type pos = 0 ) const // (4) + { + return find( basic_string_view( s ), pos ); + } + + // rfind(), 4x: + + nssv_constexpr14 size_type rfind( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1) + { + if ( size() < v.size() ) + { + return npos; + } + + if ( v.empty() ) + { + return (std::min)( size(), pos ); + } + + const_iterator last = cbegin() + (std::min)( size() - v.size(), pos ) + v.size(); + const_iterator result = std::find_end( cbegin(), last, v.cbegin(), v.cend(), Traits::eq ); + + return result != last ? 
size_type( result - cbegin() ) : npos; + } + + nssv_constexpr14 size_type rfind( CharT c, size_type pos = npos ) const nssv_noexcept // (2) + { + return rfind( basic_string_view( &c, 1 ), pos ); + } + + nssv_constexpr14 size_type rfind( CharT const * s, size_type pos, size_type n ) const // (3) + { + return rfind( basic_string_view( s, n ), pos ); + } + + nssv_constexpr14 size_type rfind( CharT const * s, size_type pos = npos ) const // (4) + { + return rfind( basic_string_view( s ), pos ); + } + + // find_first_of(), 4x: + + nssv_constexpr size_type find_first_of( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1) + { + return pos >= size() + ? npos + : to_pos( std::find_first_of( cbegin() + pos, cend(), v.cbegin(), v.cend(), Traits::eq ) ); + } + + nssv_constexpr size_type find_first_of( CharT c, size_type pos = 0 ) const nssv_noexcept // (2) + { + return find_first_of( basic_string_view( &c, 1 ), pos ); + } + + nssv_constexpr size_type find_first_of( CharT const * s, size_type pos, size_type n ) const // (3) + { + return find_first_of( basic_string_view( s, n ), pos ); + } + + nssv_constexpr size_type find_first_of( CharT const * s, size_type pos = 0 ) const // (4) + { + return find_first_of( basic_string_view( s ), pos ); + } + + // find_last_of(), 4x: + + nssv_constexpr size_type find_last_of( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1) + { + return empty() + ? npos + : pos >= size() + ? find_last_of( v, size() - 1 ) + : to_pos( std::find_first_of( const_reverse_iterator( cbegin() + pos + 1 ), crend(), v.cbegin(), v.cend(), Traits::eq ) ); + } + + nssv_constexpr size_type find_last_of( CharT c, size_type pos = npos ) const nssv_noexcept // (2) + { + return find_last_of( basic_string_view( &c, 1 ), pos ); + } + + nssv_constexpr size_type find_last_of( CharT const * s, size_type pos, size_type count ) const // (3) + { + return find_last_of( basic_string_view( s, count ), pos ); + } + + nssv_constexpr size_type find_last_of( CharT const * s, size_type pos = npos ) const // (4) + { + return find_last_of( basic_string_view( s ), pos ); + } + + // find_first_not_of(), 4x: + + nssv_constexpr size_type find_first_not_of( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1) + { + return pos >= size() + ? npos + : to_pos( std::find_if( cbegin() + pos, cend(), not_in_view( v ) ) ); + } + + nssv_constexpr size_type find_first_not_of( CharT c, size_type pos = 0 ) const nssv_noexcept // (2) + { + return find_first_not_of( basic_string_view( &c, 1 ), pos ); + } + + nssv_constexpr size_type find_first_not_of( CharT const * s, size_type pos, size_type count ) const // (3) + { + return find_first_not_of( basic_string_view( s, count ), pos ); + } + + nssv_constexpr size_type find_first_not_of( CharT const * s, size_type pos = 0 ) const // (4) + { + return find_first_not_of( basic_string_view( s ), pos ); + } + + // find_last_not_of(), 4x: + + nssv_constexpr size_type find_last_not_of( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1) + { + return empty() + ? npos + : pos >= size() + ? 
find_last_not_of( v, size() - 1 ) + : to_pos( std::find_if( const_reverse_iterator( cbegin() + pos + 1 ), crend(), not_in_view( v ) ) ); + } + + nssv_constexpr size_type find_last_not_of( CharT c, size_type pos = npos ) const nssv_noexcept // (2) + { + return find_last_not_of( basic_string_view( &c, 1 ), pos ); + } + + nssv_constexpr size_type find_last_not_of( CharT const * s, size_type pos, size_type count ) const // (3) + { + return find_last_not_of( basic_string_view( s, count ), pos ); + } + + nssv_constexpr size_type find_last_not_of( CharT const * s, size_type pos = npos ) const // (4) + { + return find_last_not_of( basic_string_view( s ), pos ); + } + + // Constants: + +#if nssv_CPP17_OR_GREATER + static nssv_constexpr size_type npos = size_type(-1); +#elif nssv_CPP11_OR_GREATER + enum : size_type { npos = size_type(-1) }; +#else + enum { npos = size_type(-1) }; +#endif + +private: + struct not_in_view + { + const basic_string_view v; + + nssv_constexpr explicit not_in_view( basic_string_view v_ ) : v( v_ ) {} + + nssv_constexpr bool operator()( CharT c ) const + { + return npos == v.find_first_of( c ); + } + }; + + nssv_constexpr size_type to_pos( const_iterator it ) const + { + return it == cend() ? npos : size_type( it - cbegin() ); + } + + nssv_constexpr size_type to_pos( const_reverse_iterator it ) const + { + return it == crend() ? npos : size_type( crend() - it - 1 ); + } + + nssv_constexpr const_reference data_at( size_type pos ) const + { +#if nssv_BETWEEN( nssv_COMPILER_GNUC_VERSION, 1, 500 ) + return data_[pos]; +#else + return assert( pos < size() ), data_[pos]; +#endif + } + +private: + const_pointer data_; + size_type size_; + +public: +#if nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS + + template< class Allocator > + basic_string_view( std::basic_string const & s ) nssv_noexcept + : data_( s.data() ) + , size_( s.size() ) + {} + +#if nssv_HAVE_EXPLICIT_CONVERSION + + template< class Allocator > + explicit operator std::basic_string() const + { + return to_string( Allocator() ); + } + +#endif // nssv_HAVE_EXPLICIT_CONVERSION + +#if nssv_CPP11_OR_GREATER + + template< class Allocator = std::allocator > + std::basic_string + to_string( Allocator const & a = Allocator() ) const + { + return std::basic_string( begin(), end(), a ); + } + +#else + + std::basic_string + to_string() const + { + return std::basic_string( begin(), end() ); + } + + template< class Allocator > + std::basic_string + to_string( Allocator const & a ) const + { + return std::basic_string( begin(), end(), a ); + } + +#endif // nssv_CPP11_OR_GREATER + +#endif // nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS +}; + +// +// Non-member functions: +// + +// 24.4.3 Non-member comparison functions: +// lexicographically compare two string views (function template): + +template< class CharT, class Traits > +nssv_constexpr bool operator== ( + basic_string_view lhs, + basic_string_view rhs ) nssv_noexcept +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } + +template< class CharT, class Traits > +nssv_constexpr bool operator!= ( + basic_string_view lhs, + basic_string_view rhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +template< class CharT, class Traits > +nssv_constexpr bool operator< ( + basic_string_view lhs, + basic_string_view rhs ) nssv_noexcept +{ return lhs.compare( rhs ) < 0; } + +template< class CharT, class Traits > +nssv_constexpr bool operator<= ( + basic_string_view lhs, + basic_string_view rhs ) nssv_noexcept +{ return lhs.compare( rhs ) <= 0; } + +template< class CharT, 
class Traits > +nssv_constexpr bool operator> ( + basic_string_view lhs, + basic_string_view rhs ) nssv_noexcept +{ return lhs.compare( rhs ) > 0; } + +template< class CharT, class Traits > +nssv_constexpr bool operator>= ( + basic_string_view lhs, + basic_string_view rhs ) nssv_noexcept +{ return lhs.compare( rhs ) >= 0; } + +// Let S be basic_string_view, and sv be an instance of S. +// Implementations shall provide sufficient additional overloads marked +// constexpr and noexcept so that an object t with an implicit conversion +// to S can be compared according to Table 67. + +#if ! nssv_CPP11_OR_GREATER || nssv_BETWEEN( nssv_COMPILER_MSVC_VERSION, 100, 141 ) + +// accommodate for older compilers: + +// == + +template< class CharT, class Traits> +nssv_constexpr bool operator==( + basic_string_view lhs, + CharT const * rhs ) nssv_noexcept +{ return lhs.size() == detail::length( rhs ) && lhs.compare( rhs ) == 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator==( + CharT const * lhs, + basic_string_view rhs ) nssv_noexcept +{ return detail::length( lhs ) == rhs.size() && rhs.compare( lhs ) == 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator==( + basic_string_view lhs, + std::basic_string rhs ) nssv_noexcept +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator==( + std::basic_string rhs, + basic_string_view lhs ) nssv_noexcept +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } + +// != + +template< class CharT, class Traits> +nssv_constexpr bool operator!=( + basic_string_view lhs, + CharT const * rhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +template< class CharT, class Traits> +nssv_constexpr bool operator!=( + CharT const * lhs, + basic_string_view rhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +template< class CharT, class Traits> +nssv_constexpr bool operator!=( + basic_string_view lhs, + std::basic_string rhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +template< class CharT, class Traits> +nssv_constexpr bool operator!=( + std::basic_string rhs, + basic_string_view lhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +// < + +template< class CharT, class Traits> +nssv_constexpr bool operator<( + basic_string_view lhs, + CharT const * rhs ) nssv_noexcept +{ return lhs.compare( rhs ) < 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator<( + CharT const * lhs, + basic_string_view rhs ) nssv_noexcept +{ return rhs.compare( lhs ) > 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator<( + basic_string_view lhs, + std::basic_string rhs ) nssv_noexcept +{ return lhs.compare( rhs ) < 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator<( + std::basic_string rhs, + basic_string_view lhs ) nssv_noexcept +{ return rhs.compare( lhs ) > 0; } + +// <= + +template< class CharT, class Traits> +nssv_constexpr bool operator<=( + basic_string_view lhs, + CharT const * rhs ) nssv_noexcept +{ return lhs.compare( rhs ) <= 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator<=( + CharT const * lhs, + basic_string_view rhs ) nssv_noexcept +{ return rhs.compare( lhs ) >= 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator<=( + basic_string_view lhs, + std::basic_string rhs ) nssv_noexcept +{ return lhs.compare( rhs ) <= 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator<=( + std::basic_string rhs, + basic_string_view lhs 
) nssv_noexcept +{ return rhs.compare( lhs ) >= 0; } + +// > + +template< class CharT, class Traits> +nssv_constexpr bool operator>( + basic_string_view lhs, + CharT const * rhs ) nssv_noexcept +{ return lhs.compare( rhs ) > 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator>( + CharT const * lhs, + basic_string_view rhs ) nssv_noexcept +{ return rhs.compare( lhs ) < 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator>( + basic_string_view lhs, + std::basic_string rhs ) nssv_noexcept +{ return lhs.compare( rhs ) > 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator>( + std::basic_string rhs, + basic_string_view lhs ) nssv_noexcept +{ return rhs.compare( lhs ) < 0; } + +// >= + +template< class CharT, class Traits> +nssv_constexpr bool operator>=( + basic_string_view lhs, + CharT const * rhs ) nssv_noexcept +{ return lhs.compare( rhs ) >= 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator>=( + CharT const * lhs, + basic_string_view rhs ) nssv_noexcept +{ return rhs.compare( lhs ) <= 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator>=( + basic_string_view lhs, + std::basic_string rhs ) nssv_noexcept +{ return lhs.compare( rhs ) >= 0; } + +template< class CharT, class Traits> +nssv_constexpr bool operator>=( + std::basic_string rhs, + basic_string_view lhs ) nssv_noexcept +{ return rhs.compare( lhs ) <= 0; } + +#else // newer compilers: + +#define nssv_BASIC_STRING_VIEW_I(T,U) typename std::decay< basic_string_view >::type + +#if defined(_MSC_VER) // issue 40 +# define nssv_MSVC_ORDER(x) , int=x +#else +# define nssv_MSVC_ORDER(x) /*, int=x*/ +#endif + +// == + +template< class CharT, class Traits nssv_MSVC_ORDER(1) > +nssv_constexpr bool operator==( + basic_string_view lhs, + nssv_BASIC_STRING_VIEW_I(CharT, Traits) rhs ) nssv_noexcept +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } + +template< class CharT, class Traits nssv_MSVC_ORDER(2) > +nssv_constexpr bool operator==( + nssv_BASIC_STRING_VIEW_I(CharT, Traits) lhs, + basic_string_view rhs ) nssv_noexcept +{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; } + +// != + +template< class CharT, class Traits nssv_MSVC_ORDER(1) > +nssv_constexpr bool operator!= ( + basic_string_view < CharT, Traits > lhs, + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +template< class CharT, class Traits nssv_MSVC_ORDER(2) > +nssv_constexpr bool operator!= ( + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, + basic_string_view < CharT, Traits > rhs ) nssv_noexcept +{ return !( lhs == rhs ); } + +// < + +template< class CharT, class Traits nssv_MSVC_ORDER(1) > +nssv_constexpr bool operator< ( + basic_string_view < CharT, Traits > lhs, + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept +{ return lhs.compare( rhs ) < 0; } + +template< class CharT, class Traits nssv_MSVC_ORDER(2) > +nssv_constexpr bool operator< ( + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, + basic_string_view < CharT, Traits > rhs ) nssv_noexcept +{ return lhs.compare( rhs ) < 0; } + +// <= + +template< class CharT, class Traits nssv_MSVC_ORDER(1) > +nssv_constexpr bool operator<= ( + basic_string_view < CharT, Traits > lhs, + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept +{ return lhs.compare( rhs ) <= 0; } + +template< class CharT, class Traits nssv_MSVC_ORDER(2) > +nssv_constexpr bool operator<= ( + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, + basic_string_view < 
CharT, Traits > rhs ) nssv_noexcept +{ return lhs.compare( rhs ) <= 0; } + +// > + +template< class CharT, class Traits nssv_MSVC_ORDER(1) > +nssv_constexpr bool operator> ( + basic_string_view < CharT, Traits > lhs, + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept +{ return lhs.compare( rhs ) > 0; } + +template< class CharT, class Traits nssv_MSVC_ORDER(2) > +nssv_constexpr bool operator> ( + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, + basic_string_view < CharT, Traits > rhs ) nssv_noexcept +{ return lhs.compare( rhs ) > 0; } + +// >= + +template< class CharT, class Traits nssv_MSVC_ORDER(1) > +nssv_constexpr bool operator>= ( + basic_string_view < CharT, Traits > lhs, + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept +{ return lhs.compare( rhs ) >= 0; } + +template< class CharT, class Traits nssv_MSVC_ORDER(2) > +nssv_constexpr bool operator>= ( + nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs, + basic_string_view < CharT, Traits > rhs ) nssv_noexcept +{ return lhs.compare( rhs ) >= 0; } + +#undef nssv_MSVC_ORDER +#undef nssv_BASIC_STRING_VIEW_I + +#endif // compiler-dependent approach to comparisons + +// 24.4.4 Inserters and extractors: + +#if ! nssv_CONFIG_NO_STREAM_INSERTION + +namespace detail { + +template< class Stream > +void write_padding( Stream & os, std::streamsize n ) +{ + for ( std::streamsize i = 0; i < n; ++i ) + os.rdbuf()->sputc( os.fill() ); +} + +template< class Stream, class View > +Stream & write_to_stream( Stream & os, View const & sv ) +{ + typename Stream::sentry sentry( os ); + + if ( !os ) + return os; + + const std::streamsize length = static_cast( sv.length() ); + + // Whether, and how, to pad: + const bool pad = ( length < os.width() ); + const bool left_pad = pad && ( os.flags() & std::ios_base::adjustfield ) == std::ios_base::right; + + if ( left_pad ) + write_padding( os, os.width() - length ); + + // Write span characters: + os.rdbuf()->sputn( sv.begin(), length ); + + if ( pad && !left_pad ) + write_padding( os, os.width() - length ); + + // Reset output stream width: + os.width( 0 ); + + return os; +} + +} // namespace detail + +template< class CharT, class Traits > +std::basic_ostream & +operator<<( + std::basic_ostream& os, + basic_string_view sv ) +{ + return detail::write_to_stream( os, sv ); +} + +#endif // nssv_CONFIG_NO_STREAM_INSERTION + +// Several typedefs for common character types are provided: + +typedef basic_string_view string_view; +typedef basic_string_view wstring_view; +#if nssv_HAVE_WCHAR16_T +typedef basic_string_view u16string_view; +typedef basic_string_view u32string_view; +#endif + +}} // namespace nonstd::sv_lite + +// +// 24.4.6 Suffix for basic_string_view literals: +// + +#if nssv_HAVE_USER_DEFINED_LITERALS + +namespace nonstd { +nssv_inline_ns namespace literals { +nssv_inline_ns namespace string_view_literals { + +#if nssv_CONFIG_STD_SV_OPERATOR && nssv_HAVE_STD_DEFINED_LITERALS + +nssv_constexpr nonstd::sv_lite::string_view operator "" sv( const char* str, size_t len ) nssv_noexcept // (1) +{ + return nonstd::sv_lite::string_view{ str, len }; +} + +nssv_constexpr nonstd::sv_lite::u16string_view operator "" sv( const char16_t* str, size_t len ) nssv_noexcept // (2) +{ + return nonstd::sv_lite::u16string_view{ str, len }; +} + +nssv_constexpr nonstd::sv_lite::u32string_view operator "" sv( const char32_t* str, size_t len ) nssv_noexcept // (3) +{ + return nonstd::sv_lite::u32string_view{ str, len }; +} + +nssv_constexpr nonstd::sv_lite::wstring_view operator "" sv( const wchar_t* str, 
size_t len ) nssv_noexcept // (4) +{ + return nonstd::sv_lite::wstring_view{ str, len }; +} + +#endif // nssv_CONFIG_STD_SV_OPERATOR && nssv_HAVE_STD_DEFINED_LITERALS + +#if nssv_CONFIG_USR_SV_OPERATOR + +nssv_constexpr nonstd::sv_lite::string_view operator "" _sv( const char* str, size_t len ) nssv_noexcept // (1) +{ + return nonstd::sv_lite::string_view{ str, len }; +} + +nssv_constexpr nonstd::sv_lite::u16string_view operator "" _sv( const char16_t* str, size_t len ) nssv_noexcept // (2) +{ + return nonstd::sv_lite::u16string_view{ str, len }; +} + +nssv_constexpr nonstd::sv_lite::u32string_view operator "" _sv( const char32_t* str, size_t len ) nssv_noexcept // (3) +{ + return nonstd::sv_lite::u32string_view{ str, len }; +} + +nssv_constexpr nonstd::sv_lite::wstring_view operator "" _sv( const wchar_t* str, size_t len ) nssv_noexcept // (4) +{ + return nonstd::sv_lite::wstring_view{ str, len }; +} + +#endif // nssv_CONFIG_USR_SV_OPERATOR + +}}} // namespace nonstd::literals::string_view_literals + +#endif + +// +// Extensions for std::string: +// + +#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS + +namespace nonstd { +namespace sv_lite { + +// Exclude MSVC 14 (19.00): it yields ambiguous to_string(): + +#if nssv_CPP11_OR_GREATER && nssv_COMPILER_MSVC_VERSION != 140 + +template< class CharT, class Traits, class Allocator = std::allocator > +std::basic_string +to_string( basic_string_view v, Allocator const & a = Allocator() ) +{ + return std::basic_string( v.begin(), v.end(), a ); +} + +#else + +template< class CharT, class Traits > +std::basic_string +to_string( basic_string_view v ) +{ + return std::basic_string( v.begin(), v.end() ); +} + +template< class CharT, class Traits, class Allocator > +std::basic_string +to_string( basic_string_view v, Allocator const & a ) +{ + return std::basic_string( v.begin(), v.end(), a ); +} + +#endif // nssv_CPP11_OR_GREATER + +template< class CharT, class Traits, class Allocator > +basic_string_view +to_string_view( std::basic_string const & s ) +{ + return basic_string_view( s.data(), s.size() ); +} + +}} // namespace nonstd::sv_lite + +#endif // nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS + +// +// make types and algorithms available in namespace nonstd: +// + +namespace nonstd { + +using sv_lite::basic_string_view; +using sv_lite::string_view; +using sv_lite::wstring_view; + +#if nssv_HAVE_WCHAR16_T +using sv_lite::u16string_view; +#endif +#if nssv_HAVE_WCHAR32_T +using sv_lite::u32string_view; +#endif + +// literal "sv" + +using sv_lite::operator==; +using sv_lite::operator!=; +using sv_lite::operator<; +using sv_lite::operator<=; +using sv_lite::operator>; +using sv_lite::operator>=; + +#if ! nssv_CONFIG_NO_STREAM_INSERTION +using sv_lite::operator<<; +#endif + +#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS +using sv_lite::to_string; +using sv_lite::to_string_view; +#endif + +} // namespace nonstd + +// 24.4.5 Hash support (C++11): + +// Note: The hash value of a string view object is equal to the hash value of +// the corresponding string object. 
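For reference, a minimal usage sketch of the string-view-lite block above. It is illustrative only, not part of the patch: it assumes a C++11-or-newer compiler, that the vendored header is reachable as "nonstd/string_view.hpp", and that the default configuration is in effect (the user-defined "_sv" suffix, the std::string conversion helpers, and the std::hash support added just below).

#include <cstdio>
#include <string>
#include <unordered_map>
#include "nonstd/string_view.hpp" // assumed include path for the vendored header

int main() {
    using namespace nonstd::literals::string_view_literals;

    nonstd::string_view key = "width"_sv;        // user-defined literal, enabled by nssv_CONFIG_USR_SV_OPERATOR
    std::string owned = nonstd::to_string(key);  // copies the viewed characters into a std::string

    // The std::hash specializations added below make string_view usable as a map key.
    std::unordered_map<nonstd::string_view, int> settings;
    settings[key] = 1024;
    settings["height"_sv] = 768;

    std::printf("%.*s=%d (%zu entries, owned copy '%s')\n",
                int(key.size()), key.data(), settings[key], settings.size(), owned.c_str());
    return 0;
}

In this vendored simdjson copy the header only comes into play when std::string_view is unavailable; nonstd::string_view is then aliased into namespace std (see the `using string_view = nonstd::string_view;` a little further down), so C++17 builds never instantiate it.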
+ +#if nssv_HAVE_STD_HASH + +#include + +namespace std { + +template<> +struct hash< nonstd::string_view > +{ +public: + std::size_t operator()( nonstd::string_view v ) const nssv_noexcept + { + return std::hash()( std::string( v.data(), v.size() ) ); + } +}; + +template<> +struct hash< nonstd::wstring_view > +{ +public: + std::size_t operator()( nonstd::wstring_view v ) const nssv_noexcept + { + return std::hash()( std::wstring( v.data(), v.size() ) ); + } +}; + +template<> +struct hash< nonstd::u16string_view > +{ +public: + std::size_t operator()( nonstd::u16string_view v ) const nssv_noexcept + { + return std::hash()( std::u16string( v.data(), v.size() ) ); + } +}; + +template<> +struct hash< nonstd::u32string_view > +{ +public: + std::size_t operator()( nonstd::u32string_view v ) const nssv_noexcept + { + return std::hash()( std::u32string( v.data(), v.size() ) ); + } +}; + +} // namespace std + +#endif // nssv_HAVE_STD_HASH + +nssv_RESTORE_WARNINGS() + +#endif // nssv_HAVE_STD_STRING_VIEW +#endif // NONSTD_SV_LITE_H_INCLUDED +/* end file include/simdjson/nonstd/string_view.hpp */ +SIMDJSON_POP_DISABLE_WARNINGS + +namespace std { + using string_view = nonstd::string_view; +} +#endif // SIMDJSON_HAS_STRING_VIEW +#undef SIMDJSON_HAS_STRING_VIEW // We are not going to need this macro anymore. + +/// If EXPR is an error, returns it. +#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } + +// Unless the programmer has already set SIMDJSON_DEVELOPMENT_CHECKS, +// we want to set it under debug builds. We detect a debug build +// under Visual Studio when the _DEBUG macro is set. Under the other +// compilers, we use the fact that they define __OPTIMIZE__ whenever +// they allow optimizations. +// It is possible that this could miss some cases where SIMDJSON_DEVELOPMENT_CHECKS +// is helpful, but the programmer can set the macro SIMDJSON_DEVELOPMENT_CHECKS. +// It could also wrongly set SIMDJSON_DEVELOPMENT_CHECKS (e.g., if the programmer +// sets _DEBUG in a release build under Visual Studio, or if some compiler fails to +// set the __OPTIMIZE__ macro). +#ifndef SIMDJSON_DEVELOPMENT_CHECKS +#ifdef _MSC_VER +// Visual Studio seems to set _DEBUG for debug builds. +#ifdef _DEBUG +#define SIMDJSON_DEVELOPMENT_CHECKS 1 +#endif // _DEBUG +#else // _MSC_VER +// All other compilers appear to set __OPTIMIZE__ to a positive integer +// when the compiler is optimizing. +#ifndef __OPTIMIZE__ +#define SIMDJSON_DEVELOPMENT_CHECKS 1 +#endif // __OPTIMIZE__ +#endif // _MSC_VER +#endif // SIMDJSON_DEVELOPMENT_CHECKS + +// The SIMDJSON_CHECK_EOF macro is a feature flag for the "don't require padding" +// feature. 
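The SIMDJSON_TRY macro defined above is the library's early-return idiom: each step yields an error_code, and the first non-SUCCESS value is returned to the caller. A minimal sketch of that pattern, illustrative only and not part of the patch, assuming the vendored amalgamated "simdjson.h" is on the include path (dom::parser, the _padded literal, and error_message() are all declared later in this same header):

#include <cstdio>
#include "simdjson.h" // assumed include path for the vendored amalgamation

// Counts the keys of a top-level JSON object, propagating any simdjson error.
static simdjson::error_code count_keys(const simdjson::padded_string &json, size_t &count) {
    simdjson::dom::parser parser;
    simdjson::dom::element doc;
    SIMDJSON_TRY(parser.parse(json).get(doc)); // stops here on CAPACITY, TAPE_ERROR, ...

    simdjson::dom::object obj;
    SIMDJSON_TRY(doc.get(obj));                // INCORRECT_TYPE if the document is not an object
    count = obj.size();
    return simdjson::SUCCESS;
}

int main() {
    simdjson::padded_string json = R"({"a":1,"b":2})"_padded;
    size_t n = 0;
    if (auto err = count_keys(json, n)) {
        std::printf("error: %s\n", simdjson::error_message(err));
        return 1;
    }
    std::printf("keys: %zu\n", n);
    return 0;
}

Only SUCCESS converts to false, so the `if (err)` check reads naturally, and error-code handling like this keeps working even when SIMDJSON_EXCEPTIONS is disabled.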
+ +#if SIMDJSON_CPLUSPLUS17 +// if we have C++, then fallthrough is a default attribute +# define simdjson_fallthrough [[fallthrough]] +// check if we have __attribute__ support +#elif defined(__has_attribute) +// check if we have the __fallthrough__ attribute +#if __has_attribute(__fallthrough__) +// we are good to go: +# define simdjson_fallthrough __attribute__((__fallthrough__)) +#endif // __has_attribute(__fallthrough__) +#endif // SIMDJSON_CPLUSPLUS17 +// on some systems, we simply do not have support for fallthrough, so use a default: +#ifndef simdjson_fallthrough +# define simdjson_fallthrough do {} while (0) /* fallthrough */ +#endif // simdjson_fallthrough + +#endif // SIMDJSON_COMMON_DEFS_H +/* end file include/simdjson/common_defs.h */ + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_UNDESIRED_WARNINGS + +// Public API +/* begin file include/simdjson/error.h */ +#ifndef SIMDJSON_ERROR_H +#define SIMDJSON_ERROR_H + +#include + +namespace simdjson { + +/** + * All possible errors returned by simdjson. These error codes are subject to change + * and not all simdjson kernel returns the same error code given the same input: it is not + * well defined which error a given input should produce. + * + * Only SUCCESS evaluates to false as a Boolean. All other error codes will evaluate + * to true as a Boolean. + */ +enum error_code { + SUCCESS = 0, ///< No error + CAPACITY, ///< This parser can't support a document that big + MEMALLOC, ///< Error allocating memory, most likely out of memory + TAPE_ERROR, ///< Something went wrong, this is a generic error + DEPTH_ERROR, ///< Your document exceeds the user-specified depth limitation + STRING_ERROR, ///< Problem while parsing a string + T_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 't' + F_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'f' + N_ATOM_ERROR, ///< Problem while parsing an atom starting with the letter 'n' + NUMBER_ERROR, ///< Problem while parsing a number + UTF8_ERROR, ///< the input is not valid UTF-8 + UNINITIALIZED, ///< unknown error, or uninitialized document + EMPTY, ///< no structural element found + UNESCAPED_CHARS, ///< found unescaped characters in a string. + UNCLOSED_STRING, ///< missing quote at the end + UNSUPPORTED_ARCHITECTURE, ///< unsupported architecture + INCORRECT_TYPE, ///< JSON element has a different type than user expected + NUMBER_OUT_OF_RANGE, ///< JSON number does not fit in 64 bits + INDEX_OUT_OF_BOUNDS, ///< JSON array index too large + NO_SUCH_FIELD, ///< JSON field not found in object + IO_ERROR, ///< Error reading a file + INVALID_JSON_POINTER, ///< Invalid JSON pointer reference + INVALID_URI_FRAGMENT, ///< Invalid URI fragment + UNEXPECTED_ERROR, ///< indicative of a bug in simdjson + PARSER_IN_USE, ///< parser is already in use. + OUT_OF_ORDER_ITERATION, ///< tried to iterate an array or object out of order + INSUFFICIENT_PADDING, ///< The JSON doesn't have enough padding for simdjson to safely parse it. + INCOMPLETE_ARRAY_OR_OBJECT, ///< The document ends early. + SCALAR_DOCUMENT_AS_VALUE, ///< A scalar document is treated as a value. + OUT_OF_BOUNDS, ///< Attempted to access location outside of document. + TRAILING_CONTENT, ///< Unexpected trailing content in the JSON input + NUM_ERROR_CODES +}; + +/** + * Get the error message for the given error code. 
+ * + * dom::parser parser; + * dom::element doc; + * auto error = parser.parse("foo",3).get(doc); + * if (error) { printf("Error: %s\n", error_message(error)); } + * + * @return The error message. + */ +inline const char *error_message(error_code error) noexcept; + +/** + * Write the error message to the output stream + */ +inline std::ostream& operator<<(std::ostream& out, error_code error) noexcept; + +/** + * Exception thrown when an exception-supporting simdjson method is called + */ +struct simdjson_error : public std::exception { + /** + * Create an exception from a simdjson error code. + * @param error The error code + */ + simdjson_error(error_code error) noexcept : _error{error} { } + /** The error message */ + const char *what() const noexcept { return error_message(error()); } + /** The error code */ + error_code error() const noexcept { return _error; } +private: + /** The error code that was used */ + error_code _error; +}; + +namespace internal { + +/** + * The result of a simdjson operation that could fail. + * + * Gives the option of reading error codes, or throwing an exception by casting to the desired result. + * + * This is a base class for implementations that want to add functions to the result type for + * chaining. + * + * Override like: + * + * struct simdjson_result : public internal::simdjson_result_base { + * simdjson_result() noexcept : internal::simdjson_result_base() {} + * simdjson_result(error_code error) noexcept : internal::simdjson_result_base(error) {} + * simdjson_result(T &&value) noexcept : internal::simdjson_result_base(std::forward(value)) {} + * simdjson_result(T &&value, error_code error) noexcept : internal::simdjson_result_base(value, error) {} + * // Your extra methods here + * } + * + * Then any method returning simdjson_result will be chainable with your methods. + */ +template +struct simdjson_result_base : protected std::pair { + + /** + * Create a new empty result with error = UNINITIALIZED. + */ + simdjson_inline simdjson_result_base() noexcept; + + /** + * Create a new error result. + */ + simdjson_inline simdjson_result_base(error_code error) noexcept; + + /** + * Create a new successful result. + */ + simdjson_inline simdjson_result_base(T &&value) noexcept; + + /** + * Create a new result with both things (use if you don't want to branch when creating the result). + */ + simdjson_inline simdjson_result_base(T &&value, error_code error) noexcept; + + /** + * Move the value and the error to the provided variables. + * + * @param value The variable to assign the value to. May not be set if there is an error. + * @param error The variable to assign the error to. Set to SUCCESS if there is no error. + */ + simdjson_inline void tie(T &value, error_code &error) && noexcept; + + /** + * Move the value to the provided variable. + * + * @param value The variable to assign the value to. May not be set if there is an error. + */ + simdjson_inline error_code get(T &value) && noexcept; + + /** + * The error. + */ + simdjson_inline error_code error() const noexcept; + +#if SIMDJSON_EXCEPTIONS + + /** + * Get the result value. + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T& value() & noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& value() && noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. 
+ */ + simdjson_inline T&& take_value() && noexcept(false); + + /** + * Cast to the value (will throw on error). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline operator T&&() && noexcept(false); +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline const T& value_unsafe() const& noexcept; + + /** + * Take the result value (move it). This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline T&& value_unsafe() && noexcept; + +}; // struct simdjson_result_base + +} // namespace internal + +/** + * The result of a simdjson operation that could fail. + * + * Gives the option of reading error codes, or throwing an exception by casting to the desired result. + */ +template +struct simdjson_result : public internal::simdjson_result_base { + /** + * @private Create a new empty result with error = UNINITIALIZED. + */ + simdjson_inline simdjson_result() noexcept; + /** + * @private Create a new error result. + */ + simdjson_inline simdjson_result(T &&value) noexcept; + /** + * @private Create a new successful result. + */ + simdjson_inline simdjson_result(error_code error_code) noexcept; + /** + * @private Create a new result with both things (use if you don't want to branch when creating the result). + */ + simdjson_inline simdjson_result(T &&value, error_code error) noexcept; + + /** + * Move the value and the error to the provided variables. + * + * @param value The variable to assign the value to. May not be set if there is an error. + * @param error The variable to assign the error to. Set to SUCCESS if there is no error. + */ + simdjson_inline void tie(T &value, error_code &error) && noexcept; + + /** + * Move the value to the provided variable. + * + * @param value The variable to assign the value to. May not be set if there is an error. + */ + simdjson_warn_unused simdjson_inline error_code get(T &value) && noexcept; + + /** + * The error. + */ + simdjson_inline error_code error() const noexcept; + +#if SIMDJSON_EXCEPTIONS + + /** + * Get the result value. + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T& value() & noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& value() && noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& take_value() && noexcept(false); + + /** + * Cast to the value (will throw on error). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline operator T&&() && noexcept(false); +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline const T& value_unsafe() const& noexcept; + + /** + * Take the result value (move it). This function is safe if and only + * the error() method returns a value that evaluates to false. 
+ */ + simdjson_inline T&& value_unsafe() && noexcept; + +}; // struct simdjson_result + +#if SIMDJSON_EXCEPTIONS + +template +inline std::ostream& operator<<(std::ostream& out, simdjson_result value) { return out << value.value(); } +#endif // SIMDJSON_EXCEPTIONS + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +/** + * @deprecated This is an alias and will be removed, use error_code instead + */ +using ErrorValues [[deprecated("This is an alias and will be removed, use error_code instead")]] = error_code; + +/** + * @deprecated Error codes should be stored and returned as `error_code`, use `error_message()` instead. + */ +[[deprecated("Error codes should be stored and returned as `error_code`, use `error_message()` instead.")]] +inline const std::string error_message(int error) noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API +} // namespace simdjson + +#endif // SIMDJSON_ERROR_H +/* end file include/simdjson/error.h */ +/* begin file include/simdjson/minify.h */ +#ifndef SIMDJSON_MINIFY_H +#define SIMDJSON_MINIFY_H + +/* begin file include/simdjson/padded_string.h */ +#ifndef SIMDJSON_PADDED_STRING_H +#define SIMDJSON_PADDED_STRING_H + +#include +#include +#include +#include + +namespace simdjson { + +class padded_string_view; + +/** + * String with extra allocation for ease of use with parser::parse() + * + * This is a move-only class, it cannot be copied. + */ +struct padded_string final { + + /** + * Create a new, empty padded string. + */ + explicit inline padded_string() noexcept; + /** + * Create a new padded string buffer. + * + * @param length the size of the string. + */ + explicit inline padded_string(size_t length) noexcept; + /** + * Create a new padded string by copying the given input. + * + * @param data the buffer to copy + * @param length the number of bytes to copy + */ + explicit inline padded_string(const char *data, size_t length) noexcept; + /** + * Create a new padded string by copying the given input. + * + * @param str_ the string to copy + */ + inline padded_string(const std::string & str_ ) noexcept; + /** + * Create a new padded string by copying the given input. + * + * @param sv_ the string to copy + */ + inline padded_string(std::string_view sv_) noexcept; + /** + * Move one padded string into another. + * + * The original padded string will be reduced to zero capacity. + * + * @param o the string to move. + */ + inline padded_string(padded_string &&o) noexcept; + /** + * Move one padded string into another. + * + * The original padded string will be reduced to zero capacity. + * + * @param o the string to move. + */ + inline padded_string &operator=(padded_string &&o) noexcept; + inline void swap(padded_string &o) noexcept; + ~padded_string() noexcept; + + /** + * The length of the string. + * + * Does not include padding. + */ + size_t size() const noexcept; + + /** + * The length of the string. + * + * Does not include padding. + */ + size_t length() const noexcept; + + /** + * The string data. + **/ + const char *data() const noexcept; + const uint8_t *u8data() const noexcept { return static_cast(static_cast(data_ptr));} + + /** + * The string data. + **/ + char *data() noexcept; + + /** + * Create a std::string_view with the same content. + */ + operator std::string_view() const; + + /** + * Create a padded_string_view with the same content. + */ + operator padded_string_view() const noexcept; + + /** + * Load this padded string from a file. + * + * @return IO_ERROR on error. 
Be mindful that on some 32-bit systems, + * the file size might be limited to 2 GB. + * + * @param path the path to the file. + **/ + inline static simdjson_result load(std::string_view path) noexcept; + +private: + padded_string &operator=(const padded_string &o) = delete; + padded_string(const padded_string &o) = delete; + + size_t viable_size{0}; + char *data_ptr{nullptr}; + +}; // padded_string + +/** + * Send padded_string instance to an output stream. + * + * @param out The output stream. + * @param s The padded_string instance. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, const padded_string& s) { return out << s.data(); } + +#if SIMDJSON_EXCEPTIONS +/** + * Send padded_string instance to an output stream. + * + * @param out The output stream. + * @param s The padded_string instance. + * @throw simdjson_error if the result being printed has an error. If there is an error with the + * underlying output stream, that error will be propagated (simdjson_error will not be + * thrown). + */ +inline std::ostream& operator<<(std::ostream& out, simdjson_result &s) noexcept(false) { return out << s.value(); } +#endif + +} // namespace simdjson + +// This is deliberately outside of simdjson so that people get it without having to use the namespace +inline simdjson::padded_string operator "" _padded(const char *str, size_t len) { + return simdjson::padded_string(str, len); +} + +namespace simdjson { +namespace internal { + +// The allocate_padded_buffer function is a low-level function to allocate memory +// with padding so we can read past the "length" bytes safely. It is used by +// the padded_string class automatically. It returns nullptr in case +// of error: the caller should check for a null pointer. +// The length parameter is the maximum size in bytes of the string. +// The caller is responsible to free the memory (e.g., delete[] (...)). +inline char *allocate_padded_buffer(size_t length) noexcept; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_PADDED_STRING_H +/* end file include/simdjson/padded_string.h */ +#include +#include +#include + +namespace simdjson { + + + +/** + * + * Minify the input string assuming that it represents a JSON string, does not parse or validate. + * This function is much faster than parsing a JSON string and then writing a minified version of it. + * However, it does not validate the input. It will merely return an error in simple cases (e.g., if + * there is a string that was never terminated). + * + * + * @param buf the json document to minify. + * @param len the length of the json document. + * @param dst the buffer to write the minified document to. *MUST* be allocated up to len bytes. + * @param dst_len the number of bytes written. Output only. + * @return the error code, or SUCCESS if there was no error. + */ +simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept; + +} // namespace simdjson + +#endif // SIMDJSON_MINIFY_H +/* end file include/simdjson/minify.h */ +/* begin file include/simdjson/padded_string_view.h */ +#ifndef SIMDJSON_PADDED_STRING_VIEW_H +#define SIMDJSON_PADDED_STRING_VIEW_H + + +#include +#include +#include +#include + +namespace simdjson { + +/** + * User-provided string that promises it has extra padded bytes at the end for use with parser::parse(). 
+ */ +class padded_string_view : public std::string_view { +private: + size_t _capacity; + +public: + /** Create an empty padded_string_view. */ + inline padded_string_view() noexcept = default; + + /** + * Promise the given buffer has at least SIMDJSON_PADDING extra bytes allocated to it. + * + * @param s The string. + * @param len The length of the string (not including padding). + * @param capacity The allocated length of the string, including padding. + */ + explicit inline padded_string_view(const char* s, size_t len, size_t capacity) noexcept; + /** overload explicit inline padded_string_view(const char* s, size_t len) noexcept */ + explicit inline padded_string_view(const uint8_t* s, size_t len, size_t capacity) noexcept; + + /** + * Promise the given string has at least SIMDJSON_PADDING extra bytes allocated to it. + * + * The capacity of the string will be used to determine its padding. + * + * @param s The string. + */ + explicit inline padded_string_view(const std::string &s) noexcept; + + /** + * Promise the given string_view has at least SIMDJSON_PADDING extra bytes allocated to it. + * + * @param s The string. + * @param capacity The allocated length of the string, including padding. + */ + explicit inline padded_string_view(std::string_view s, size_t capacity) noexcept; + + /** The number of allocated bytes. */ + inline size_t capacity() const noexcept; + + /** The amount of padding on the string (capacity() - length()) */ + inline size_t padding() const noexcept; + +}; // padded_string_view + +#if SIMDJSON_EXCEPTIONS +/** + * Send padded_string instance to an output stream. + * + * @param out The output stream. + * @param s The padded_string_view. + * @throw simdjson_error if the result being printed has an error. If there is an error with the + * underlying output stream, that error will be propagated (simdjson_error will not be + * thrown). + */ +inline std::ostream& operator<<(std::ostream& out, simdjson_result &s) noexcept(false) { return out << s.value(); } +#endif + +} // namespace simdjson + +#endif // SIMDJSON_PADDED_STRING_VIEW_H +/* end file include/simdjson/padded_string_view.h */ +/* begin file include/simdjson/implementation.h */ +#ifndef SIMDJSON_IMPLEMENTATION_H +#define SIMDJSON_IMPLEMENTATION_H + +/* begin file include/simdjson/internal/dom_parser_implementation.h */ +#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H + +#include + +namespace simdjson { + +namespace dom { +class document; +} // namespace dom + +/** +* This enum is used with the dom_parser_implementation::stage1 function. +* 1) The regular mode expects a fully formed JSON document. +* 2) The streaming_partial mode expects a possibly truncated +* input within a stream on JSON documents. +* 3) The stream_final mode allows us to truncate final +* unterminated strings. It is useful in conjunction with streaming_partial. +*/ +enum class stage1_mode { regular, streaming_partial, streaming_final}; + +/** + * Returns true if mode == streaming_partial or mode == streaming_final + */ +inline bool is_streaming(stage1_mode mode) { + // performance note: it is probably faster to check that mode is different + // from regular than checking that it is either streaming_partial or streaming_final. + return (mode != stage1_mode::regular); + // return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final); +} + + +namespace internal { + + +/** + * An implementation of simdjson's DOM parser for a particular CPU architecture. 
+ * + * This class is expected to be accessed only by pointer, and never move in memory (though the + * pointer can move). + */ +class dom_parser_implementation { +public: + + /** + * @private For internal implementation use + * + * Run a full JSON parse on a single document (stage1 + stage2). + * + * Guaranteed only to be called when capacity > document length. + * + * Overridden by each implementation. + * + * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len The length of the json document. + * @return The error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 1 of the document parser. + * + * Guaranteed only to be called when capacity > document length. + * + * Overridden by each implementation. + * + * @param buf The json document to parse. + * @param len The length of the json document. + * @param streaming Whether this is being called by parser::parse_many. + * @return The error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser. + * + * Called after stage1(). + * + * Overridden by each implementation. + * + * @param doc The document to output to. + * @return The error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code stage2(dom::document &doc) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser for parser::parse_many. + * + * Guaranteed only to be called after stage1(). + * Overridden by each implementation. + * + * @param doc The document to output to. + * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed. + */ + simdjson_warn_unused virtual error_code stage2_next(dom::document &doc) noexcept = 0; + + /** + * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There + * must be an unescaped quote terminating the string. It returns the final output + * position as pointer. In case of error (e.g., the string has bad escaped codes), + * then null_nullptrptr is returned. It is assumed that the output buffer is large + * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes + + * SIMDJSON_PADDING bytes. + * + * Overridden by each implementation. + * + * @param src pointer to the beginning of a valid UTF-8 JSON string, must end with an unescaped quote. + * @param dst pointer to a destination buffer, it must point a region in memory of sufficient size. + * @return end of the of the written region (exclusive) or nullptr in case of error. + */ + simdjson_warn_unused virtual uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept = 0; + + /** + * Change the capacity of this parser. + * + * The capacity can never exceed SIMDJSON_MAXSIZE_BYTES (e.g., 4 GB) + * and an CAPACITY error is returned if it is attempted. + * + * Generally used for reallocation. + * + * @param capacity The new capacity. + * @return The error code, or SUCCESS if there was no error. + */ + virtual error_code set_capacity(size_t capacity) noexcept = 0; + + /** + * Change the max depth of this parser. + * + * Generally used for reallocation. 
+ * + * @param max_depth The new max_depth. + * @return The error code, or SUCCESS if there was no error. + */ + virtual error_code set_max_depth(size_t max_depth) noexcept = 0; + + /** + * Deallocate this parser. + */ + virtual ~dom_parser_implementation() = default; + + /** Number of structural indices passed from stage 1 to stage 2 */ + uint32_t n_structural_indexes{0}; + /** Structural indices passed from stage 1 to stage 2 */ + std::unique_ptr structural_indexes{}; + /** Next structural index to parse */ + uint32_t next_structural_index{0}; + + /** + * The largest document this parser can support without reallocating. + * + * @return Current capacity, in bytes. + */ + simdjson_inline size_t capacity() const noexcept; + + /** + * The maximum level of nested object and arrays supported by this parser. + * + * @return Maximum depth, in bytes. + */ + simdjson_inline size_t max_depth() const noexcept; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth) noexcept; + + +protected: + /** + * The maximum document length this parser supports. + * + * Buffers are large enough to handle any document up to this length. + */ + size_t _capacity{0}; + + /** + * The maximum depth (number of nested objects and arrays) supported by this parser. + * + * Defaults to DEFAULT_MAX_DEPTH. + */ + size_t _max_depth{0}; + + // Declaring these so that subclasses can use them to implement their constructors. + simdjson_inline dom_parser_implementation() noexcept; + simdjson_inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + simdjson_inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + + simdjson_inline dom_parser_implementation(const dom_parser_implementation &) noexcept = delete; + simdjson_inline dom_parser_implementation &operator=(const dom_parser_implementation &other) noexcept = delete; +}; // class dom_parser_implementation + +simdjson_inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +simdjson_inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +simdjson_inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +simdjson_inline size_t dom_parser_implementation::capacity() const noexcept { + return _capacity; +} + +simdjson_inline size_t dom_parser_implementation::max_depth() const noexcept { + return _max_depth; +} + +simdjson_warn_unused +inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept { + if (this->max_depth() != max_depth) { + error_code err = set_max_depth(max_depth); + if (err) { return err; } + } + if (_capacity != capacity) { + error_code err = set_capacity(capacity); + if (err) { return err; } + } + return SUCCESS; +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H +/* end file include/simdjson/internal/dom_parser_implementation.h */ +/* begin file include/simdjson/internal/isadetection.h */ +/* From +https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h +Highly modified. 
+ +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, +Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute +(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, +Samy Bengio, Johnny Mariethoz) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories +America and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef SIMDJSON_INTERNAL_ISADETECTION_H +#define SIMDJSON_INTERNAL_ISADETECTION_H + +#include +#include +#if defined(_MSC_VER) +#include +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) +#include +#endif + +namespace simdjson { +namespace internal { + + +enum instruction_set { + DEFAULT = 0x0, + NEON = 0x1, + AVX2 = 0x4, + SSE42 = 0x8, + PCLMULQDQ = 0x10, + BMI1 = 0x20, + BMI2 = 0x40, + ALTIVEC = 0x80, + AVX512F = 0x100, + AVX512DQ = 0x200, + AVX512IFMA = 0x400, + AVX512PF = 0x800, + AVX512ER = 0x1000, + AVX512CD = 0x2000, + AVX512BW = 0x4000, + AVX512VL = 0x8000, + AVX512VBMI2 = 0x10000 +}; + +#if defined(__PPC64__) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::ALTIVEC; +} + +#elif defined(__arm__) || defined(__aarch64__) // incl. 
armel, armhf, arm64 + +#if defined(__ARM_NEON) + +static inline uint32_t detect_supported_architectures() { + return instruction_set::NEON; +} + +#else // ARM without NEON + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + +#endif + +#elif defined(__x86_64__) || defined(_M_AMD64) // x64 + + +namespace { +// Can be found on Intel ISA Reference for CPUID +constexpr uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7 +constexpr uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7 +constexpr uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512f_bit = 1 << 16; ///< @private bit 16 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512ifma_bit = 1 << 21; ///< @private bit 21 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512pf_bit = 1 << 26; ///< @private bit 26 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512er_bit = 1 << 27; ///< @private bit 27 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512cd_bit = 1 << 28; ///< @private bit 28 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512vl_bit = 1U << 31; ///< @private bit 31 of EBX for EAX=0x7 +constexpr uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7 +constexpr uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1 +constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1 +} + + + +static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx) { +#if defined(_MSC_VER) + int cpu_info[4]; + __cpuid(cpu_info, *eax); + *eax = cpu_info[0]; + *ebx = cpu_info[1]; + *ecx = cpu_info[2]; + *edx = cpu_info[3]; +#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid(level, eax, ebx, ecx, edx); +#else + uint32_t a = *eax, b, c = *ecx, d; + asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d)); + *eax = a; + *ebx = b; + *ecx = c; + *edx = d; +#endif +} + +static inline uint32_t detect_supported_architectures() { + uint32_t eax, ebx, ecx, edx; + uint32_t host_isa = 0x0; + + // ECX for EAX=0x7 + eax = 0x7; + ecx = 0x0; + cpuid(&eax, &ebx, &ecx, &edx); + if (ebx & cpuid_avx2_bit) { + host_isa |= instruction_set::AVX2; + } + if (ebx & cpuid_bmi1_bit) { + host_isa |= instruction_set::BMI1; + } + + if (ebx & cpuid_bmi2_bit) { + host_isa |= instruction_set::BMI2; + } + + if (ebx & cpuid_avx512f_bit) { + host_isa |= instruction_set::AVX512F; + } + + if (ebx & cpuid_avx512dq_bit) { + host_isa |= instruction_set::AVX512DQ; + } + + if (ebx & cpuid_avx512ifma_bit) { + host_isa |= instruction_set::AVX512IFMA; + } + + if (ebx & cpuid_avx512pf_bit) { + host_isa |= instruction_set::AVX512PF; + } + + if (ebx & cpuid_avx512er_bit) { + host_isa |= instruction_set::AVX512ER; + } + + if (ebx & cpuid_avx512cd_bit) { + host_isa |= instruction_set::AVX512CD; + } + + if (ebx & cpuid_avx512bw_bit) { + host_isa |= instruction_set::AVX512BW; + } + + if (ebx & cpuid_avx512vl_bit) { + host_isa |= instruction_set::AVX512VL; + } + + if (ecx & cpuid_avx512vbmi2_bit) { + host_isa |= instruction_set::AVX512VBMI2; + } + + // EBX for EAX=0x1 + eax = 0x1; + cpuid(&eax, &ebx, &ecx, &edx); + + if (ecx & cpuid_sse42_bit) { + host_isa |= instruction_set::SSE42; + } + + if (ecx & cpuid_pclmulqdq_bit) { + host_isa |= 
instruction_set::PCLMULQDQ; + } + + return host_isa; +} +#else // fallback + + +static inline uint32_t detect_supported_architectures() { + return instruction_set::DEFAULT; +} + + +#endif // end SIMD extension detection code + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_ISADETECTION_H +/* end file include/simdjson/internal/isadetection.h */ +#include +#include +#include + +namespace simdjson { + +/** + * Validate the UTF-8 string. + * + * @param buf the string to validate. + * @param len the length of the string in bytes. + * @return true if the string is valid UTF-8. + */ +simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) noexcept; +/** + * Validate the UTF-8 string. + * + * @param sv the string_view to validate. + * @return true if the string is valid UTF-8. + */ +simdjson_inline simdjson_warn_unused bool validate_utf8(const std::string_view sv) noexcept { + return validate_utf8(sv.data(), sv.size()); +} + +/** + * Validate the UTF-8 string. + * + * @param s the string to validate. + * @return true if the string is valid UTF-8. + */ +simdjson_inline simdjson_warn_unused bool validate_utf8(const std::string& s) noexcept { + return validate_utf8(s.data(), s.size()); +} + +namespace dom { + class document; +} // namespace dom + +/** + * An implementation of simdjson for a particular CPU architecture. + * + * Also used to maintain the currently active implementation. The active implementation is + * automatically initialized on first use to the most advanced implementation supported by the host. + */ +class implementation { +public: + + /** + * The name of this implementation. + * + * const implementation *impl = simdjson::get_active_implementation(); + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64". + */ + virtual const std::string &name() const { return _name; } + + /** + * The description of this implementation. + * + * const implementation *impl = simdjson::get_active_implementation(); + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the description of the implementation, e.g. "Intel/AMD AVX2", "Intel/AMD SSE4.2", "ARM NEON". + */ + virtual const std::string &description() const { return _description; } + + /** + * The instruction sets this implementation is compiled against + * and the current CPU match. This function may poll the current CPU/system + * and should therefore not be called too often if performance is a concern. + * + * @return true if the implementation can be safely used on the current system (determined at runtime). + */ + bool supported_by_runtime_system() const; + + /** + * @private For internal implementation use + * + * The instruction sets this implementation is compiled against. + * + * @return a mask of all required `internal::instruction_set::` values. + */ + virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }; + + /** + * @private For internal implementation use + * + * const implementation *impl = simdjson::get_active_implementation(); + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @param capacity The largest document that will be passed to the parser. + * @param max_depth The maximum JSON object/array nesting this parser is expected to handle. 
+ * @param dst The place to put the resulting parser implementation. + * @return the error code, or SUCCESS if there was no error. + */ + virtual error_code create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr &dst + ) const noexcept = 0; + + /** + * @private For internal implementation use + * + * Minify the input string assuming that it represents a JSON string, does not parse or validate. + * + * Overridden by each implementation. + * + * @param buf the json document to minify. + * @param len the length of the json document. + * @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param dst_len the number of bytes written. Output only. + * @return the error code, or SUCCESS if there was no error. + */ + simdjson_warn_unused virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0; + + + /** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ + simdjson_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; + +protected: + /** @private Construct an implementation with the given name and description. For subclasses. */ + simdjson_inline implementation( + std::string_view name, + std::string_view description, + uint32_t required_instruction_sets + ) : + _name(name), + _description(description), + _required_instruction_sets(required_instruction_sets) + { + } + virtual ~implementation()=default; + +private: + /** + * The name of this implementation. + */ + const std::string _name; + + /** + * The description of this implementation. + */ + const std::string _description; + + /** + * Instruction sets required for this implementation. + */ + const uint32_t _required_instruction_sets; +}; + +/** @private */ +namespace internal { + +/** + * The list of available implementations compiled into simdjson. + */ +class available_implementation_list { +public: + /** Get the list of available implementations compiled into simdjson */ + simdjson_inline available_implementation_list() {} + /** Number of implementations */ + size_t size() const noexcept; + /** STL const begin() iterator */ + const implementation * const *begin() const noexcept; + /** STL const end() iterator */ + const implementation * const *end() const noexcept; + + /** + * Get the implementation with the given name. + * + * Case sensitive. + * + * const implementation *impl = simdjson::get_available_implementations()["westmere"]; + * if (!impl) { exit(1); } + * if (!imp->supported_by_runtime_system()) { exit(1); } + * simdjson::get_active_implementation() = impl; + * + * @param name the implementation to find, e.g. "westmere", "haswell", "arm64" + * @return the implementation, or nullptr if the parse failed. + */ + const implementation * operator[](const std::string_view &name) const noexcept { + for (const implementation * impl : *this) { + if (impl->name() == name) { return impl; } + } + return nullptr; + } + + /** + * Detect the most advanced implementation supported by the current host. + * + * This is used to initialize the implementation on startup. 
+ * + * const implementation *impl = simdjson::available_implementation::detect_best_supported(); + * simdjson::get_active_implementation() = impl; + * + * @return the most advanced supported implementation for the current host, or an + * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported + * implementation. Will never return nullptr. + */ + const implementation *detect_best_supported() const noexcept; +}; + +template +class atomic_ptr { +public: + atomic_ptr(T *_ptr) : ptr{_ptr} {} + + operator const T*() const { return ptr.load(); } + const T& operator*() const { return *ptr; } + const T* operator->() const { return ptr.load(); } + + operator T*() { return ptr.load(); } + T& operator*() { return *ptr; } + T* operator->() { return ptr.load(); } + atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; } + +private: + std::atomic ptr; +}; + +} // namespace internal + +/** + * The list of available implementations compiled into simdjson. + */ +extern SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations(); + +/** + * The active implementation. + * + * Automatically initialized on first use to the most advanced implementation supported by this hardware. + */ +extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation(); + +} // namespace simdjson + +#endif // SIMDJSON_IMPLEMENTATION_H +/* end file include/simdjson/implementation.h */ + +// Inline functions +/* begin file include/simdjson/error-inl.h */ +#ifndef SIMDJSON_INLINE_ERROR_H +#define SIMDJSON_INLINE_ERROR_H + +#include +#include +#include + +namespace simdjson { +namespace internal { + // We store the error code so we can validate the error message is associated with the right code + struct error_code_info { + error_code code; + const char* message; // do not use a fancy std::string where a simple C string will do (no alloc, no destructor) + }; + // These MUST match the codes in error_code. We check this constraint in basictests. + extern SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[]; +} // namespace internal + + +inline const char *error_message(error_code error) noexcept { + // If you're using error_code, we're trusting you got it from the enum. 
+ return internal::error_codes[int(error)].message; +} + +// deprecated function +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +inline const std::string error_message(int error) noexcept { + if (error < 0 || error >= error_code::NUM_ERROR_CODES) { + return internal::error_codes[UNEXPECTED_ERROR].message; + } + return internal::error_codes[error].message; +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +inline std::ostream& operator<<(std::ostream& out, error_code error) noexcept { + return out << error_message(error); +} + +namespace internal { + +// +// internal::simdjson_result_base inline implementation +// + +template +simdjson_inline void simdjson_result_base::tie(T &value, error_code &error) && noexcept { + error = this->second; + if (!error) { + value = std::forward>(*this).first; + } +} + +template +simdjson_warn_unused simdjson_inline error_code simdjson_result_base::get(T &value) && noexcept { + error_code error; + std::forward>(*this).tie(value, error); + return error; +} + +template +simdjson_inline error_code simdjson_result_base::error() const noexcept { + return this->second; +} + +#if SIMDJSON_EXCEPTIONS + +template +simdjson_inline T& simdjson_result_base::value() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return this->first; +} + +template +simdjson_inline T&& simdjson_result_base::value() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +template +simdjson_inline T&& simdjson_result_base::take_value() && noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return std::forward(this->first); +} + +template +simdjson_inline simdjson_result_base::operator T&&() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +#endif // SIMDJSON_EXCEPTIONS + +template +simdjson_inline const T& simdjson_result_base::value_unsafe() const& noexcept { + return this->first; +} + +template +simdjson_inline T&& simdjson_result_base::value_unsafe() && noexcept { + return std::forward(this->first); +} + +template +simdjson_inline simdjson_result_base::simdjson_result_base(T &&value, error_code error) noexcept + : std::pair(std::forward(value), error) {} +template +simdjson_inline simdjson_result_base::simdjson_result_base(error_code error) noexcept + : simdjson_result_base(T{}, error) {} +template +simdjson_inline simdjson_result_base::simdjson_result_base(T &&value) noexcept + : simdjson_result_base(std::forward(value), SUCCESS) {} +template +simdjson_inline simdjson_result_base::simdjson_result_base() noexcept + : simdjson_result_base(T{}, UNINITIALIZED) {} + +} // namespace internal + +/// +/// simdjson_result inline implementation +/// + +template +simdjson_inline void simdjson_result::tie(T &value, error_code &error) && noexcept { + std::forward>(*this).tie(value, error); +} + +template +simdjson_warn_unused simdjson_inline error_code simdjson_result::get(T &value) && noexcept { + return std::forward>(*this).get(value); +} + +template +simdjson_inline error_code simdjson_result::error() const noexcept { + return internal::simdjson_result_base::error(); +} + +#if SIMDJSON_EXCEPTIONS + +template +simdjson_inline T& simdjson_result::value() & noexcept(false) { + return internal::simdjson_result_base::value(); +} + +template +simdjson_inline T&& simdjson_result::value() && noexcept(false) { + return std::forward>(*this).value(); +} + +template +simdjson_inline T&& simdjson_result::take_value() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +template +simdjson_inline simdjson_result::operator 
T&&() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +#endif // SIMDJSON_EXCEPTIONS + +template +simdjson_inline const T& simdjson_result::value_unsafe() const& noexcept { + return internal::simdjson_result_base::value_unsafe(); +} + +template +simdjson_inline T&& simdjson_result::value_unsafe() && noexcept { + return std::forward>(*this).value_unsafe(); +} + +template +simdjson_inline simdjson_result::simdjson_result(T &&value, error_code error) noexcept + : internal::simdjson_result_base(std::forward(value), error) {} +template +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} +template +simdjson_inline simdjson_result::simdjson_result(T &&value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +template +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} + +} // namespace simdjson + +#endif // SIMDJSON_INLINE_ERROR_H +/* end file include/simdjson/error-inl.h */ +/* begin file include/simdjson/padded_string-inl.h */ +#ifndef SIMDJSON_INLINE_PADDED_STRING_H +#define SIMDJSON_INLINE_PADDED_STRING_H + + +#include +#include +#include +#include + +namespace simdjson { +namespace internal { + +// The allocate_padded_buffer function is a low-level function to allocate memory +// with padding so we can read past the "length" bytes safely. It is used by +// the padded_string class automatically. It returns nullptr in case +// of error: the caller should check for a null pointer. +// The length parameter is the maximum size in bytes of the string. +// The caller is responsible to free the memory (e.g., delete[] (...)). +inline char *allocate_padded_buffer(size_t length) noexcept { + const size_t totalpaddedlength = length + SIMDJSON_PADDING; + if(totalpaddedlength(1UL<<20)) { + return nullptr; + } +#endif + + char *padded_buffer = new (std::nothrow) char[totalpaddedlength]; + if (padded_buffer == nullptr) { + return nullptr; + } + // We write zeroes in the padded region to avoid having uninitized + // garbage. If nothing else, garbage getting read might trigger a + // warning in a memory checking. 
+ std::memset(padded_buffer + length, 0, totalpaddedlength - length); + return padded_buffer; +} // allocate_padded_buffer() + +} // namespace internal + + +inline padded_string::padded_string() noexcept = default; +inline padded_string::padded_string(size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { +} +inline padded_string::padded_string(const char *data, size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { + if ((data != nullptr) && (data_ptr != nullptr)) { + std::memcpy(data_ptr, data, length); + } +} +// note: do not pass std::string arguments by value +inline padded_string::padded_string(const std::string & str_ ) noexcept + : viable_size(str_.size()), data_ptr(internal::allocate_padded_buffer(str_.size())) { + if (data_ptr != nullptr) { + std::memcpy(data_ptr, str_.data(), str_.size()); + } +} +// note: do pass std::string_view arguments by value +inline padded_string::padded_string(std::string_view sv_) noexcept + : viable_size(sv_.size()), data_ptr(internal::allocate_padded_buffer(sv_.size())) { + if(simdjson_unlikely(!data_ptr)) { + //allocation failed or zero size + viable_size=0; + return; + } + if (sv_.size()) { + std::memcpy(data_ptr, sv_.data(), sv_.size()); + } +} +inline padded_string::padded_string(padded_string &&o) noexcept + : viable_size(o.viable_size), data_ptr(o.data_ptr) { + o.data_ptr = nullptr; // we take ownership +} + +inline padded_string &padded_string::operator=(padded_string &&o) noexcept { + delete[] data_ptr; + data_ptr = o.data_ptr; + viable_size = o.viable_size; + o.data_ptr = nullptr; // we take ownership + o.viable_size = 0; + return *this; +} + +inline void padded_string::swap(padded_string &o) noexcept { + size_t tmp_viable_size = viable_size; + char *tmp_data_ptr = data_ptr; + viable_size = o.viable_size; + data_ptr = o.data_ptr; + o.data_ptr = tmp_data_ptr; + o.viable_size = tmp_viable_size; +} + +inline padded_string::~padded_string() noexcept { + delete[] data_ptr; +} + +inline size_t padded_string::size() const noexcept { return viable_size; } + +inline size_t padded_string::length() const noexcept { return viable_size; } + +inline const char *padded_string::data() const noexcept { return data_ptr; } + +inline char *padded_string::data() noexcept { return data_ptr; } + +inline padded_string::operator std::string_view() const { return std::string_view(data(), length()); } + +inline padded_string::operator padded_string_view() const noexcept { + return padded_string_view(data(), length(), length() + SIMDJSON_PADDING); +} + +inline simdjson_result padded_string::load(std::string_view filename) noexcept { + // Open the file + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + std::FILE *fp = std::fopen(filename.data(), "rb"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (fp == nullptr) { + return IO_ERROR; + } + + // Get the file size + int ret; +#if defined(SIMDJSON_VISUAL_STUDIO) && !SIMDJSON_IS_32BITS + ret = _fseeki64(fp, 0, SEEK_END); +#else + ret = std::fseek(fp, 0, SEEK_END); +#endif // _WIN64 + if(ret < 0) { + std::fclose(fp); + return IO_ERROR; + } +#if defined(SIMDJSON_VISUAL_STUDIO) && !SIMDJSON_IS_32BITS + __int64 llen = _ftelli64(fp); + if(llen == -1L) { + std::fclose(fp); + return IO_ERROR; + } +#else + long llen = std::ftell(fp); + if((llen < 0) || (llen == LONG_MAX)) { + std::fclose(fp); + return IO_ERROR; + } +#endif + + // Allocate the padded_string 
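// A minimal usage sketch (not part of the vendored simdjson header): the padded_string
// constructors above copy the input and allocate SIMDJSON_PADDING extra bytes so the
// parser can safely read past the logical end. Include path and input are placeholders.
#include "simdjson.h"
#include <iostream>
#include <string>

void padded_string_example() {
  std::string raw = R"({"key":"value"})";      // ordinary, unpadded string
  simdjson::padded_string json(raw);           // copies into a padded buffer
  if (json.data() == nullptr) { return; }      // allocation failed
  simdjson::dom::parser parser;
  simdjson::dom::element doc;
  auto error = parser.parse(json).get(doc);    // no realloc/copy: input is already padded
  if (error) { std::cerr << error << std::endl; }
}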
+ size_t len = static_cast(llen); + padded_string s(len); + if (s.data() == nullptr) { + std::fclose(fp); + return MEMALLOC; + } + + // Read the padded_string + std::rewind(fp); + size_t bytes_read = std::fread(s.data(), 1, len, fp); + if (std::fclose(fp) != 0 || bytes_read != len) { + return IO_ERROR; + } + + return s; +} + +} // namespace simdjson + +#endif // SIMDJSON_INLINE_PADDED_STRING_H +/* end file include/simdjson/padded_string-inl.h */ +/* begin file include/simdjson/padded_string_view-inl.h */ +#ifndef SIMDJSON_PADDED_STRING_VIEW_INL_H +#define SIMDJSON_PADDED_STRING_VIEW_INL_H + + +#include +#include +#include +#include + +namespace simdjson { + +inline padded_string_view::padded_string_view(const char* s, size_t len, size_t capacity) noexcept + : std::string_view(s, len), _capacity(capacity) +{ +} + +inline padded_string_view::padded_string_view(const uint8_t* s, size_t len, size_t capacity) noexcept + : padded_string_view(reinterpret_cast(s), len, capacity) +{ +} + +inline padded_string_view::padded_string_view(const std::string &s) noexcept + : std::string_view(s), _capacity(s.capacity()) +{ +} + +inline padded_string_view::padded_string_view(std::string_view s, size_t capacity) noexcept + : std::string_view(s), _capacity(capacity) +{ +} + +inline size_t padded_string_view::capacity() const noexcept { return _capacity; } + +inline size_t padded_string_view::padding() const noexcept { return capacity() - length(); } + +} // namespace simdjson + +#endif // SIMDJSON_PADDED_STRING_VIEW_INL_H +/* end file include/simdjson/padded_string_view-inl.h */ + +SIMDJSON_POP_DISABLE_WARNINGS + +#endif // SIMDJSON_BASE_H +/* end file include/simdjson/base.h */ + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_UNDESIRED_WARNINGS + +/* begin file include/simdjson/dom/array.h */ +#ifndef SIMDJSON_DOM_ARRAY_H +#define SIMDJSON_DOM_ARRAY_H + +/* begin file include/simdjson/internal/tape_ref.h */ +#ifndef SIMDJSON_INTERNAL_TAPE_REF_H +#define SIMDJSON_INTERNAL_TAPE_REF_H + +/* begin file include/simdjson/internal/tape_type.h */ +#ifndef SIMDJSON_INTERNAL_TAPE_TYPE_H +#define SIMDJSON_INTERNAL_TAPE_TYPE_H + +namespace simdjson { +namespace internal { + +/** + * The possible types in the tape. + */ +enum class tape_type { + ROOT = 'r', + START_ARRAY = '[', + START_OBJECT = '{', + END_ARRAY = ']', + END_OBJECT = '}', + STRING = '"', + INT64 = 'l', + UINT64 = 'u', + DOUBLE = 'd', + TRUE_VALUE = 't', + FALSE_VALUE = 'f', + NULL_VALUE = 'n' +}; // enum class tape_type + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_TAPE_TYPE_H +/* end file include/simdjson/internal/tape_type.h */ + +namespace simdjson { + +namespace dom { + class document; +} + +namespace internal { + +constexpr const uint64_t JSON_VALUE_MASK = 0x00FFFFFFFFFFFFFF; +constexpr const uint32_t JSON_COUNT_MASK = 0xFFFFFF; + +/** + * A reference to an element on the tape. Internal only. 
+ */ +class tape_ref { +public: + simdjson_inline tape_ref() noexcept; + simdjson_inline tape_ref(const dom::document *doc, size_t json_index) noexcept; + inline size_t after_element() const noexcept; + simdjson_inline tape_type tape_ref_type() const noexcept; + simdjson_inline uint64_t tape_value() const noexcept; + simdjson_inline bool is_double() const noexcept; + simdjson_inline bool is_int64() const noexcept; + simdjson_inline bool is_uint64() const noexcept; + simdjson_inline bool is_false() const noexcept; + simdjson_inline bool is_true() const noexcept; + simdjson_inline bool is_null_on_tape() const noexcept;// different name to avoid clash with is_null. + simdjson_inline uint32_t matching_brace_index() const noexcept; + simdjson_inline uint32_t scope_count() const noexcept; + template + simdjson_inline T next_tape_value() const noexcept; + simdjson_inline uint32_t get_string_length() const noexcept; + simdjson_inline const char * get_c_str() const noexcept; + inline std::string_view get_string_view() const noexcept; + simdjson_inline bool is_document_root() const noexcept; + simdjson_inline bool usable() const noexcept; + + /** The document this element references. */ + const dom::document *doc; + + /** The index of this element on `doc.tape[]` */ + size_t json_index; +}; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_TAPE_REF_H +/* end file include/simdjson/internal/tape_ref.h */ + +namespace simdjson { + +namespace internal { +template +class string_builder; +} +namespace dom { + +class document; +class element; + +/** + * JSON array. + */ +class array { +public: + /** Create a new, invalid array */ + simdjson_inline array() noexcept; + + class iterator { + public: + using value_type = element; + using difference_type = std::ptrdiff_t; + + /** + * Get the actual value + */ + inline value_type operator*() const noexcept; + /** + * Get the next value. + * + * Part of the std::iterator interface. + */ + inline iterator& operator++() noexcept; + /** + * Get the next value. + * + * Part of the std::iterator interface. + */ + inline iterator operator++(int) noexcept; + /** + * Check if these values come from the same place in the JSON. + * + * Part of the std::iterator interface. + */ + inline bool operator!=(const iterator& other) const noexcept; + inline bool operator==(const iterator& other) const noexcept; + + inline bool operator<(const iterator& other) const noexcept; + inline bool operator<=(const iterator& other) const noexcept; + inline bool operator>=(const iterator& other) const noexcept; + inline bool operator>(const iterator& other) const noexcept; + + iterator() noexcept = default; + iterator(const iterator&) noexcept = default; + iterator& operator=(const iterator&) noexcept = default; + private: + simdjson_inline iterator(const internal::tape_ref &tape) noexcept; + internal::tape_ref tape; + friend class array; + }; + + /** + * Return the first array element. + * + * Part of the std::iterable interface. + */ + inline iterator begin() const noexcept; + /** + * One past the last array element. + * + * Part of the std::iterable interface. + */ + inline iterator end() const noexcept; + /** + * Get the size of the array (number of immediate children). + * It is a saturated value with a maximum of 0xFFFFFF: if the value + * is 0xFFFFFF then the size is 0xFFFFFF or greater. + */ + inline size_t size() const noexcept; + /** + * Get the total number of slots used by this array on the tape. 
+ * + * Note that this is not the same thing as `size()`, which reports the + * number of actual elements within an array (not counting its children). + * + * Since an element can use 1 or 2 slots on the tape, you can only use this + * to figure out the total size of an array (including its children, + * recursively) if you know its structure ahead of time. + **/ + inline size_t number_of_slots() const noexcept; + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * dom::parser parser; + * array a = parser.parse(R"([ { "foo": { "a": [ 10, 20, 30 ] }} ])"_padded); + * a.at_pointer("/0/foo/a/1") == 20 + * a.at_pointer("0")["foo"]["a"].at(1) == 20 + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + + /** + * Get the value at the given index. This function has linear-time complexity and + * is equivalent to the following: + * + * size_t i=0; + * for (auto element : *this) { + * if (i == index) { return element; } + * i++; + * } + * return INDEX_OUT_OF_BOUNDS; + * + * Avoid calling the at() function repeatedly. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + inline simdjson_result at(size_t index) const noexcept; + +private: + simdjson_inline array(const internal::tape_ref &tape) noexcept; + internal::tape_ref tape; + friend class element; + friend struct simdjson_result; + template + friend class simdjson::internal::string_builder; +}; + + +} // namespace dom + +/** The result of a JSON conversion that may fail. 
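// A minimal usage sketch (not part of the vendored simdjson header) for the dom::array
// API above: range-for iteration plus at(). Include path and JSON input are placeholders.
#include "simdjson.h"
#include <cstdint>
#include <iostream>
using namespace simdjson;

void array_example() {
  dom::parser parser;
  dom::element doc;
  auto json = R"([10, 20, 30])"_padded;
  if (parser.parse(json).get(doc)) { return; }

  dom::array arr;
  if (doc.get_array().get(arr)) { return; }    // INCORRECT_TYPE if the root is not an array
  for (dom::element e : arr) {
    int64_t v;
    if (!e.get(v)) { std::cout << v << "\n"; }
  }

  int64_t second;
  if (!arr.at(1).get(second)) { std::cout << second << "\n"; }  // at() is linear-time
}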
*/ +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(dom::array value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + inline simdjson_result at(size_t index) const noexcept; + +#if SIMDJSON_EXCEPTIONS + inline dom::array::iterator begin() const noexcept(false); + inline dom::array::iterator end() const noexcept(false); + inline size_t size() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS +}; + + + +} // namespace simdjson + +#if defined(__cpp_lib_ranges) +#include + +namespace std { +namespace ranges { +template<> +inline constexpr bool enable_view = true; +#if SIMDJSON_EXCEPTIONS +template<> +inline constexpr bool enable_view> = true; +#endif // SIMDJSON_EXCEPTIONS +} // namespace ranges +} // namespace std +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_DOM_ARRAY_H +/* end file include/simdjson/dom/array.h */ +/* begin file include/simdjson/dom/document_stream.h */ +#ifndef SIMDJSON_DOCUMENT_STREAM_H +#define SIMDJSON_DOCUMENT_STREAM_H + +/* begin file include/simdjson/dom/parser.h */ +#ifndef SIMDJSON_DOM_PARSER_H +#define SIMDJSON_DOM_PARSER_H + +/* begin file include/simdjson/dom/document.h */ +#ifndef SIMDJSON_DOM_DOCUMENT_H +#define SIMDJSON_DOM_DOCUMENT_H + +#include +#include + +namespace simdjson { +namespace dom { + +class element; + +/** + * A parsed JSON document. + * + * This class cannot be copied, only moved, to avoid unintended allocations. + */ +class document { +public: + /** + * Create a document container with zero capacity. + * + * The parser will allocate capacity as needed. + */ + document() noexcept = default; + ~document() noexcept = default; + + /** + * Take another document's buffers. + * + * @param other The document to take. Its capacity is zeroed and it is invalidated. + */ + document(document &&other) noexcept = default; + /** @private */ + document(const document &) = delete; // Disallow copying + /** + * Take another document's buffers. + * + * @param other The document to take. Its capacity is zeroed. + */ + document &operator=(document &&other) noexcept = default; + /** @private */ + document &operator=(const document &) = delete; // Disallow copying + + /** + * Get the root element of this document as a JSON array. + */ + element root() const noexcept; + + /** + * @private Dump the raw tape for debugging. + * + * @param os the stream to output to. + * @return false if the tape is likely wrong (e.g., you did not parse a valid JSON). + */ + bool dump_raw_tape(std::ostream &os) const noexcept; + + /** @private Structural values. */ + std::unique_ptr tape{}; + + /** @private String values. + * + * Should be at least byte_capacity. + */ + std::unique_ptr string_buf{}; + /** @private Allocate memory to support + * input JSON documents of up to len bytes. + * + * When calling this function, you lose + * all the data. + * + * The memory allocation is strict: you + * can you use this function to increase + * or lower the amount of allocated memory. + * Passsing zero clears the memory. + */ + error_code allocate(size_t len) noexcept; + /** @private Capacity in bytes, in terms + * of how many bytes of input JSON we can + * support. 
+ */ + size_t capacity() const noexcept; + + +private: + size_t allocated_capacity{0}; + friend class parser; +}; // class document + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_DOM_DOCUMENT_H +/* end file include/simdjson/dom/document.h */ +#include +#include +#include + +namespace simdjson { + +namespace dom { + +class document_stream; +class element; + +/** The default batch size for parser.parse_many() and parser.load_many() */ +static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; +/** + * Some adversary might try to set the batch size to 0 or 1, which might cause problems. + * We set a minimum of 32B since anything else is highly likely to be an error. In practice, + * most users will want a much larger batch size. + * + * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON + * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. + */ +static constexpr size_t MINIMAL_BATCH_SIZE = 32; + +/** + * It is wasteful to allocate memory for tiny documents (e.g., 4 bytes). + */ +static constexpr size_t MINIMAL_DOCUMENT_CAPACITY = 32; + +/** + * A persistent document parser. + * + * The parser is designed to be reused, holding the internal buffers necessary to do parsing, + * as well as memory for a single document. The parsed document is overwritten on each parse. + * + * This class cannot be copied, only moved, to avoid unintended allocations. + * + * @note Moving a parser instance may invalidate "dom::element" instances. If you need to + * preserve both the "dom::element" instances and the parser, consider wrapping the parser + * instance in a std::unique_ptr instance: + * + * std::unique_ptr parser(new dom::parser{}); + * auto error = parser->load(f).get(root); + * + * You can then move std::unique_ptr safely. + * + * @note This is not thread safe: one parser cannot produce two documents at the same time! + */ +class parser { +public: + /** + * Create a JSON parser. + * + * The new parser will have zero capacity. + * + * @param max_capacity The maximum document length the parser can automatically handle. The parser + * will allocate more capacity on an as needed basis (when it sees documents too big to handle) + * up to this amount. The parser still starts with zero capacity no matter what this number is: + * to allocate an initial capacity, call allocate() after constructing the parser. + * Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process). + */ + simdjson_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; + /** + * Take another parser's buffers and state. + * + * @param other The parser to take. Its capacity is zeroed. + */ + simdjson_inline parser(parser &&other) noexcept; + parser(const parser &) = delete; ///< @private Disallow copying + /** + * Take another parser's buffers and state. + * + * @param other The parser to take. Its capacity is zeroed. + */ + simdjson_inline parser &operator=(parser &&other) noexcept; + parser &operator=(const parser &) = delete; ///< @private Disallow copying + + /** Deallocate the JSON parser. */ + ~parser()=default; + + /** + * Load a JSON document from a file and return a reference to it. + * + * dom::parser parser; + * const element doc = parser.load("jsonexamples/twitter.json"); + * + * The function is eager: the file's content is loaded in memory inside the parser instance + * and immediately parsed. The file can be deleted after the `parser.load` call. 
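// A minimal usage sketch (not part of the vendored simdjson header) for load() with the
// error-code interface rather than exceptions. The file name is a placeholder.
#include "simdjson.h"
#include <iostream>

int load_example() {
  simdjson::dom::parser parser;
  simdjson::dom::element doc;
  auto error = parser.load("twitter.json").get(doc);   // eager: reads and parses the file
  if (error) { std::cerr << error << std::endl; return 1; }
  // doc stays valid only while parser is alive and until the next parse()/load().
  return 0;
}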
+ * + * ### IMPORTANT: Document Lifetime + * + * The JSON document still lives in the parser: this is the most efficient way to parse JSON + * documents because it reuses the same buffers, but you *must* use the document before you + * destroy the parser or call parse() again. + * + * Moving the parser instance is safe, but it invalidates the element instances. You may store + * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like + * so: `std::unique_ptr parser(new dom::parser{});`. + * + * ### Parser Capacity + * + * If the parser's current capacity is less than the file length, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param path The path to load. + * @return The document, or an error: + * - IO_ERROR if there was an error opening or reading the file. + * Be mindful that on some 32-bit systems, + * the file size might be limited to 2 GB. + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result load(const std::string &path) & noexcept; + inline simdjson_result load(const std::string &path) && = delete ; + /** + * Parse a JSON document and return a temporary reference to it. + * + * dom::parser parser; + * element doc_root = parser.parse(buf, len); + * + * The function eagerly parses the input: the input can be modified and discarded after + * the `parser.parse(buf, len)` call has completed. + * + * ### IMPORTANT: Document Lifetime + * + * The JSON document still lives in the parser: this is the most efficient way to parse JSON + * documents because it reuses the same buffers, but you *must* use the document before you + * destroy the parser or call parse() again. + * + * Moving the parser instance is safe, but it invalidates the element instances. You may store + * the parser instance without moving it by wrapping it inside an `unique_ptr` instance like + * so: `std::unique_ptr parser(new dom::parser{});`. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. + * + * If realloc_if_needed is true (the default), it is assumed that the buffer does *not* have enough padding, + * and it is copied into an enlarged temporary buffer before parsing. Thus the following is safe: + * + * const char *json = R"({"key":"value"})"; + * const size_t json_len = std::strlen(json); + * simdjson::dom::parser parser; + * simdjson::dom::element element = parser.parse(json, json_len); + * + * If you set realloc_if_needed to false (e.g., parser.parse(json, json_len, false)), + * you must provide a buffer with at least SIMDJSON_PADDING extra bytes at the end. + * The benefit of setting realloc_if_needed to false is that you avoid a temporary + * memory allocation and a copy. + * + * The padded bytes may be read. It is not important how you initialize + * these bytes though we recommend a sensible default like null character values or spaces. 
+ * For example, the following low-level code is safe: + * + * const char *json = R"({"key":"value"})"; + * const size_t json_len = std::strlen(json); + * std::unique_ptr padded_json_copy{new char[json_len + SIMDJSON_PADDING]}; + * std::memcpy(padded_json_copy.get(), json, json_len); + * std::memset(padded_json_copy.get() + json_len, '\0', SIMDJSON_PADDING); + * simdjson::dom::parser parser; + * simdjson::dom::element element = parser.parse(padded_json_copy.get(), json_len, false); + * + * ### Parser Capacity + * + * If the parser's current capacity is less than len, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless + * realloc_if_needed is true. + * @param len The length of the JSON. + * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. + * @return An element pointing at the root of the document, or an error: + * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, + * and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; + inline simdjson_result parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; + simdjson_inline simdjson_result parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const std::string &s) & noexcept; + simdjson_inline simdjson_result parse(const std::string &s) && =delete; + /** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse(const padded_string &s) & noexcept; + simdjson_inline simdjson_result parse(const padded_string &s) && =delete; + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_inline simdjson_result parse(const char *buf) noexcept = delete; + + /** + * Parse a JSON document into a provide document instance and return a temporary reference to it. + * It is similar to the function `parse` except that instead of parsing into the internal + * `document` instance associated with the parser, it allows the user to provide a document + * instance. + * + * dom::parser parser; + * dom::document doc; + * element doc_root = parser.parse_into_document(doc, buf, len); + * + * The function eagerly parses the input: the input can be modified and discarded after + * the `parser.parse(buf, len)` call has completed. + * + * ### IMPORTANT: Document Lifetime + * + * After the call to parse_into_document, the parser is no longer needed. + * + * The JSON document lives in the document instance: you must keep the document + * instance alive while you navigate through it (i.e., used the returned value from + * parse_into_document). 
You are encourage to reuse the document instance + * many times with new data to avoid reallocations: + * + * dom::document doc; + * element doc_root1 = parser.parse_into_document(doc, buf1, len); + * //... doc_root1 is a pointer inside doc + * element doc_root2 = parser.parse_into_document(doc, buf1, len); + * //... doc_root2 is a pointer inside doc + * // at this point doc_root1 is no longer safe + * + * Moving the document instance is safe, but it invalidates the element instances. After + * moving a document, you can recover safe access to the document root with its `root()` method. + * + * @param doc The document instance where the parsed data will be stored (on success). + * @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless + * realloc_if_needed is true. + * @param len The length of the JSON. + * @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding. + * @return An element pointing at the root of document, or an error: + * - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity, + * and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and len > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept; + inline simdjson_result parse_into_document(document& doc, const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse_into_document(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) & noexcept; + simdjson_inline simdjson_result parse_into_document(document& doc, const char *buf, size_t len, bool realloc_if_needed = true) && =delete; + /** @overload parse_into_document(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse_into_document(document& doc, const std::string &s) & noexcept; + simdjson_inline simdjson_result parse_into_document(document& doc, const std::string &s) && =delete; + /** @overload parse_into_document(const uint8_t *buf, size_t len, bool realloc_if_needed) */ + simdjson_inline simdjson_result parse_into_document(document& doc, const padded_string &s) & noexcept; + simdjson_inline simdjson_result parse_into_document(document& doc, const padded_string &s) && =delete; + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_inline simdjson_result parse_into_document(document& doc, const char *buf) noexcept = delete; + + /** + * Load a file containing many JSON documents. + * + * dom::parser parser; + * for (const element doc : parser.load_many(path)) { + * cout << std::string(doc["title"]) << endl; + * } + * + * The file is loaded in memory and can be safely deleted after the `parser.load_many(path)` + * function has returned. The memory is held by the `parser` instance. + * + * The function is lazy: it may be that no more than one JSON document at a time is parsed. + * And, possibly, no document many have been parsed when the `parser.load_many(path)` function + * returned. 
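// A minimal usage sketch (not part of the vendored simdjson header) for
// parse_into_document() described above: the parsed tree lives in a caller-owned
// dom::document. Include path and JSON input are placeholders.
#include "simdjson.h"
#include <iostream>
using namespace simdjson;

void parse_into_document_example() {
  dom::parser parser;
  dom::document doc;                            // owns the tape and string buffers
  auto json = R"({"answer": 42})"_padded;
  dom::element root;
  auto error = parser.parse_into_document(doc, json).get(root);
  if (error) { std::cerr << error << std::endl; return; }
  // root points into doc: keep doc alive (and unmoved) while navigating it.
}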
+ * + * ### Format + * + * The file must contain a series of one or more JSON documents, concatenated into a single + * buffer, separated by whitespace. It effectively parses until it has a fully valid document, + * then starts parsing the next document at that point. (It does this with more parallelism and + * lookahead than you might think, though.) + * + * Documents that consist of an object or array may omit the whitespace between them, concatenating + * with no separator. documents that consist of a single primitive (i.e. documents that are not + * arrays or objects) MUST be separated with whitespace. + * + * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. + * Setting batch_size to excessively large or excesively small values may impact negatively the + * performance. + * + * ### Error Handling + * + * All errors are returned during iteration: if there is a global error such as memory allocation, + * it will be yielded as the first result. Iteration always stops after the first error. + * + * As with all other simdjson methods, non-exception error handling is readily available through + * the same interface, requiring you to check the error before using the document: + * + * dom::parser parser; + * dom::document_stream docs; + * auto error = parser.load_many(path).get(docs); + * if (error) { cerr << error << endl; exit(1); } + * for (auto doc : docs) { + * std::string_view title; + * if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); } + * cout << title << endl; + * } + * + * ### Threads + * + * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the + * hood to do some lookahead. + * + * ### Parser Capacity + * + * If the parser's current capacity is less than batch_size, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param path File name pointing at the concatenated JSON to parse. + * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet + * spot is cache-related: small enough to fit in cache, yet big enough to + * parse as many documents as possible in one tight loop. + * Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet + * spot in our tests. + * If you set the batch_size to a value smaller than simdjson::dom::MINIMAL_BATCH_SIZE + * (currently 32B), it will be replaced by simdjson::dom::MINIMAL_BATCH_SIZE. + * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: + * - IO_ERROR if there was an error opening or reading the file. + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails. + * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result load_many(const std::string &path, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + + /** + * Parse a buffer containing many JSON documents. + * + * dom::parser parser; + * for (element doc : parser.parse_many(buf, len)) { + * cout << std::string(doc["title"]) << endl; + * } + * + * No copy of the input buffer is made. + * + * The function is lazy: it may be that no more than one JSON document at a time is parsed. 
+ * And, possibly, no document many have been parsed when the `parser.load_many(path)` function + * returned. + * + * The caller is responsabile to ensure that the input string data remains unchanged and is + * not deleted during the loop. In particular, the following is unsafe and will not compile: + * + * auto docs = parser.parse_many("[\"temporary data\"]"_padded); + * // here the string "[\"temporary data\"]" may no longer exist in memory + * // the parser instance may not have even accessed the input yet + * for (element doc : docs) { + * cout << std::string(doc["title"]) << endl; + * } + * + * The following is safe: + * + * auto json = "[\"temporary data\"]"_padded; + * auto docs = parser.parse_many(json); + * for (element doc : docs) { + * cout << std::string(doc["title"]) << endl; + * } + * + * ### Format + * + * The buffer must contain a series of one or more JSON documents, concatenated into a single + * buffer, separated by whitespace. It effectively parses until it has a fully valid document, + * then starts parsing the next document at that point. (It does this with more parallelism and + * lookahead than you might think, though.) + * + * documents that consist of an object or array may omit the whitespace between them, concatenating + * with no separator. documents that consist of a single primitive (i.e. documents that are not + * arrays or objects) MUST be separated with whitespace. + * + * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. + * Setting batch_size to excessively large or excesively small values may impact negatively the + * performance. + * + * ### Error Handling + * + * All errors are returned during iteration: if there is a global error such as memory allocation, + * it will be yielded as the first result. Iteration always stops after the first error. + * + * As with all other simdjson methods, non-exception error handling is readily available through + * the same interface, requiring you to check the error before using the document: + * + * dom::parser parser; + * dom::document_stream docs; + * auto error = parser.load_many(path).get(docs); + * if (error) { cerr << error << endl; exit(1); } + * for (auto doc : docs) { + * std::string_view title; + * if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); } + * cout << title << endl; + * } + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. + * + * ### Threads + * + * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the + * hood to do some lookahead. + * + * ### Parser Capacity + * + * If the parser's current capacity is less than batch_size, it will allocate enough capacity + * to handle it (up to max_capacity). + * + * @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes. + * @param len The length of the concatenated JSON. + * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet + * spot is cache-related: small enough to fit in cache, yet big enough to + * parse as many documents as possible in one tight loop. + * Defaults to 10MB, which has been a reasonable sweet spot in our tests. + * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. 
Errors: + * - MEMALLOC if the parser does not have enough capacity and memory allocation fails + * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. + * - other json errors if parsing fails. You should not rely on these errors to always the same for the + * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). + */ + inline simdjson_result parse_many(const uint8_t *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result parse_many(const char *buf, size_t len, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result parse_many(const std::string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const std::string &&s, size_t batch_size) = delete;// unsafe + /** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */ + inline simdjson_result parse_many(const padded_string &s, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept; + inline simdjson_result parse_many(const padded_string &&s, size_t batch_size) = delete;// unsafe + + /** @private We do not want to allow implicit conversion from C string to std::string. */ + simdjson_result parse_many(const char *buf, size_t batch_size = dom::DEFAULT_BATCH_SIZE) noexcept = delete; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + /** + * @private deprecated because it returns bool instead of error_code, which is our standard for + * failures. Use allocate() instead. + * + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return true if successful, false if allocation failed. + */ + [[deprecated("Use allocate() instead.")]] + simdjson_warn_unused inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API + /** + * The largest document this parser can support without reallocating. + * + * @return Current capacity, in bytes. + */ + simdjson_inline size_t capacity() const noexcept; + + /** + * The largest document this parser can automatically support. + * + * The parser may reallocate internal buffers as needed up to this amount. + * + * @return Maximum capacity, in bytes. + */ + simdjson_inline size_t max_capacity() const noexcept; + + /** + * The maximum level of nested object and arrays supported by this parser. + * + * @return Maximum depth, in bytes. + */ + simdjson_inline size_t max_depth() const noexcept; + + /** + * Set max_capacity. This is the largest document this parser can automatically support. + * + * The parser may reallocate internal buffers as needed up to this amount as documents are passed + * to it. 
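// A minimal usage sketch (not part of the vendored simdjson header) for allocate():
// reserving parser capacity up front so the first parse does not have to grow buffers.
// The 1 MB figure is an arbitrary example.
#include "simdjson.h"
#include <iostream>

void allocate_example() {
  simdjson::dom::parser parser;
  auto error = parser.allocate(1024 * 1024);    // capacity in bytes; default max_depth
  if (error) { std::cerr << error << std::endl; return; }
  // parser.capacity() should now report at least the requested size.
}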
+ * + * Note: To avoid limiting the memory to an absurd value, such as zero or two bytes, + * iff you try to set max_capacity to a value lower than MINIMAL_DOCUMENT_CAPACITY, + * then the maximal capacity is set to MINIMAL_DOCUMENT_CAPACITY. + * + * This call will not allocate or deallocate, even if capacity is currently above max_capacity. + * + * @param max_capacity The new maximum capacity, in bytes. + */ + simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; + +#ifdef SIMDJSON_THREADS_ENABLED + /** + * The parser instance can use threads when they are available to speed up some + * operations. It is enabled by default. Changing this attribute will change the + * behavior of the parser for future operations. + */ + bool threaded{true}; +#endif + /** @private Use the new DOM API instead */ + class Iterator; + /** @private Use simdjson_error instead */ + using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error; + + /** @private [for benchmarking access] The implementation to use */ + std::unique_ptr implementation{}; + + /** @private Use `if (parser.parse(...).error())` instead */ + bool valid{false}; + /** @private Use `parser.parse(...).error()` instead */ + error_code error{UNINITIALIZED}; + + /** @private Use `parser.parse(...).value()` instead */ + document doc{}; + + /** @private returns true if the document parsed was valid */ + [[deprecated("Use the result of parser.parse() instead")]] + inline bool is_valid() const noexcept; + + /** + * @private return an error code corresponding to the last parsing attempt, see + * simdjson.h will return UNINITIALIZED if no parsing was attempted + */ + [[deprecated("Use the result of parser.parse() instead")]] + inline int get_error_code() const noexcept; + + /** @private return the string equivalent of "get_error_code" */ + [[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error")]] + inline std::string get_error_message() const noexcept; + + /** @private */ + [[deprecated("Use cout << on the result of parser.parse() instead")]] + inline bool print_json(std::ostream &os) const noexcept; + + /** @private Private and deprecated: use `parser.parse(...).doc.dump_raw_tape()` instead */ + inline bool dump_raw_tape(std::ostream &os) const noexcept; + + +private: + /** + * The maximum document length this parser will automatically support. + * + * The parser will not be automatically allocated above this amount. + */ + size_t _max_capacity; + + /** + * The loaded buffer (reused each time load() is called) + */ + std::unique_ptr loaded_bytes; + + /** Capacity of loaded_bytes buffer. */ + size_t _loaded_bytes_capacity{0}; + + // all nodes are stored on the doc.tape using a 64-bit word. + // + // strings, double and ints are stored as + // a 64-bit word with a pointer to the actual value + // + // + // + // for objects or arrays, store [ or { at the beginning and } and ] at the + // end. For the openings ([ or {), we annotate them with a reference to the + // location on the doc.tape of the end, and for then closings (} and ]), we + // annotate them with a reference to the location of the opening + // + // + + /** + * Ensure we have enough capacity to handle at least desired_capacity bytes, + * and auto-allocate if not. This also allocates memory if needed in the + * internal document. + */ + inline error_code ensure_capacity(size_t desired_capacity) noexcept; + /** + * Ensure we have enough capacity to handle at least desired_capacity bytes, + * and auto-allocate if not. 
This also allocates memory if needed in the + * provided document. + */ + inline error_code ensure_capacity(document& doc, size_t desired_capacity) noexcept; + + /** Read the file into loaded_bytes */ + inline simdjson_result read_file(const std::string &path) noexcept; + + friend class parser::Iterator; + friend class document_stream; + + +}; // class parser + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_DOM_PARSER_H +/* end file include/simdjson/dom/parser.h */ +#ifdef SIMDJSON_THREADS_ENABLED +#include +#include +#include +#endif + +namespace simdjson { +namespace dom { + + +#ifdef SIMDJSON_THREADS_ENABLED +/** @private Custom worker class **/ +struct stage1_worker { + stage1_worker() noexcept = default; + stage1_worker(const stage1_worker&) = delete; + stage1_worker(stage1_worker&&) = delete; + stage1_worker operator=(const stage1_worker&) = delete; + ~stage1_worker(); + /** + * We only start the thread when it is needed, not at object construction, this may throw. + * You should only call this once. + **/ + void start_thread(); + /** + * Start a stage 1 job. You should first call 'run', then 'finish'. + * You must call start_thread once before. + */ + void run(document_stream * ds, dom::parser * stage1, size_t next_batch_start); + /** Wait for the run to finish (blocking). You should first call 'run', then 'finish'. **/ + void finish(); + +private: + + /** + * Normally, we would never stop the thread. But we do in the destructor. + * This function is only safe assuming that you are not waiting for results. You + * should have called run, then finish, and be done. + **/ + void stop_thread(); + + std::thread thread{}; + /** These three variables define the work done by the thread. **/ + dom::parser * stage1_thread_parser{}; + size_t _next_batch_start{}; + document_stream * owner{}; + /** + * We have two state variables. This could be streamlined to one variable in the future but + * we use two for clarity. + */ + bool has_work{false}; + bool can_work{true}; + + /** + * We lock using a mutex. + */ + std::mutex locking_mutex{}; + std::condition_variable cond_var{}; +}; +#endif + +/** + * A forward-only stream of documents. + * + * Produced by parser::parse_many. + * + */ +class document_stream { +public: + /** + * Construct an uninitialized document_stream. + * + * ```c++ + * document_stream docs; + * error = parser.parse_many(json).get(docs); + * ``` + */ + simdjson_inline document_stream() noexcept; + /** Move one document_stream to another. */ + simdjson_inline document_stream(document_stream &&other) noexcept = default; + /** Move one document_stream to another. */ + simdjson_inline document_stream &operator=(document_stream &&other) noexcept = default; + + simdjson_inline ~document_stream() noexcept; + /** + * Returns the input size in bytes. + */ + inline size_t size_in_bytes() const noexcept; + /** + * After iterating through the stream, this method + * returns the number of bytes that were not parsed at the end + * of the stream. If truncated_bytes() differs from zero, + * then the input was truncated maybe because incomplete JSON + * documents were found at the end of the stream. You + * may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()). 
+ * + * You should only call truncated_bytes() after streaming through all + * documents, like so: + * + * document_stream stream = parser.parse_many(json,window); + * for(auto doc : stream) { + * // do something with doc + * } + * size_t truncated = stream.truncated_bytes(); + * + */ + inline size_t truncated_bytes() const noexcept; + /** + * An iterator through a forward-only stream of documents. + */ + class iterator { + public: + using value_type = simdjson_result; + using reference = value_type; + + using difference_type = std::ptrdiff_t; + + using iterator_category = std::input_iterator_tag; + + /** + * Default constructor. + */ + simdjson_inline iterator() noexcept; + /** + * Get the current document (or error). + */ + simdjson_inline reference operator*() noexcept; + /** + * Advance to the next document (prefix). + */ + inline iterator& operator++() noexcept; + /** + * Check if we're at the end yet. + * @param other the end iterator to compare to. + */ + simdjson_inline bool operator!=(const iterator &other) const noexcept; + /** + * @private + * + * Gives the current index in the input document in bytes. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + simdjson_inline size_t current_index() const noexcept; + /** + * @private + * + * Gives a view of the current document. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * std::string_view v = i->source(); + * } + * + * The returned string_view instance is simply a map to the (unparsed) + * source string: it may thus include white-space characters and all manner + * of padding. + * + * This function (source()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + simdjson_inline std::string_view source() const noexcept; + + private: + simdjson_inline iterator(document_stream *s, bool finished) noexcept; + /** The document_stream we're iterating through. */ + document_stream* stream; + /** Whether we're finished or not. */ + bool finished; + friend class document_stream; + }; + + /** + * Start iterating the documents in the stream. + */ + simdjson_inline iterator begin() noexcept; + /** + * The end of the stream, for iterator comparison purposes. + */ + simdjson_inline iterator end() noexcept; + +private: + + document_stream &operator=(const document_stream &) = delete; // Disallow copying + document_stream(const document_stream &other) = delete; // Disallow copying + + /** + * Construct a document_stream. Does not allocate or parse anything until the iterator is + * used. + * + * @param parser is a reference to the parser instance used to generate this document_stream + * @param buf is the raw byte buffer we need to process + * @param len is the length of the raw byte buffer in bytes + * @param batch_size is the size of the windows (must be strictly greater or equal to the largest JSON document) + */ + simdjson_inline document_stream( + dom::parser &parser, + const uint8_t *buf, + size_t len, + size_t batch_size + ) noexcept; + + /** + * Parse the first document in the buffer. 
Used by begin(), to handle allocation and + * initialization. + */ + inline void start() noexcept; + + /** + * Parse the next document found in the buffer previously given to document_stream. + * + * The content should be a valid JSON document encoded as UTF-8. If there is a + * UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are + * discouraged. + * + * You do NOT need to pre-allocate a parser. This function takes care of + * pre-allocating a capacity defined by the batch_size defined when creating the + * document_stream object. + * + * The function returns simdjson::EMPTY if there is no more data to be parsed. + * + * The function returns simdjson::SUCCESS (as integer = 0) in case of success + * and indicates that the buffer has successfully been parsed to the end. + * Every document it contained has been parsed without error. + * + * The function returns an error code from simdjson/simdjson.h in case of failure + * such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; + * the simdjson::error_message function converts these error codes into a string). + * + * You can also check validity by calling parser.is_valid(). The same parser can + * and should be reused for the other documents in the buffer. + */ + inline void next() noexcept; + + /** + * Pass the next batch through stage 1 and return when finished. + * When threads are enabled, this may wait for the stage 1 thread to finish. + */ + inline void load_batch() noexcept; + + /** Get the next document index. */ + inline size_t next_batch_start() const noexcept; + + /** Pass the next batch through stage 1 with the given parser. */ + inline error_code run_stage1(dom::parser &p, size_t batch_start) noexcept; + + dom::parser *parser; + const uint8_t *buf; + size_t len; + size_t batch_size; + /** The error (or lack thereof) from the current document. */ + error_code error; + size_t batch_start{0}; + size_t doc_index{}; +#ifdef SIMDJSON_THREADS_ENABLED + /** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */ + bool use_thread; + + inline void load_from_stage1_thread() noexcept; + + /** Start a thread to run stage 1 on the next batch. */ + inline void start_stage1_thread() noexcept; + + /** Wait for the stage 1 thread to finish and capture the results. */ + inline void finish_stage1_thread() noexcept; + + /** The error returned from the stage 1 thread. */ + error_code stage1_thread_error{UNINITIALIZED}; + /** The thread used to run stage 1 against the next batch in the background. */ + friend struct stage1_worker; + std::unique_ptr worker{new(std::nothrow) stage1_worker()}; + /** + * The parser used to run stage 1 in the background. Will be swapped + * with the regular parser when finished. 
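// A minimal usage sketch (not part of the vendored simdjson header) for the streaming
// API above: parse_many() over concatenated documents, then truncated_bytes() to detect
// an incomplete trailing document. Input, field name, and batch size are placeholders.
#include "simdjson.h"
#include <cstdint>
#include <iostream>
using namespace simdjson;

void parse_many_example() {
  dom::parser parser;
  auto json = R"({"a":1} {"a":2} {"a":3})"_padded;     // whitespace-separated documents
  dom::document_stream docs;
  auto error = parser.parse_many(json, 1024 * 1024).get(docs);
  if (error) { std::cerr << error << std::endl; return; }
  for (auto doc : docs) {                              // errors surface per document
    int64_t a;
    if (!doc["a"].get(a)) { std::cout << a << "\n"; }
  }
  if (docs.truncated_bytes() != 0) {
    std::cerr << "unparsed trailing bytes: " << docs.truncated_bytes() << std::endl;
  }
}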
+ */ + dom::parser stage1_thread_parser{}; +#endif // SIMDJSON_THREADS_ENABLED + + friend class dom::parser; + friend struct simdjson_result; + friend struct internal::simdjson_result_base; + +}; // class document_stream + +} // namespace dom + +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result(dom::document_stream &&value) noexcept; ///< @private + +#if SIMDJSON_EXCEPTIONS + simdjson_inline dom::document_stream::iterator begin() noexcept(false); + simdjson_inline dom::document_stream::iterator end() noexcept(false); +#else // SIMDJSON_EXCEPTIONS +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + [[deprecated("parse_many() and load_many() may return errors. Use document_stream stream; error = parser.parse_many().get(doc); instead.")]] + simdjson_inline dom::document_stream::iterator begin() noexcept; + [[deprecated("parse_many() and load_many() may return errors. Use document_stream stream; error = parser.parse_many().get(doc); instead.")]] + simdjson_inline dom::document_stream::iterator end() noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API +#endif // SIMDJSON_EXCEPTIONS +}; // struct simdjson_result + +} // namespace simdjson + +#endif // SIMDJSON_DOCUMENT_STREAM_H +/* end file include/simdjson/dom/document_stream.h */ +/* begin file include/simdjson/dom/element.h */ +#ifndef SIMDJSON_DOM_ELEMENT_H +#define SIMDJSON_DOM_ELEMENT_H + +#include + +namespace simdjson { +namespace internal { +template +class string_builder; +} +namespace dom { +class array; +class document; +class object; + +/** + * The actual concrete type of a JSON element + * This is the type it is most easily cast to with get<>. + */ +enum class element_type { + ARRAY = '[', ///< dom::array + OBJECT = '{', ///< dom::object + INT64 = 'l', ///< int64_t + UINT64 = 'u', ///< uint64_t: any integer that fits in uint64_t but *not* int64_t + DOUBLE = 'd', ///< double: Any number with a "." or "e" that fits in double. + STRING = '"', ///< std::string_view + BOOL = 't', ///< bool + NULL_VALUE = 'n' ///< null +}; + +/** + * A JSON element. + * + * References an element in a JSON document, representing a JSON null, boolean, string, number, + * array or object. + */ +class element { +public: + /** Create a new, invalid element. */ + simdjson_inline element() noexcept; + + /** The type of this element. */ + simdjson_inline element_type type() const noexcept; + + /** + * Cast this element to an array. + * + * @returns An object that can be used to iterate the array, or: + * INCORRECT_TYPE if the JSON element is not an array. + */ + inline simdjson_result get_array() const noexcept; + /** + * Cast this element to an object. + * + * @returns An object that can be used to look up or iterate the object's fields, or: + * INCORRECT_TYPE if the JSON element is not an object. + */ + inline simdjson_result get_object() const noexcept; + /** + * Cast this element to a null-terminated C string. + * + * The string is guaranteed to be valid UTF-8. + * + * The length of the string is given by get_string_length(). Because JSON strings + * may contain null characters, it may be incorrect to use strlen to determine the + * string length. + * + * It is possible to get a single string_view instance which represents both the string + * content and its length: see get_string(). + * + * @returns A pointer to a null-terminated UTF-8 string. 
This string is stored in the parser and will + * be invalidated the next time it parses a document or when it is destroyed. + * Returns INCORRECT_TYPE if the JSON element is not a string. + */ + inline simdjson_result get_c_str() const noexcept; + /** + * Gives the length in bytes of the string. + * + * It is possible to get a single string_view instance which represents both the string + * content and its length: see get_string(). + * + * @returns A string length in bytes. + * Returns INCORRECT_TYPE if the JSON element is not a string. + */ + inline simdjson_result get_string_length() const noexcept; + /** + * Cast this element to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next time it + * parses a document or when it is destroyed. + * Returns INCORRECT_TYPE if the JSON element is not a string. + */ + inline simdjson_result get_string() const noexcept; + /** + * Cast this element to a signed integer. + * + * @returns A signed 64-bit integer. + * Returns INCORRECT_TYPE if the JSON element is not an integer, or NUMBER_OUT_OF_RANGE + * if it is negative. + */ + inline simdjson_result get_int64() const noexcept; + /** + * Cast this element to an unsigned integer. + * + * @returns An unsigned 64-bit integer. + * Returns INCORRECT_TYPE if the JSON element is not an integer, or NUMBER_OUT_OF_RANGE + * if it is too large. + */ + inline simdjson_result get_uint64() const noexcept; + /** + * Cast this element to a double floating-point. + * + * @returns A double value. + * Returns INCORRECT_TYPE if the JSON element is not a number. + */ + inline simdjson_result get_double() const noexcept; + /** + * Cast this element to a bool. + * + * @returns A bool value. + * Returns INCORRECT_TYPE if the JSON element is not a boolean. + */ + inline simdjson_result get_bool() const noexcept; + + /** + * Whether this element is a json array. + * + * Equivalent to is(). + */ + inline bool is_array() const noexcept; + /** + * Whether this element is a json object. + * + * Equivalent to is(). + */ + inline bool is_object() const noexcept; + /** + * Whether this element is a json string. + * + * Equivalent to is() or is(). + */ + inline bool is_string() const noexcept; + /** + * Whether this element is a json number that fits in a signed 64-bit integer. + * + * Equivalent to is(). + */ + inline bool is_int64() const noexcept; + /** + * Whether this element is a json number that fits in an unsigned 64-bit integer. + * + * Equivalent to is(). + */ + inline bool is_uint64() const noexcept; + /** + * Whether this element is a json number that fits in a double. + * + * Equivalent to is(). + */ + inline bool is_double() const noexcept; + + /** + * Whether this element is a json number. + * + * Both integers and floating points will return true. + */ + inline bool is_number() const noexcept; + + /** + * Whether this element is a json `true` or `false`. + * + * Equivalent to is(). + */ + inline bool is_bool() const noexcept; + /** + * Whether this element is a json `null`. + */ + inline bool is_null() const noexcept; + + /** + * Tell whether the value can be cast to provided type (T). 
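// A minimal usage sketch (not part of the vendored simdjson header) for the element
// accessors above: is_*() checks plus get() extraction. JSON input and field names are
// placeholders.
#include "simdjson.h"
#include <cstdint>
#include <iostream>
#include <string_view>
using namespace simdjson;

void element_example() {
  dom::parser parser;
  dom::element doc;
  auto json = R"({"pi": 3.14, "n": 7, "name": "kram"})"_padded;
  if (parser.parse(json).get(doc)) { return; }

  double pi;
  if (!doc["pi"].get(pi)) { std::cout << pi << "\n"; }

  dom::element n;
  if (!doc["n"].get(n) && n.is_int64()) {      // integer that fits in int64_t
    int64_t v;
    if (!n.get(v)) { std::cout << v << "\n"; }
  }

  std::string_view name;
  if (!doc["name"].get(name)) { std::cout << name << "\n"; }
}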
+ * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + */ + template + simdjson_inline bool is() const noexcept; + + /** + * Get the value as the provided type (T). + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * You may use get_double(), get_bool(), get_uint64(), get_int64(), + * get_object(), get_array() or get_string() instead. + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * + * @returns The value cast to the given type, or: + * INCORRECT_TYPE if the value cannot be cast to the given type. + */ + + template + inline simdjson_result get() const noexcept { + // Unless the simdjson library provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library."); + } + + /** + * Get the value as the provided type (T). + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * + * @param value The variable to set to the value. May not be set if there is an error. + * + * @returns The error that occurred, or SUCCESS if there was no error. + */ + template + simdjson_warn_unused simdjson_inline error_code get(T &value) const noexcept; + + /** + * Get the value as the provided type (T), setting error if it's not the given type. + * + * Supported types: + * - Boolean: bool + * - Number: double, uint64_t, int64_t + * - String: std::string_view, const char * + * - Array: dom::array + * - Object: dom::object + * + * @tparam T bool, double, uint64_t, int64_t, std::string_view, const char *, dom::array, dom::object + * + * @param value The variable to set to the given type. value is undefined if there is an error. + * @param error The variable to store the error. error is set to error_code::SUCCEED if there is an error. + */ + template + inline void tie(T &value, error_code &error) && noexcept; + +#if SIMDJSON_EXCEPTIONS + /** + * Read this element as a boolean. + * + * @return The boolean value + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a boolean. + */ + inline operator bool() const noexcept(false); + + /** + * Read this element as a null-terminated UTF-8 string. + * + * Be mindful that JSON allows strings to contain null characters. + * + * Does *not* convert other types to a string; requires that the JSON type of the element was + * an actual string. + * + * @return The string value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a string. + */ + inline explicit operator const char*() const noexcept(false); + + /** + * Read this element as a null-terminated UTF-8 string. + * + * Does *not* convert other types to a string; requires that the JSON type of the element was + * an actual string. + * + * @return The string value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a string. 
+ */ + inline operator std::string_view() const noexcept(false); + + /** + * Read this element as an unsigned integer. + * + * @return The integer value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an integer + * @exception simdjson_error(NUMBER_OUT_OF_RANGE) if the integer doesn't fit in 64 bits or is negative + */ + inline operator uint64_t() const noexcept(false); + /** + * Read this element as an signed integer. + * + * @return The integer value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an integer + * @exception simdjson_error(NUMBER_OUT_OF_RANGE) if the integer doesn't fit in 64 bits + */ + inline operator int64_t() const noexcept(false); + /** + * Read this element as an double. + * + * @return The double value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not a number + * @exception simdjson_error(NUMBER_OUT_OF_RANGE) if the integer doesn't fit in 64 bits or is negative + */ + inline operator double() const noexcept(false); + /** + * Read this element as a JSON array. + * + * @return The JSON array. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an array + */ + inline operator array() const noexcept(false); + /** + * Read this element as a JSON object (key/value pairs). + * + * @return The JSON object. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an object + */ + inline operator object() const noexcept(false); + + /** + * Iterate over each element in this array. + * + * @return The beginning of the iteration. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an array + */ + inline dom::array::iterator begin() const noexcept(false); + + /** + * Iterate over each element in this array. + * + * @return The end of the iteration. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON element is not an array + */ + inline dom::array::iterator end() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](std::string_view key) const noexcept; + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](const char *key) const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. 
+ * + * dom::parser parser; + * element doc = parser.parse(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded); + * doc.at_pointer("/foo/a/1") == 20 + * doc.at_pointer("/foo")["a"].at(1) == 20 + * doc.at_pointer("")["foo"]["a"].at(1) == 20 + * + * It is allowed for a key to be the empty string: + * + * dom::parser parser; + * object obj = parser.parse(R"({ "": { "a": [ 10, 20, 30 ] }})"_padded); + * obj.at_pointer("//a/1") == 20 + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(const std::string_view json_pointer) const noexcept; + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + /** + * + * Version 0.4 of simdjson used an incorrect interpretation of the JSON Pointer standard + * and allowed the following : + * + * dom::parser parser; + * element doc = parser.parse(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded); + * doc.at("foo/a/1") == 20 + * + * Though it is intuitive, it is not compliant with RFC 6901 + * https://tools.ietf.org/html/rfc6901 + * + * For standard compliance, use the at_pointer function instead. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + [[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] + inline simdjson_result at(const std::string_view json_pointer) const noexcept; +#endif // SIMDJSON_DISABLE_DEPRECATED_API + + /** + * Get the value at the given index. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + inline simdjson_result at(size_t index) const noexcept; + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key(std::string_view key) const noexcept; + + /** + * Get the value associated with the given key in a case-insensitive manner. + * + * Note: The key will be matched against **unescaped** JSON. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + + /** @private for debugging. Prints out the root element. */ + inline bool dump_raw_tape(std::ostream &out) const noexcept; + +private: + simdjson_inline element(const internal::tape_ref &tape) noexcept; + internal::tape_ref tape; + friend class document; + friend class object; + friend class array; + friend struct simdjson_result; + template + friend class simdjson::internal::string_builder; + +}; + +} // namespace dom + +/** The result of a JSON navigation that may fail. 
*/ +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(dom::element &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + simdjson_inline simdjson_result type() const noexcept; + template + simdjson_inline bool is() const noexcept; + template + simdjson_inline simdjson_result get() const noexcept; + template + simdjson_warn_unused simdjson_inline error_code get(T &value) const noexcept; + + simdjson_inline simdjson_result get_array() const noexcept; + simdjson_inline simdjson_result get_object() const noexcept; + simdjson_inline simdjson_result get_c_str() const noexcept; + simdjson_inline simdjson_result get_string_length() const noexcept; + simdjson_inline simdjson_result get_string() const noexcept; + simdjson_inline simdjson_result get_int64() const noexcept; + simdjson_inline simdjson_result get_uint64() const noexcept; + simdjson_inline simdjson_result get_double() const noexcept; + simdjson_inline simdjson_result get_bool() const noexcept; + + simdjson_inline bool is_array() const noexcept; + simdjson_inline bool is_object() const noexcept; + simdjson_inline bool is_string() const noexcept; + simdjson_inline bool is_int64() const noexcept; + simdjson_inline bool is_uint64() const noexcept; + simdjson_inline bool is_double() const noexcept; + simdjson_inline bool is_number() const noexcept; + simdjson_inline bool is_bool() const noexcept; + simdjson_inline bool is_null() const noexcept; + + simdjson_inline simdjson_result operator[](std::string_view key) const noexcept; + simdjson_inline simdjson_result operator[](const char *key) const noexcept; + simdjson_inline simdjson_result at_pointer(const std::string_view json_pointer) const noexcept; + [[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] + simdjson_inline simdjson_result at(const std::string_view json_pointer) const noexcept; + simdjson_inline simdjson_result at(size_t index) const noexcept; + simdjson_inline simdjson_result at_key(std::string_view key) const noexcept; + simdjson_inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_inline operator bool() const noexcept(false); + simdjson_inline explicit operator const char*() const noexcept(false); + simdjson_inline operator std::string_view() const noexcept(false); + simdjson_inline operator uint64_t() const noexcept(false); + simdjson_inline operator int64_t() const noexcept(false); + simdjson_inline operator double() const noexcept(false); + simdjson_inline operator dom::array() const noexcept(false); + simdjson_inline operator dom::object() const noexcept(false); + + simdjson_inline dom::array::iterator begin() const noexcept(false); + simdjson_inline dom::array::iterator end() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS +}; + + +} // namespace simdjson + +#endif // SIMDJSON_DOM_DOCUMENT_H +/* end file include/simdjson/dom/element.h */ +/* begin file include/simdjson/dom/object.h */ +#ifndef SIMDJSON_DOM_OBJECT_H +#define SIMDJSON_DOM_OBJECT_H + + +namespace simdjson { +namespace internal { +template +class string_builder; +} +namespace dom { + +class document; +class element; +class key_value_pair; + +/** + * JSON object. 
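+ *
+ * Objects can be iterated as key/value pairs (illustrative sketch; `obj` is
+ * assumed to be a valid dom::object):
+ *
+ *   for (dom::key_value_pair field : obj) {
+ *     std::cout << field.key << " = " << field.value << std::endl;
+ *   }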
+ */ +class object { +public: + /** Create a new, invalid object */ + simdjson_inline object() noexcept; + + class iterator { + public: + using value_type = key_value_pair; + using difference_type = std::ptrdiff_t; + + /** + * Get the actual key/value pair + */ + inline const value_type operator*() const noexcept; + /** + * Get the next key/value pair. + * + * Part of the std::iterator interface. + * + */ + inline iterator& operator++() noexcept; + /** + * Get the next key/value pair. + * + * Part of the std::iterator interface. + * + */ + inline iterator operator++(int) noexcept; + /** + * Check if these values come from the same place in the JSON. + * + * Part of the std::iterator interface. + */ + inline bool operator!=(const iterator& other) const noexcept; + inline bool operator==(const iterator& other) const noexcept; + + inline bool operator<(const iterator& other) const noexcept; + inline bool operator<=(const iterator& other) const noexcept; + inline bool operator>=(const iterator& other) const noexcept; + inline bool operator>(const iterator& other) const noexcept; + /** + * Get the key of this key/value pair. + */ + inline std::string_view key() const noexcept; + /** + * Get the length (in bytes) of the key in this key/value pair. + * You should expect this function to be faster than key().size(). + */ + inline uint32_t key_length() const noexcept; + /** + * Returns true if the key in this key/value pair is equal + * to the provided string_view. + */ + inline bool key_equals(std::string_view o) const noexcept; + /** + * Returns true if the key in this key/value pair is equal + * to the provided string_view in a case-insensitive manner. + * Case comparisons may only be handled correctly for ASCII strings. + */ + inline bool key_equals_case_insensitive(std::string_view o) const noexcept; + /** + * Get the key of this key/value pair. + */ + inline const char *key_c_str() const noexcept; + /** + * Get the value of this key/value pair. + */ + inline element value() const noexcept; + + iterator() noexcept = default; + iterator(const iterator&) noexcept = default; + iterator& operator=(const iterator&) noexcept = default; + private: + simdjson_inline iterator(const internal::tape_ref &tape) noexcept; + + internal::tape_ref tape; + + friend class object; + }; + + /** + * Return the first key/value pair. + * + * Part of the std::iterable interface. + */ + inline iterator begin() const noexcept; + /** + * One past the last key/value pair. + * + * Part of the std::iterable interface. + */ + inline iterator end() const noexcept; + /** + * Get the size of the object (number of keys). + * It is a saturated value with a maximum of 0xFFFFFF: if the value + * is 0xFFFFFF then the size is 0xFFFFFF or greater. + */ + inline size_t size() const noexcept; + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](std::string_view key) const noexcept; + + /** + * Get the value associated with the given key. 
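+ *
+ * For instance (illustrative sketch; `obj` is assumed to be a valid dom::object):
+ *
+ *   std::string_view name;
+ *   if (!obj["name"].get(name)) { std::cout << name << std::endl; }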
+ * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + * - INCORRECT_TYPE if this is not an object + */ + inline simdjson_result operator[](const char *key) const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * dom::parser parser; + * object obj = parser.parse(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded); + * obj.at_pointer("/foo/a/1") == 20 + * obj.at_pointer("/foo")["a"].at(1) == 20 + * + * It is allowed for a key to be the empty string: + * + * dom::parser parser; + * object obj = parser.parse(R"({ "": { "a": [ 10, 20, 30 ] }})"_padded); + * obj.at_pointer("//a/1") == 20 + * obj.at_pointer("/")["a"].at(1) == 20 + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + + /** + * Get the value associated with the given key. + * + * The key will be matched against **unescaped** JSON: + * + * dom::parser parser; + * int64_t(parser.parse(R"({ "a\n": 1 })"_padded)["a\n"]) == 1 + * parser.parse(R"({ "a\n": 1 })"_padded)["a\\n"].get_uint64().error() == NO_SUCH_FIELD + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key(std::string_view key) const noexcept; + + /** + * Get the value associated with the given key in a case-insensitive manner. + * It is only guaranteed to work over ASCII inputs. + * + * Note: The key will be matched against **unescaped** JSON. + * + * This function has linear-time complexity: the keys are checked one by one. + * + * @return The value associated with this field, or: + * - NO_SUCH_FIELD if the field does not exist in the object + */ + inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + +private: + simdjson_inline object(const internal::tape_ref &tape) noexcept; + + internal::tape_ref tape; + + friend class element; + friend struct simdjson_result; + template + friend class simdjson::internal::string_builder; +}; + +/** + * Key/value pair in an object. + */ +class key_value_pair { +public: + /** key in the key-value pair **/ + std::string_view key; + /** value in the key-value pair **/ + element value; + +private: + simdjson_inline key_value_pair(std::string_view _key, element _value) noexcept; + friend class object; +}; + +} // namespace dom + +/** The result of a JSON conversion that may fail. 
*/ +template<> +struct simdjson_result : public internal::simdjson_result_base { +public: + simdjson_inline simdjson_result() noexcept; ///< @private + simdjson_inline simdjson_result(dom::object value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + inline simdjson_result operator[](std::string_view key) const noexcept; + inline simdjson_result operator[](const char *key) const noexcept; + inline simdjson_result at_pointer(std::string_view json_pointer) const noexcept; + inline simdjson_result at_key(std::string_view key) const noexcept; + inline simdjson_result at_key_case_insensitive(std::string_view key) const noexcept; + +#if SIMDJSON_EXCEPTIONS + inline dom::object::iterator begin() const noexcept(false); + inline dom::object::iterator end() const noexcept(false); + inline size_t size() const noexcept(false); +#endif // SIMDJSON_EXCEPTIONS +}; + +} // namespace simdjson + +#if defined(__cpp_lib_ranges) +#include + +namespace std { +namespace ranges { +template<> +inline constexpr bool enable_view = true; +#if SIMDJSON_EXCEPTIONS +template<> +inline constexpr bool enable_view> = true; +#endif // SIMDJSON_EXCEPTIONS +} // namespace ranges +} // namespace std +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_DOM_OBJECT_H +/* end file include/simdjson/dom/object.h */ +/* begin file include/simdjson/dom/serialization.h */ +#ifndef SIMDJSON_SERIALIZATION_H +#define SIMDJSON_SERIALIZATION_H + +#include + +namespace simdjson { + +/** + * The string_builder template and mini_formatter class + * are not part of our public API and are subject to change + * at any time! + */ +namespace internal { + +class mini_formatter; + +/** + * @private The string_builder template allows us to construct + * a string from a document element. It is parametrized + * by a "formatter" which handles the details. Thus + * the string_builder template could support both minification + * and prettification, and various other tradeoffs. + */ +template +class string_builder { +public: + /** Construct an initially empty builder, would print the empty string **/ + string_builder() = default; + /** Append an element to the builder (to be printed) **/ + inline void append(simdjson::dom::element value); + /** Append an array to the builder (to be printed) **/ + inline void append(simdjson::dom::array value); + /** Append an object to the builder (to be printed) **/ + inline void append(simdjson::dom::object value); + /** Reset the builder (so that it would print the empty string) **/ + simdjson_inline void clear(); + /** + * Get access to the string. The string_view is owned by the builder + * and it is invalid to use it after the string_builder has been + * destroyed. + * However you can make a copy of the string_view on memory that you + * own. + */ + simdjson_inline std::string_view str() const; + /** Append a key_value_pair to the builder (to be printed) **/ + simdjson_inline void append(simdjson::dom::key_value_pair value); +private: + formatter format{}; +}; + +/** + * @private This is the class that we expect to use with the string_builder + * template. It tries to produce a compact version of the JSON element + * as quickly as possible. 
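+ *
+ * Illustrative sketch of how the (internal) string_builder is used with this
+ * formatter, mirroring the stream operators below (`e` is assumed to be a
+ * dom::element):
+ *
+ *   simdjson::internal::string_builder<> sb;
+ *   sb.append(e);
+ *   std::string_view json = sb.str();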
+ */ +class mini_formatter { +public: + mini_formatter() = default; + /** Add a comma **/ + simdjson_inline void comma(); + /** Start an array, prints [ **/ + simdjson_inline void start_array(); + /** End an array, prints ] **/ + simdjson_inline void end_array(); + /** Start an array, prints { **/ + simdjson_inline void start_object(); + /** Start an array, prints } **/ + simdjson_inline void end_object(); + /** Prints a true **/ + simdjson_inline void true_atom(); + /** Prints a false **/ + simdjson_inline void false_atom(); + /** Prints a null **/ + simdjson_inline void null_atom(); + /** Prints a number **/ + simdjson_inline void number(int64_t x); + /** Prints a number **/ + simdjson_inline void number(uint64_t x); + /** Prints a number **/ + simdjson_inline void number(double x); + /** Prints a key (string + colon) **/ + simdjson_inline void key(std::string_view unescaped); + /** Prints a string. The string is escaped as needed. **/ + simdjson_inline void string(std::string_view unescaped); + /** Clears out the content. **/ + simdjson_inline void clear(); + /** + * Get access to the buffer, it is owned by the instance, but + * the user can make a copy. + **/ + simdjson_inline std::string_view str() const; + +private: + // implementation details (subject to change) + /** Prints one character **/ + simdjson_inline void one_char(char c); + /** Backing buffer **/ + std::vector buffer{}; // not ideal! +}; + +} // internal + +namespace dom { + +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The element. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::element value) { + simdjson::internal::string_builder<> sb; + sb.append(value); + return (out << sb.str()); +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::array value) { + simdjson::internal::string_builder<> sb; + sb.append(value); + return (out << sb.str()); +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#endif +/** + * Print JSON to an output stream. + * + * @param out The output stream. + * @param value The object. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::dom::object value) { + simdjson::internal::string_builder<> sb; + sb.append(value); + return (out << sb.str()); +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#endif +} // namespace dom + +/** + * Converts JSON to a string. 
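+ *
+ * Elements, arrays and objects can also be streamed directly via the
+ * operator<< overloads above (illustrative; `doc` is assumed to be a
+ * dom::element):
+ *
+ *   std::cout << doc << std::endl;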
+ * + * dom::parser parser; + * element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded); + * cout << to_string(doc) << endl; // prints [1,2,3] + * + */ +template +std::string to_string(T x) { + // in C++, to_string is standard: http://www.cplusplus.com/reference/string/to_string/ + // Currently minify and to_string are identical but in the future, they may + // differ. + simdjson::internal::string_builder<> sb; + sb.append(x); + std::string_view answer = sb.str(); + return std::string(answer.data(), answer.size()); +} +#if SIMDJSON_EXCEPTIONS +template +std::string to_string(simdjson_result x) { + if (x.error()) { throw simdjson_error(x.error()); } + return to_string(x.value()); +} +#endif + +/** + * Minifies a JSON element or document, printing the smallest possible valid JSON. + * + * dom::parser parser; + * element doc = parser.parse(" [ 1 , 2 , 3 ] "_padded); + * cout << minify(doc) << endl; // prints [1,2,3] + * + */ +template +std::string minify(T x) { + return to_string(x); +} + +#if SIMDJSON_EXCEPTIONS +template +std::string minify(simdjson_result x) { + if (x.error()) { throw simdjson_error(x.error()); } + return to_string(x.value()); +} +#endif + + +} // namespace simdjson + + +#endif +/* end file include/simdjson/dom/serialization.h */ + +// Deprecated API +/* begin file include/simdjson/dom/jsonparser.h */ +// TODO Remove this -- deprecated API and files + +#ifndef SIMDJSON_DOM_JSONPARSER_H +#define SIMDJSON_DOM_JSONPARSER_H + +/* begin file include/simdjson/dom/parsedjson.h */ +// TODO Remove this -- deprecated API and files + +#ifndef SIMDJSON_DOM_PARSEDJSON_H +#define SIMDJSON_DOM_PARSEDJSON_H + + +namespace simdjson { + +/** + * @deprecated Use `dom::parser` instead. + */ +using ParsedJson [[deprecated("Use dom::parser instead")]] = dom::parser; + +} // namespace simdjson + +#endif // SIMDJSON_DOM_PARSEDJSON_H +/* end file include/simdjson/dom/parsedjson.h */ +/* begin file include/simdjson/jsonioutil.h */ +#ifndef SIMDJSON_JSONIOUTIL_H +#define SIMDJSON_JSONIOUTIL_H + + +namespace simdjson { + +#if SIMDJSON_EXCEPTIONS +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("Use padded_string::load() instead")]] +inline padded_string get_corpus(const char *path) { + return padded_string::load(path); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API +#endif // SIMDJSON_EXCEPTIONS + +} // namespace simdjson + +#endif // SIMDJSON_JSONIOUTIL_H +/* end file include/simdjson/jsonioutil.h */ + +namespace simdjson { + +// +// C API (json_parse and build_parsed_json) declarations +// + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("Use parser.parse() instead")]] +inline int json_parse(const uint8_t *buf, size_t len, dom::parser &parser, bool realloc_if_needed = true) noexcept { + error_code code = parser.parse(buf, len, realloc_if_needed).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. 
+ parser.valid = code == SUCCESS; + parser.error = code; + return code; +} +[[deprecated("Use parser.parse() instead")]] +inline int json_parse(const char *buf, size_t len, dom::parser &parser, bool realloc_if_needed = true) noexcept { + error_code code = parser.parse(buf, len, realloc_if_needed).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. + parser.valid = code == SUCCESS; + parser.error = code; + return code; +} +[[deprecated("Use parser.parse() instead")]] +inline int json_parse(const std::string &s, dom::parser &parser, bool realloc_if_needed = true) noexcept { + error_code code = parser.parse(s.data(), s.length(), realloc_if_needed).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. + parser.valid = code == SUCCESS; + parser.error = code; + return code; +} +[[deprecated("Use parser.parse() instead")]] +inline int json_parse(const padded_string &s, dom::parser &parser) noexcept { + error_code code = parser.parse(s).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. + parser.valid = code == SUCCESS; + parser.error = code; + return code; +} + +[[deprecated("Use parser.parse() instead")]] +simdjson_warn_unused inline dom::parser build_parsed_json(const uint8_t *buf, size_t len, bool realloc_if_needed = true) noexcept { + dom::parser parser; + error_code code = parser.parse(buf, len, realloc_if_needed).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. + parser.valid = code == SUCCESS; + parser.error = code; + return parser; +} +[[deprecated("Use parser.parse() instead")]] +simdjson_warn_unused inline dom::parser build_parsed_json(const char *buf, size_t len, bool realloc_if_needed = true) noexcept { + dom::parser parser; + error_code code = parser.parse(buf, len, realloc_if_needed).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. 
+ parser.valid = code == SUCCESS; + parser.error = code; + return parser; +} +[[deprecated("Use parser.parse() instead")]] +simdjson_warn_unused inline dom::parser build_parsed_json(const std::string &s, bool realloc_if_needed = true) noexcept { + dom::parser parser; + error_code code = parser.parse(s.data(), s.length(), realloc_if_needed).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. + parser.valid = code == SUCCESS; + parser.error = code; + return parser; +} +[[deprecated("Use parser.parse() instead")]] +simdjson_warn_unused inline dom::parser build_parsed_json(const padded_string &s) noexcept { + dom::parser parser; + error_code code = parser.parse(s).error(); + // The deprecated json_parse API is a signal that the user plans to *use* the error code / valid + // bits in the parser instead of heeding the result code. The normal parser unsets those in + // anticipation of making the error code ephemeral. + // Here we put the code back into the parser, until we've removed this method. + parser.valid = code == SUCCESS; + parser.error = code; + return parser; +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +/** @private We do not want to allow implicit conversion from C string to std::string. */ +int json_parse(const char *buf, dom::parser &parser) noexcept = delete; +/** @private We do not want to allow implicit conversion from C string to std::string. */ +dom::parser build_parsed_json(const char *buf) noexcept = delete; + +} // namespace simdjson + +#endif // SIMDJSON_DOM_JSONPARSER_H +/* end file include/simdjson/dom/jsonparser.h */ +/* begin file include/simdjson/dom/parsedjson_iterator.h */ +// TODO Remove this -- deprecated API and files + +#ifndef SIMDJSON_DOM_PARSEDJSON_ITERATOR_H +#define SIMDJSON_DOM_PARSEDJSON_ITERATOR_H + +#include +#include +#include +#include +#include +#include + +/* begin file include/simdjson/internal/jsonformatutils.h */ +#ifndef SIMDJSON_INTERNAL_JSONFORMATUTILS_H +#define SIMDJSON_INTERNAL_JSONFORMATUTILS_H + +#include +#include +#include + +namespace simdjson { +namespace internal { + +class escape_json_string; + +inline std::ostream& operator<<(std::ostream& out, const escape_json_string &str); + +class escape_json_string { +public: + escape_json_string(std::string_view _str) noexcept : str{_str} {} + operator std::string() const noexcept { std::stringstream s; s << *this; return s.str(); } +private: + std::string_view str; + friend std::ostream& operator<<(std::ostream& out, const escape_json_string &unescaped); +}; + +inline std::ostream& operator<<(std::ostream& out, const escape_json_string &unescaped) { + for (size_t i=0; i(unescaped.str[i]) <= 0x1F) { + // TODO can this be done once at the beginning, or will it mess up << char? 
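+        // Save the current stream formatting flags; the hex/width/fill settings
+        // used to emit the \uXXXX escape below are temporary and are restored
+        // immediately afterwards.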
+ std::ios::fmtflags f(out.flags()); + out << "\\u" << std::hex << std::setw(4) << std::setfill('0') << int(unescaped.str[i]); + out.flags(f); + } else { + out << unescaped.str[i]; + } + } + } + return out; +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_JSONFORMATUTILS_H +/* end file include/simdjson/internal/jsonformatutils.h */ + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + +namespace simdjson { +/** @private **/ +class [[deprecated("Use the new DOM navigation API instead (see doc/basics.md)")]] dom::parser::Iterator { +public: + inline Iterator(const dom::parser &parser) noexcept(false); + inline Iterator(const Iterator &o) noexcept; + inline ~Iterator() noexcept; + + inline Iterator& operator=(const Iterator&) = delete; + + inline bool is_ok() const; + + // useful for debugging purposes + inline size_t get_tape_location() const; + + // useful for debugging purposes + inline size_t get_tape_length() const; + + // returns the current depth (start at 1 with 0 reserved for the fictitious + // root node) + inline size_t get_depth() const; + + // A scope is a series of nodes at the same depth, typically it is either an + // object ({) or an array ([). The root node has type 'r'. + inline uint8_t get_scope_type() const; + + // move forward in document order + inline bool move_forward(); + + // retrieve the character code of what we're looking at: + // [{"slutfn are the possibilities + inline uint8_t get_type() const { + return current_type; // short functions should be inlined! + } + + // get the int64_t value at this node; valid only if get_type is "l" + inline int64_t get_integer() const { + if (location + 1 >= tape_length) { + return 0; // default value in case of error + } + return static_cast(doc.tape[location + 1]); + } + + // get the value as uint64; valid only if if get_type is "u" + inline uint64_t get_unsigned_integer() const { + if (location + 1 >= tape_length) { + return 0; // default value in case of error + } + return doc.tape[location + 1]; + } + + // get the string value at this node (NULL ended); valid only if get_type is " + // note that tabs, and line endings are escaped in the returned value (see + // print_with_escapes) return value is valid UTF-8, it may contain NULL chars + // within the string: get_string_length determines the true string length. + inline const char *get_string() const { + return reinterpret_cast( + doc.string_buf.get() + (current_val & internal::JSON_VALUE_MASK) + sizeof(uint32_t)); + } + + // return the length of the string in bytes + inline uint32_t get_string_length() const { + uint32_t answer; + std::memcpy(&answer, + reinterpret_cast(doc.string_buf.get() + + (current_val & internal::JSON_VALUE_MASK)), + sizeof(uint32_t)); + return answer; + } + + // get the double value at this node; valid only if + // get_type() is "d" + inline double get_double() const { + if (location + 1 >= tape_length) { + return std::numeric_limits::quiet_NaN(); // default value in + // case of error + } + double answer; + std::memcpy(&answer, &doc.tape[location + 1], sizeof(answer)); + return answer; + } + + inline bool is_object_or_array() const { return is_object() || is_array(); } + + inline bool is_object() const { return get_type() == '{'; } + + inline bool is_array() const { return get_type() == '['; } + + inline bool is_string() const { return get_type() == '"'; } + + // Returns true if the current type of the node is an signed integer. + // You can get its value with `get_integer()`. 
+ inline bool is_integer() const { return get_type() == 'l'; } + + // Returns true if the current type of the node is an unsigned integer. + // You can get its value with `get_unsigned_integer()`. + // + // NOTE: + // Only a large value, which is out of range of a 64-bit signed integer, is + // represented internally as an unsigned node. On the other hand, a typical + // positive integer, such as 1, 42, or 1000000, is as a signed node. + // Be aware this function returns false for a signed node. + inline bool is_unsigned_integer() const { return get_type() == 'u'; } + // Returns true if the current type of the node is a double floating-point number. + inline bool is_double() const { return get_type() == 'd'; } + // Returns true if the current type of the node is a number (integer or floating-point). + inline bool is_number() const { + return is_integer() || is_unsigned_integer() || is_double(); + } + // Returns true if the current type of the node is a bool with true value. + inline bool is_true() const { return get_type() == 't'; } + // Returns true if the current type of the node is a bool with false value. + inline bool is_false() const { return get_type() == 'f'; } + // Returns true if the current type of the node is null. + inline bool is_null() const { return get_type() == 'n'; } + // Returns true if the type byte represents an object of an array + static bool is_object_or_array(uint8_t type) { + return ((type == '[') || (type == '{')); + } + + // when at {, go one level deep, looking for a given key + // if successful, we are left pointing at the value, + // if not, we are still pointing at the object ({) + // (in case of repeated keys, this only finds the first one). + // We seek the key using C's strcmp so if your JSON strings contain + // NULL chars, this would trigger a false positive: if you expect that + // to be the case, take extra precautions. + // Furthermore, we do the comparison character-by-character + // without taking into account Unicode equivalence. + inline bool move_to_key(const char *key); + + // as above, but case insensitive lookup (strcmpi instead of strcmp) + inline bool move_to_key_insensitive(const char *key); + + // when at {, go one level deep, looking for a given key + // if successful, we are left pointing at the value, + // if not, we are still pointing at the object ({) + // (in case of repeated keys, this only finds the first one). + // The string we search for can contain NULL values. + // Furthermore, we do the comparison character-by-character + // without taking into account Unicode equivalence. + inline bool move_to_key(const char *key, uint32_t length); + + // when at a key location within an object, this moves to the accompanying + // value (located next to it). This is equivalent but much faster than + // calling "next()". + inline void move_to_value(); + + // when at [, go one level deep, and advance to the given index. + // if successful, we are left pointing at the value, + // if not, we are still pointing at the array ([) + inline bool move_to_index(uint32_t index); + + // Moves the iterator to the value corresponding to the json pointer. + // Always search from the root of the document. + // if successful, we are left pointing at the value, + // if not, we are still pointing the same value we were pointing before the + // call. 
The json pointer follows the rfc6901 standard's syntax: + // https://tools.ietf.org/html/rfc6901 However, the standard says "If a + // referenced member name is not unique in an object, the member that is + // referenced is undefined, and evaluation fails". Here we just return the + // first corresponding value. The length parameter is the length of the + // jsonpointer string ('pointer'). + inline bool move_to(const char *pointer, uint32_t length); + + // Moves the iterator to the value corresponding to the json pointer. + // Always search from the root of the document. + // if successful, we are left pointing at the value, + // if not, we are still pointing the same value we were pointing before the + // call. The json pointer implementation follows the rfc6901 standard's + // syntax: https://tools.ietf.org/html/rfc6901 However, the standard says + // "If a referenced member name is not unique in an object, the member that + // is referenced is undefined, and evaluation fails". Here we just return + // the first corresponding value. + inline bool move_to(const std::string &pointer) { + return move_to(pointer.c_str(), uint32_t(pointer.length())); + } + + private: + // Almost the same as move_to(), except it searches from the current + // position. The pointer's syntax is identical, though that case is not + // handled by the rfc6901 standard. The '/' is still required at the + // beginning. However, contrary to move_to(), the URI Fragment Identifier + // Representation is not supported here. Also, in case of failure, we are + // left pointing at the closest value it could reach. For these reasons it + // is private. It exists because it is used by move_to(). + inline bool relative_move_to(const char *pointer, uint32_t length); + + public: + // throughout return true if we can do the navigation, false + // otherwise + + // Within a given scope (series of nodes at the same depth within either an + // array or an object), we move forward. + // Thus, given [true, null, {"a":1}, [1,2]], we would visit true, null, { + // and [. At the object ({) or at the array ([), you can issue a "down" to + // visit their content. valid if we're not at the end of a scope (returns + // true). + inline bool next(); + + // Within a given scope (series of nodes at the same depth within either an + // array or an object), we move backward. + // Thus, given [true, null, {"a":1}, [1,2]], we would visit ], }, null, true + // when starting at the end of the scope. At the object ({) or at the array + // ([), you can issue a "down" to visit their content. + // Performance warning: This function is implemented by starting again + // from the beginning of the scope and scanning forward. You should expect + // it to be relatively slow. + inline bool prev(); + + // Moves back to either the containing array or object (type { or [) from + // within a contained scope. + // Valid unless we are at the first level of the document + inline bool up(); + + // Valid if we're at a [ or { and it starts a non-empty scope; moves us to + // start of that deeper scope if it not empty. Thus, given [true, null, + // {"a":1}, [1,2]], if we are at the { node, we would move to the "a" node. 
+ inline bool down(); + + // move us to the start of our current scope, + // a scope is a series of nodes at the same level + inline void to_start_scope(); + + inline void rewind() { + while (up()) + ; + } + + + + // print the node we are currently pointing at + inline bool print(std::ostream &os, bool escape_strings = true) const; + + private: + const document &doc; + size_t max_depth{}; + size_t depth{}; + size_t location{}; // our current location on a tape + size_t tape_length{}; + uint8_t current_type{}; + uint64_t current_val{}; + typedef struct { + size_t start_of_scope; + uint8_t scope_type; + } scopeindex_t; + + scopeindex_t *depth_index{}; +}; + +} // namespace simdjson +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +#endif // SIMDJSON_DOM_PARSEDJSON_ITERATOR_H +/* end file include/simdjson/dom/parsedjson_iterator.h */ + +// Inline functions +/* begin file include/simdjson/dom/array-inl.h */ +#ifndef SIMDJSON_INLINE_ARRAY_H +#define SIMDJSON_INLINE_ARRAY_H + +// Inline implementations go in here. + +#include + +namespace simdjson { + +// +// simdjson_result inline implementation +// +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} +simdjson_inline simdjson_result::simdjson_result(dom::array value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} + +#if SIMDJSON_EXCEPTIONS + +inline dom::array::iterator simdjson_result::begin() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +inline dom::array::iterator simdjson_result::end() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} +inline size_t simdjson_result::size() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.size(); +} + +#endif // SIMDJSON_EXCEPTIONS + +inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) const noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +inline simdjson_result simdjson_result::at(size_t index) const noexcept { + if (error()) { return error(); } + return first.at(index); +} + +namespace dom { + +// +// array inline implementation +// +simdjson_inline array::array() noexcept : tape{} {} +simdjson_inline array::array(const internal::tape_ref &_tape) noexcept : tape{_tape} {} +inline array::iterator array::begin() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed array is invalid +#endif + return internal::tape_ref(tape.doc, tape.json_index + 1); +} +inline array::iterator array::end() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed array is invalid +#endif + return internal::tape_ref(tape.doc, tape.after_element() - 1); +} +inline size_t array::size() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed array is invalid +#endif + return tape.scope_count(); +} +inline size_t array::number_of_slots() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed array is invalid 
+#endif + return tape.matching_brace_index() - tape.json_index; +} +inline simdjson_result array::at_pointer(std::string_view json_pointer) const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed array is invalid +#endif + if(json_pointer.empty()) { // an empty string means that we return the current node + return element(this->tape); // copy the current node + } else if(json_pointer[0] != '/') { // otherwise there is an error + return INVALID_JSON_POINTER; + } + json_pointer = json_pointer.substr(1); + // - means "the append position" or "the element after the end of the array" + // We don't support this, because we're returning a real element, not a position. + if (json_pointer == "-") { return INDEX_OUT_OF_BOUNDS; } + + // Read the array index + size_t array_index = 0; + size_t i; + for (i = 0; i < json_pointer.length() && json_pointer[i] != '/'; i++) { + uint8_t digit = uint8_t(json_pointer[i] - '0'); + // Check for non-digit in array index. If it's there, we're trying to get a field in an object + if (digit > 9) { return INCORRECT_TYPE; } + array_index = array_index*10 + digit; + } + + // 0 followed by other digits is invalid + if (i > 1 && json_pointer[0] == '0') { return INVALID_JSON_POINTER; } // "JSON pointer array index has other characters after 0" + + // Empty string is invalid; so is a "/" with no digits before it + if (i == 0) { return INVALID_JSON_POINTER; } // "Empty string in JSON pointer array index" + + // Get the child + auto child = array(tape).at(array_index); + // If there is an error, it ends here + if(child.error()) { + return child; + } + // If there is a /, we're not done yet, call recursively. + if (i < json_pointer.length()) { + child = child.at_pointer(json_pointer.substr(i)); + } + return child; +} + +inline simdjson_result array::at(size_t index) const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed array is invalid +#endif + size_t i=0; + for (auto element : *this) { + if (i == index) { return element; } + i++; + } + return INDEX_OUT_OF_BOUNDS; +} + +// +// array::iterator inline implementation +// +simdjson_inline array::iterator::iterator(const internal::tape_ref &_tape) noexcept : tape{_tape} { } +inline element array::iterator::operator*() const noexcept { + return element(tape); +} +inline array::iterator& array::iterator::operator++() noexcept { + tape.json_index = tape.after_element(); + return *this; +} +inline array::iterator array::iterator::operator++(int) noexcept { + array::iterator out = *this; + ++*this; + return out; +} +inline bool array::iterator::operator!=(const array::iterator& other) const noexcept { + return tape.json_index != other.tape.json_index; +} +inline bool array::iterator::operator==(const array::iterator& other) const noexcept { + return tape.json_index == other.tape.json_index; +} +inline bool array::iterator::operator<(const array::iterator& other) const noexcept { + return tape.json_index < other.tape.json_index; +} +inline bool array::iterator::operator<=(const array::iterator& other) const noexcept { + return tape.json_index <= other.tape.json_index; +} +inline bool array::iterator::operator>=(const array::iterator& other) const noexcept { + return tape.json_index >= other.tape.json_index; +} +inline bool array::iterator::operator>(const array::iterator& other) const noexcept { + return tape.json_index > 
other.tape.json_index; +} + +} // namespace dom + + +} // namespace simdjson + +/* begin file include/simdjson/dom/element-inl.h */ +#ifndef SIMDJSON_INLINE_ELEMENT_H +#define SIMDJSON_INLINE_ELEMENT_H + +#include +#include + +namespace simdjson { + +// +// simdjson_result inline implementation +// +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} +simdjson_inline simdjson_result::simdjson_result(dom::element &&value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} +inline simdjson_result simdjson_result::type() const noexcept { + if (error()) { return error(); } + return first.type(); +} + +template +simdjson_inline bool simdjson_result::is() const noexcept { + return !error() && first.is(); +} +template +simdjson_inline simdjson_result simdjson_result::get() const noexcept { + if (error()) { return error(); } + return first.get(); +} +template +simdjson_warn_unused simdjson_inline error_code simdjson_result::get(T &value) const noexcept { + if (error()) { return error(); } + return first.get(value); +} + +simdjson_inline simdjson_result simdjson_result::get_array() const noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() const noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_c_str() const noexcept { + if (error()) { return error(); } + return first.get_c_str(); +} +simdjson_inline simdjson_result simdjson_result::get_string_length() const noexcept { + if (error()) { return error(); } + return first.get_string_length(); +} +simdjson_inline simdjson_result simdjson_result::get_string() const noexcept { + if (error()) { return error(); } + return first.get_string(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() const noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() const noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_double() const noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() const noexcept { + if (error()) { return error(); } + return first.get_bool(); +} + +simdjson_inline bool simdjson_result::is_array() const noexcept { + return !error() && first.is_array(); +} +simdjson_inline bool simdjson_result::is_object() const noexcept { + return !error() && first.is_object(); +} +simdjson_inline bool simdjson_result::is_string() const noexcept { + return !error() && first.is_string(); +} +simdjson_inline bool simdjson_result::is_int64() const noexcept { + return !error() && first.is_int64(); +} +simdjson_inline bool simdjson_result::is_uint64() const noexcept { + return !error() && first.is_uint64(); +} +simdjson_inline bool simdjson_result::is_double() const noexcept { + return !error() && first.is_double(); +} +simdjson_inline bool simdjson_result::is_number() const noexcept { + return !error() && first.is_number(); +} +simdjson_inline bool simdjson_result::is_bool() const noexcept { + return !error() && first.is_bool(); +} + +simdjson_inline bool simdjson_result::is_null() const noexcept { + return !error() && first.is_null(); +} + +simdjson_inline 
simdjson_result simdjson_result::operator[](std::string_view key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::at_pointer(const std::string_view json_pointer) const noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] +simdjson_inline simdjson_result simdjson_result::at(const std::string_view json_pointer) const noexcept { +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_DEPRECATED_WARNING + if (error()) { return error(); } + return first.at(json_pointer); +SIMDJSON_POP_DISABLE_WARNINGS +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API +simdjson_inline simdjson_result simdjson_result::at(size_t index) const noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline simdjson_result simdjson_result::at_key(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key(key); +} +simdjson_inline simdjson_result simdjson_result::at_key_case_insensitive(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key_case_insensitive(key); +} + +#if SIMDJSON_EXCEPTIONS + +simdjson_inline simdjson_result::operator bool() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator const char *() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator std::string_view() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator uint64_t() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator int64_t() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator double() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator dom::array() const noexcept(false) { + return get(); +} +simdjson_inline simdjson_result::operator dom::object() const noexcept(false) { + return get(); +} + +simdjson_inline dom::array::iterator simdjson_result::begin() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +simdjson_inline dom::array::iterator simdjson_result::end() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} + +#endif // SIMDJSON_EXCEPTIONS + +namespace dom { + +// +// element inline implementation +// +simdjson_inline element::element() noexcept : tape{} {} +simdjson_inline element::element(const internal::tape_ref &_tape) noexcept : tape{_tape} { } + +inline element_type element::type() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + auto tape_type = tape.tape_ref_type(); + return tape_type == internal::tape_type::FALSE_VALUE ? 
element_type::BOOL : static_cast(tape_type); +} + +inline simdjson_result element::get_bool() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + if(tape.is_true()) { + return true; + } else if(tape.is_false()) { + return false; + } + return INCORRECT_TYPE; +} +inline simdjson_result element::get_c_str() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + switch (tape.tape_ref_type()) { + case internal::tape_type::STRING: { + return tape.get_c_str(); + } + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_string_length() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + switch (tape.tape_ref_type()) { + case internal::tape_type::STRING: { + return tape.get_string_length(); + } + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_string() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + switch (tape.tape_ref_type()) { + case internal::tape_type::STRING: + return tape.get_string_view(); + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_uint64() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + if(simdjson_unlikely(!tape.is_uint64())) { // branch rarely taken + if(tape.is_int64()) { + int64_t result = tape.next_tape_value(); + if (result < 0) { + return NUMBER_OUT_OF_RANGE; + } + return uint64_t(result); + } + return INCORRECT_TYPE; + } + return tape.next_tape_value(); +} +inline simdjson_result element::get_int64() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + if(simdjson_unlikely(!tape.is_int64())) { // branch rarely taken + if(tape.is_uint64()) { + uint64_t result = tape.next_tape_value(); + // Wrapping max in parens to handle Windows issue: https://stackoverflow.com/questions/11544073/how-do-i-deal-with-the-max-macro-in-windows-h-colliding-with-max-in-std + if (result > uint64_t((std::numeric_limits::max)())) { + return NUMBER_OUT_OF_RANGE; + } + return static_cast(result); + } + return INCORRECT_TYPE; + } + return tape.next_tape_value(); +} +inline simdjson_result element::get_double() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + // Performance considerations: + // 1. Querying tape_ref_type() implies doing a shift, it is fast to just do a straight + // comparison. + // 2. Using a switch-case relies on the compiler guessing what kind of code generation + // we want... But the compiler cannot know that we expect the type to be "double" + // most of the time. + // We can expect get to refer to a double type almost all the time. 
+ // It is important to craft the code accordingly so that the compiler can use this + // information. (This could also be solved with profile-guided optimization.) + if(simdjson_unlikely(!tape.is_double())) { // branch rarely taken + if(tape.is_uint64()) { + return double(tape.next_tape_value()); + } else if(tape.is_int64()) { + return double(tape.next_tape_value()); + } + return INCORRECT_TYPE; + } + // this is common: + return tape.next_tape_value(); +} +inline simdjson_result element::get_array() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + switch (tape.tape_ref_type()) { + case internal::tape_type::START_ARRAY: + return array(tape); + default: + return INCORRECT_TYPE; + } +} +inline simdjson_result element::get_object() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + switch (tape.tape_ref_type()) { + case internal::tape_type::START_OBJECT: + return object(tape); + default: + return INCORRECT_TYPE; + } +} + +template +simdjson_warn_unused simdjson_inline error_code element::get(T &value) const noexcept { + return get().get(value); +} +// An element-specific version prevents recursion with simdjson_result::get(value) +template<> +simdjson_warn_unused simdjson_inline error_code element::get(element &value) const noexcept { + value = element(tape); + return SUCCESS; +} +template +inline void element::tie(T &value, error_code &error) && noexcept { + error = get(value); +} + +template +simdjson_inline bool element::is() const noexcept { + auto result = get(); + return !result.error(); +} + +template<> inline simdjson_result element::get() const noexcept { return get_array(); } +template<> inline simdjson_result element::get() const noexcept { return get_object(); } +template<> inline simdjson_result element::get() const noexcept { return get_c_str(); } +template<> inline simdjson_result element::get() const noexcept { return get_string(); } +template<> inline simdjson_result element::get() const noexcept { return get_int64(); } +template<> inline simdjson_result element::get() const noexcept { return get_uint64(); } +template<> inline simdjson_result element::get() const noexcept { return get_double(); } +template<> inline simdjson_result element::get() const noexcept { return get_bool(); } + +inline bool element::is_array() const noexcept { return is(); } +inline bool element::is_object() const noexcept { return is(); } +inline bool element::is_string() const noexcept { return is(); } +inline bool element::is_int64() const noexcept { return is(); } +inline bool element::is_uint64() const noexcept { return is(); } +inline bool element::is_double() const noexcept { return is(); } +inline bool element::is_bool() const noexcept { return is(); } +inline bool element::is_number() const noexcept { return is_int64() || is_uint64() || is_double(); } + +inline bool element::is_null() const noexcept { + return tape.is_null_on_tape(); +} + +#if SIMDJSON_EXCEPTIONS + +inline element::operator bool() const noexcept(false) { return get(); } +inline element::operator const char*() const noexcept(false) { return get(); } +inline element::operator std::string_view() const noexcept(false) { return get(); } +inline element::operator uint64_t() const noexcept(false) { return get(); } +inline element::operator int64_t() const 
noexcept(false) { return get(); } +inline element::operator double() const noexcept(false) { return get(); } +inline element::operator array() const noexcept(false) { return get(); } +inline element::operator object() const noexcept(false) { return get(); } + +inline array::iterator element::begin() const noexcept(false) { + return get().begin(); +} +inline array::iterator element::end() const noexcept(false) { + return get().end(); +} + +#endif // SIMDJSON_EXCEPTIONS + +inline simdjson_result element::operator[](std::string_view key) const noexcept { + return at_key(key); +} +inline simdjson_result element::operator[](const char *key) const noexcept { + return at_key(key); +} + +inline simdjson_result element::at_pointer(std::string_view json_pointer) const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + switch (tape.tape_ref_type()) { + case internal::tape_type::START_OBJECT: + return object(tape).at_pointer(json_pointer); + case internal::tape_type::START_ARRAY: + return array(tape).at_pointer(json_pointer); + default: { + if(!json_pointer.empty()) { // a non-empty string is invalid on an atom + return INVALID_JSON_POINTER; + } + // an empty string means that we return the current node + dom::element copy(*this); + return simdjson_result(std::move(copy)); + } + } +} +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +[[deprecated("For standard compliance, use at_pointer instead, and prefix your pointers with a slash '/', see RFC6901 ")]] +inline simdjson_result element::at(std::string_view json_pointer) const noexcept { + // version 0.4 of simdjson allowed non-compliant pointers + auto std_pointer = (json_pointer.empty() ? 
"" : "/") + std::string(json_pointer.begin(), json_pointer.end()); + return at_pointer(std_pointer); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +inline simdjson_result element::at(size_t index) const noexcept { + return get().at(index); +} +inline simdjson_result element::at_key(std::string_view key) const noexcept { + return get().at_key(key); +} +inline simdjson_result element::at_key_case_insensitive(std::string_view key) const noexcept { + return get().at_key_case_insensitive(key); +} + +inline bool element::dump_raw_tape(std::ostream &out) const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed element is invalid +#endif + return tape.doc->dump_raw_tape(out); +} + + +inline std::ostream& operator<<(std::ostream& out, element_type type) { + switch (type) { + case element_type::ARRAY: + return out << "array"; + case element_type::OBJECT: + return out << "object"; + case element_type::INT64: + return out << "int64_t"; + case element_type::UINT64: + return out << "uint64_t"; + case element_type::DOUBLE: + return out << "double"; + case element_type::STRING: + return out << "string"; + case element_type::BOOL: + return out << "bool"; + case element_type::NULL_VALUE: + return out << "null"; + default: + return out << "unexpected content!!!"; // abort() usage is forbidden in the library + } +} + +} // namespace dom + +} // namespace simdjson + +#endif // SIMDJSON_INLINE_ELEMENT_H +/* end file include/simdjson/dom/element-inl.h */ + +#if defined(__cpp_lib_ranges) +static_assert(std::ranges::view); +static_assert(std::ranges::sized_range); +#if SIMDJSON_EXCEPTIONS +static_assert(std::ranges::view>); +static_assert(std::ranges::sized_range>); +#endif // SIMDJSON_EXCEPTIONS +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_INLINE_ARRAY_H +/* end file include/simdjson/dom/array-inl.h */ +/* begin file include/simdjson/dom/document_stream-inl.h */ +#ifndef SIMDJSON_INLINE_DOCUMENT_STREAM_H +#define SIMDJSON_INLINE_DOCUMENT_STREAM_H + +#include +#include +#include +namespace simdjson { +namespace dom { + +#ifdef SIMDJSON_THREADS_ENABLED +inline void stage1_worker::finish() { + // After calling "run" someone would call finish() to wait + // for the end of the processing. + // This function will wait until either the thread has done + // the processing or, else, the destructor has been called. + std::unique_lock lock(locking_mutex); + cond_var.wait(lock, [this]{return has_work == false;}); +} + +inline stage1_worker::~stage1_worker() { + // The thread may never outlive the stage1_worker instance + // and will always be stopped/joined before the stage1_worker + // instance is gone. + stop_thread(); +} + +inline void stage1_worker::start_thread() { + std::unique_lock lock(locking_mutex); + if(thread.joinable()) { + return; // This should never happen but we never want to create more than one thread. + } + thread = std::thread([this]{ + while(true) { + std::unique_lock thread_lock(locking_mutex); + // We wait for either "run" or "stop_thread" to be called. + cond_var.wait(thread_lock, [this]{return has_work || !can_work;}); + // If, for some reason, the stop_thread() method was called (i.e., the + // destructor of stage1_worker is called, then we want to immediately destroy + // the thread (and not do any more processing). 
+ if(!can_work) { + break; + } + this->owner->stage1_thread_error = this->owner->run_stage1(*this->stage1_thread_parser, + this->_next_batch_start); + this->has_work = false; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify "finish" + thread_lock.unlock(); + } + } + ); +} + + +inline void stage1_worker::stop_thread() { + std::unique_lock lock(locking_mutex); + // We have to make sure that all locks can be released. + can_work = false; + has_work = false; + cond_var.notify_all(); + lock.unlock(); + if(thread.joinable()) { + thread.join(); + } +} + +inline void stage1_worker::run(document_stream * ds, dom::parser * stage1, size_t next_batch_start) { + std::unique_lock lock(locking_mutex); + owner = ds; + _next_batch_start = next_batch_start; + stage1_thread_parser = stage1; + has_work = true; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify the thread lock that we have work + lock.unlock(); +} +#endif + +simdjson_inline document_stream::document_stream( + dom::parser &_parser, + const uint8_t *_buf, + size_t _len, + size_t _batch_size +) noexcept + : parser{&_parser}, + buf{_buf}, + len{_len}, + batch_size{_batch_size <= MINIMAL_BATCH_SIZE ? MINIMAL_BATCH_SIZE : _batch_size}, + error{SUCCESS} +#ifdef SIMDJSON_THREADS_ENABLED + , use_thread(_parser.threaded) // we need to make a copy because _parser.threaded can change +#endif +{ +#ifdef SIMDJSON_THREADS_ENABLED + if(worker.get() == nullptr) { + error = MEMALLOC; + } +#endif +} + +simdjson_inline document_stream::document_stream() noexcept + : parser{nullptr}, + buf{nullptr}, + len{0}, + batch_size{0}, + error{UNINITIALIZED} +#ifdef SIMDJSON_THREADS_ENABLED + , use_thread(false) +#endif +{ +} + +simdjson_inline document_stream::~document_stream() noexcept { +#ifdef SIMDJSON_THREADS_ENABLED + worker.reset(); +#endif +} + +simdjson_inline document_stream::iterator::iterator() noexcept + : stream{nullptr}, finished{true} { +} + +simdjson_inline document_stream::iterator document_stream::begin() noexcept { + start(); + // If there are no documents, we're finished. + return iterator(this, error == EMPTY); +} + +simdjson_inline document_stream::iterator document_stream::end() noexcept { + return iterator(this, true); +} + +simdjson_inline document_stream::iterator::iterator(document_stream* _stream, bool is_end) noexcept + : stream{_stream}, finished{is_end} { +} + +simdjson_inline document_stream::iterator::reference document_stream::iterator::operator*() noexcept { + // Note that in case of error, we do not yet mark + // the iterator as "finished": this detection is done + // in the operator++ function since it is possible + // to call operator++ repeatedly while omitting + // calls to operator*. + if (stream->error) { return stream->error; } + return stream->parser->doc.root(); +} + +simdjson_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { + // If there is an error, then we want the iterator + // to be finished, no matter what. 
(E.g., we do not + // keep generating documents with errors, or go beyond + // a document with errors.) + // + // Users do not have to call "operator*()" when they use operator++, + // so we need to end the stream in the operator++ function. + // + // Note that setting finished = true is essential otherwise + // we would enter an infinite loop. + if (stream->error) { finished = true; } + // Note that stream->error() is guarded against error conditions + // (it will immediately return if stream->error casts to false). + // In effect, this next function does nothing when (stream->error) + // is true (hence the risk of an infinite loop). + stream->next(); + // If that was the last document, we're finished. + // It is the only type of error we do not want to appear + // in operator*. + if (stream->error == EMPTY) { finished = true; } + // If we had any other kind of error (not EMPTY) then we want + // to pass it along to the operator* and we cannot mark the result + // as "finished" just yet. + return *this; +} + +simdjson_inline bool document_stream::iterator::operator!=(const document_stream::iterator &other) const noexcept { + return finished != other.finished; +} + +inline void document_stream::start() noexcept { + if (error) { return; } + error = parser->ensure_capacity(batch_size); + if (error) { return; } + // Always run the first stage 1 parse immediately + batch_start = 0; + error = run_stage1(*parser, batch_start); + while(error == EMPTY) { + // In exceptional cases, we may start with an empty block + batch_start = next_batch_start(); + if (batch_start >= len) { return; } + error = run_stage1(*parser, batch_start); + } + if (error) { return; } +#ifdef SIMDJSON_THREADS_ENABLED + if (use_thread && next_batch_start() < len) { + // Kick off the first thread if needed + error = stage1_thread_parser.ensure_capacity(batch_size); + if (error) { return; } + worker->start_thread(); + start_stage1_thread(); + if (error) { return; } + } +#endif // SIMDJSON_THREADS_ENABLED + next(); +} + +simdjson_inline size_t document_stream::iterator::current_index() const noexcept { + return stream->doc_index; +} + +simdjson_inline std::string_view document_stream::iterator::source() const noexcept { + const char* start = reinterpret_cast(stream->buf) + current_index(); + bool object_or_array = ((*start == '[') || (*start == '{')); + if(object_or_array) { + size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index - 1]; + return std::string_view(start, next_doc_index - current_index() + 1); + } else { + size_t next_doc_index = stream->batch_start + stream->parser->implementation->structural_indexes[stream->parser->implementation->next_structural_index]; + return std::string_view(reinterpret_cast(stream->buf) + current_index(), next_doc_index - current_index() - 1); + } +} + + +inline void document_stream::next() noexcept { + // We always exit at once, once in an error condition. 
+ if (error) { return; } + + // Load the next document from the batch + doc_index = batch_start + parser->implementation->structural_indexes[parser->implementation->next_structural_index]; + error = parser->implementation->stage2_next(parser->doc); + // If that was the last document in the batch, load another batch (if available) + while (error == EMPTY) { + batch_start = next_batch_start(); + if (batch_start >= len) { break; } + +#ifdef SIMDJSON_THREADS_ENABLED + if(use_thread) { + load_from_stage1_thread(); + } else { + error = run_stage1(*parser, batch_start); + } +#else + error = run_stage1(*parser, batch_start); +#endif + if (error) { continue; } // If the error was EMPTY, we may want to load another batch. + // Run stage 2 on the first document in the batch + doc_index = batch_start + parser->implementation->structural_indexes[parser->implementation->next_structural_index]; + error = parser->implementation->stage2_next(parser->doc); + } +} +inline size_t document_stream::size_in_bytes() const noexcept { + return len; +} + +inline size_t document_stream::truncated_bytes() const noexcept { + if(error == CAPACITY) { return len - batch_start; } + return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1]; +} + +inline size_t document_stream::next_batch_start() const noexcept { + return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; +} + +inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept { + size_t remaining = len - _batch_start; + if (remaining <= batch_size) { + return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final); + } else { + return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial); + } +} + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void document_stream::load_from_stage1_thread() noexcept { + worker->finish(); + // Swap to the parser that was loaded up in the thread. Make sure the parser has + // enough memory to swap to, as well. + std::swap(*parser, stage1_thread_parser); + error = stage1_thread_error; + if (error) { return; } + + // If there's anything left, start the stage 1 thread! + if (next_batch_start() < len) { + start_stage1_thread(); + } +} + +inline void document_stream::start_stage1_thread() noexcept { + // we call the thread on a lambda that will update + // this->stage1_thread_error + // there is only one thread that may write to this value + // TODO this is NOT exception-safe. 
+ this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error + size_t _next_batch_start = this->next_batch_start(); + + worker->run(this, & this->stage1_thread_parser, _next_batch_start); +} + +#endif // SIMDJSON_THREADS_ENABLED + +} // namespace dom + +simdjson_inline simdjson_result::simdjson_result() noexcept + : simdjson_result_base() { +} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : simdjson_result_base(error) { +} +simdjson_inline simdjson_result::simdjson_result(dom::document_stream &&value) noexcept + : simdjson_result_base(std::forward(value)) { +} + +#if SIMDJSON_EXCEPTIONS +simdjson_inline dom::document_stream::iterator simdjson_result::begin() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +simdjson_inline dom::document_stream::iterator simdjson_result::end() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} +#else // SIMDJSON_EXCEPTIONS +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +simdjson_inline dom::document_stream::iterator simdjson_result::begin() noexcept { + first.error = error(); + return first.begin(); +} +simdjson_inline dom::document_stream::iterator simdjson_result::end() noexcept { + first.error = error(); + return first.end(); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API +#endif // SIMDJSON_EXCEPTIONS + +} // namespace simdjson +#endif // SIMDJSON_INLINE_DOCUMENT_STREAM_H +/* end file include/simdjson/dom/document_stream-inl.h */ +/* begin file include/simdjson/dom/document-inl.h */ +#ifndef SIMDJSON_INLINE_DOCUMENT_H +#define SIMDJSON_INLINE_DOCUMENT_H + +// Inline implementations go in here. + +#include +#include + +namespace simdjson { +namespace dom { + +// +// document inline implementation +// +inline element document::root() const noexcept { + return element(internal::tape_ref(this, 1)); +} +simdjson_warn_unused +inline size_t document::capacity() const noexcept { + return allocated_capacity; +} + +simdjson_warn_unused +inline error_code document::allocate(size_t capacity) noexcept { + if (capacity == 0) { + string_buf.reset(); + tape.reset(); + allocated_capacity = 0; + return SUCCESS; + } + + // a pathological input like "[[[[..." would generate capacity tape elements, so + // need a capacity of at least capacity + 1, but it is also possible to do + // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" + //where capacity + 1 tape elements are + // generated, see issue https://github.com/simdjson/simdjson/issues/345 + size_t tape_capacity = SIMDJSON_ROUNDUP_N(capacity + 3, 64); + // a document with only zero-length strings... could have capacity/3 string + // and we would need capacity/3 * 5 bytes on the string buffer + size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * capacity / 3 + SIMDJSON_PADDING, 64); + string_buf.reset( new (std::nothrow) uint8_t[string_capacity]); + tape.reset(new (std::nothrow) uint64_t[tape_capacity]); + if(!(string_buf && tape)) { + allocated_capacity = 0; + string_buf.reset(); + tape.reset(); + return MEMALLOC; + } + // Technically the allocated_capacity might be larger than capacity + // so the next line is pessimistic. 
+ allocated_capacity = capacity; + return SUCCESS; +} + +inline bool document::dump_raw_tape(std::ostream &os) const noexcept { + uint32_t string_length; + size_t tape_idx = 0; + uint64_t tape_val = tape[tape_idx]; + uint8_t type = uint8_t(tape_val >> 56); + os << tape_idx << " : " << type; + tape_idx++; + size_t how_many = 0; + if (type == 'r') { + how_many = size_t(tape_val & internal::JSON_VALUE_MASK); + } else { + // Error: no starting root node? + return false; + } + os << "\t// pointing to " << how_many << " (right after last node)\n"; + uint64_t payload; + for (; tape_idx < how_many; tape_idx++) { + os << tape_idx << " : "; + tape_val = tape[tape_idx]; + payload = tape_val & internal::JSON_VALUE_MASK; + type = uint8_t(tape_val >> 56); + switch (type) { + case '"': // we have a string + os << "string \""; + std::memcpy(&string_length, string_buf.get() + payload, sizeof(uint32_t)); + os << internal::escape_json_string(std::string_view( + reinterpret_cast(string_buf.get() + payload + sizeof(uint32_t)), + string_length + )); + os << '"'; + os << '\n'; + break; + case 'l': // we have a long int + if (tape_idx + 1 >= how_many) { + return false; + } + os << "integer " << static_cast(tape[++tape_idx]) << "\n"; + break; + case 'u': // we have a long uint + if (tape_idx + 1 >= how_many) { + return false; + } + os << "unsigned integer " << tape[++tape_idx] << "\n"; + break; + case 'd': // we have a double + os << "float "; + if (tape_idx + 1 >= how_many) { + return false; + } + double answer; + std::memcpy(&answer, &tape[++tape_idx], sizeof(answer)); + os << answer << '\n'; + break; + case 'n': // we have a null + os << "null\n"; + break; + case 't': // we have a true + os << "true\n"; + break; + case 'f': // we have a false + os << "false\n"; + break; + case '{': // we have an object + os << "{\t// pointing to next tape location " << uint32_t(payload) + << " (first node after the scope), " + << " saturated count " + << ((payload >> 32) & internal::JSON_COUNT_MASK)<< "\n"; + break; case '}': // we end an object + os << "}\t// pointing to previous tape location " << uint32_t(payload) + << " (start of the scope)\n"; + break; + case '[': // we start an array + os << "[\t// pointing to next tape location " << uint32_t(payload) + << " (first node after the scope), " + << " saturated count " + << ((payload >> 32) & internal::JSON_COUNT_MASK)<< "\n"; + break; + case ']': // we end an array + os << "]\t// pointing to previous tape location " << uint32_t(payload) + << " (start of the scope)\n"; + break; + case 'r': // we start and end with the root node + // should we be hitting the root node? 
+ return false; + default: + return false; + } + } + tape_val = tape[tape_idx]; + payload = tape_val & internal::JSON_VALUE_MASK; + type = uint8_t(tape_val >> 56); + os << tape_idx << " : " << type << "\t// pointing to " << payload + << " (start root)\n"; + return true; +} + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_INLINE_DOCUMENT_H +/* end file include/simdjson/dom/document-inl.h */ +/* begin file include/simdjson/dom/object-inl.h */ +#ifndef SIMDJSON_INLINE_OBJECT_H +#define SIMDJSON_INLINE_OBJECT_H + +#include +#include + +namespace simdjson { + +// +// simdjson_result inline implementation +// +simdjson_inline simdjson_result::simdjson_result() noexcept + : internal::simdjson_result_base() {} +simdjson_inline simdjson_result::simdjson_result(dom::object value) noexcept + : internal::simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : internal::simdjson_result_base(error) {} + +inline simdjson_result simdjson_result::operator[](std::string_view key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +inline simdjson_result simdjson_result::operator[](const char *key) const noexcept { + if (error()) { return error(); } + return first[key]; +} +inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) const noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +inline simdjson_result simdjson_result::at_key(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key(key); +} +inline simdjson_result simdjson_result::at_key_case_insensitive(std::string_view key) const noexcept { + if (error()) { return error(); } + return first.at_key_case_insensitive(key); +} + +#if SIMDJSON_EXCEPTIONS + +inline dom::object::iterator simdjson_result::begin() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.begin(); +} +inline dom::object::iterator simdjson_result::end() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.end(); +} +inline size_t simdjson_result::size() const noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first.size(); +} + +#endif // SIMDJSON_EXCEPTIONS + +namespace dom { + +// +// object inline implementation +// +simdjson_inline object::object() noexcept : tape{} {} +simdjson_inline object::object(const internal::tape_ref &_tape) noexcept : tape{_tape} { } +inline object::iterator object::begin() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed object is invalid +#endif + return internal::tape_ref(tape.doc, tape.json_index + 1); +} +inline object::iterator object::end() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed object is invalid +#endif + return internal::tape_ref(tape.doc, tape.after_element() - 1); +} +inline size_t object::size() const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed object is invalid +#endif + return tape.scope_count(); +} + +inline simdjson_result object::operator[](std::string_view key) const noexcept { + return at_key(key); +} +inline simdjson_result object::operator[](const char *key) const noexcept { + 
return at_key(key); +} +inline simdjson_result object::at_pointer(std::string_view json_pointer) const noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + // issue https://github.com/simdjson/simdjson/issues/1914 + assert (tape.usable()); // the default constructed object is invalid +#endif + if(json_pointer.empty()) { // an empty string means that we return the current node + return element(this->tape); // copy the current node + } else if(json_pointer[0] != '/') { // otherwise there is an error + return INVALID_JSON_POINTER; + } + json_pointer = json_pointer.substr(1); + size_t slash = json_pointer.find('/'); + std::string_view key = json_pointer.substr(0, slash); + // Grab the child with the given key + simdjson_result child; + + // If there is an escape character in the key, unescape it and then get the child. + size_t escape = key.find('~'); + if (escape != std::string_view::npos) { + // Unescape the key + std::string unescaped(key); + do { + switch (unescaped[escape+1]) { + case '0': + unescaped.replace(escape, 2, "~"); + break; + case '1': + unescaped.replace(escape, 2, "/"); + break; + default: + return INVALID_JSON_POINTER; // "Unexpected ~ escape character in JSON pointer"); + } + escape = unescaped.find('~', escape+1); + } while (escape != std::string::npos); + child = at_key(unescaped); + } else { + child = at_key(key); + } + if(child.error()) { + return child; // we do not continue if there was an error + } + // If there is a /, we have to recurse and look up more of the path + if (slash != std::string_view::npos) { + child = child.at_pointer(json_pointer.substr(slash)); + } + return child; +} + +inline simdjson_result object::at_key(std::string_view key) const noexcept { + iterator end_field = end(); + for (iterator field = begin(); field != end_field; ++field) { + if (field.key_equals(key)) { + return field.value(); + } + } + return NO_SUCH_FIELD; +} +// In case you wonder why we need this, please see +// https://github.com/simdjson/simdjson/issues/323 +// People do seek keys in a case-insensitive manner. 
+inline simdjson_result object::at_key_case_insensitive(std::string_view key) const noexcept { + iterator end_field = end(); + for (iterator field = begin(); field != end_field; ++field) { + if (field.key_equals_case_insensitive(key)) { + return field.value(); + } + } + return NO_SUCH_FIELD; +} + +// +// object::iterator inline implementation +// +simdjson_inline object::iterator::iterator(const internal::tape_ref &_tape) noexcept : tape{_tape} { } +inline const key_value_pair object::iterator::operator*() const noexcept { + return key_value_pair(key(), value()); +} +inline bool object::iterator::operator!=(const object::iterator& other) const noexcept { + return tape.json_index != other.tape.json_index; +} +inline bool object::iterator::operator==(const object::iterator& other) const noexcept { + return tape.json_index == other.tape.json_index; +} +inline bool object::iterator::operator<(const object::iterator& other) const noexcept { + return tape.json_index < other.tape.json_index; +} +inline bool object::iterator::operator<=(const object::iterator& other) const noexcept { + return tape.json_index <= other.tape.json_index; +} +inline bool object::iterator::operator>=(const object::iterator& other) const noexcept { + return tape.json_index >= other.tape.json_index; +} +inline bool object::iterator::operator>(const object::iterator& other) const noexcept { + return tape.json_index > other.tape.json_index; +} +inline object::iterator& object::iterator::operator++() noexcept { + tape.json_index++; + tape.json_index = tape.after_element(); + return *this; +} +inline object::iterator object::iterator::operator++(int) noexcept { + object::iterator out = *this; + ++*this; + return out; +} +inline std::string_view object::iterator::key() const noexcept { + return tape.get_string_view(); +} +inline uint32_t object::iterator::key_length() const noexcept { + return tape.get_string_length(); +} +inline const char* object::iterator::key_c_str() const noexcept { + return reinterpret_cast(&tape.doc->string_buf[size_t(tape.tape_value()) + sizeof(uint32_t)]); +} +inline element object::iterator::value() const noexcept { + return element(internal::tape_ref(tape.doc, tape.json_index + 1)); +} + +/** + * Design notes: + * Instead of constructing a string_view and then comparing it with a + * user-provided strings, it is probably more performant to have dedicated + * functions taking as a parameter the string we want to compare against + * and return true when they are equal. That avoids the creation of a temporary + * std::string_view. Though it is possible for the compiler to avoid entirely + * any overhead due to string_view, relying too much on compiler magic is + * problematic: compiler magic sometimes fail, and then what do you do? + * Also, enticing users to rely on high-performance function is probably better + * on the long run. + */ + +inline bool object::iterator::key_equals(std::string_view o) const noexcept { + // We use the fact that the key length can be computed quickly + // without access to the string buffer. + const uint32_t len = key_length(); + if(o.size() == len) { + // We avoid construction of a temporary string_view instance. + return (memcmp(o.data(), key_c_str(), len) == 0); + } + return false; +} + +inline bool object::iterator::key_equals_case_insensitive(std::string_view o) const noexcept { + // We use the fact that the key length can be computed quickly + // without access to the string buffer. 
+ const uint32_t len = key_length(); + if(o.size() == len) { + // See For case-insensitive string comparisons, avoid char-by-char functions + // https://lemire.me/blog/2020/04/30/for-case-insensitive-string-comparisons-avoid-char-by-char-functions/ + // Note that it might be worth rolling our own strncasecmp function, with vectorization. + return (simdjson_strncasecmp(o.data(), key_c_str(), len) == 0); + } + return false; +} +// +// key_value_pair inline implementation +// +inline key_value_pair::key_value_pair(std::string_view _key, element _value) noexcept : + key(_key), value(_value) {} + +} // namespace dom + +} // namespace simdjson + +#if defined(__cpp_lib_ranges) +static_assert(std::ranges::view); +static_assert(std::ranges::sized_range); +#if SIMDJSON_EXCEPTIONS +static_assert(std::ranges::view>); +static_assert(std::ranges::sized_range>); +#endif // SIMDJSON_EXCEPTIONS +#endif // defined(__cpp_lib_ranges) + +#endif // SIMDJSON_INLINE_OBJECT_H +/* end file include/simdjson/dom/object-inl.h */ +/* begin file include/simdjson/dom/parsedjson_iterator-inl.h */ +#ifndef SIMDJSON_INLINE_PARSEDJSON_ITERATOR_H +#define SIMDJSON_INLINE_PARSEDJSON_ITERATOR_H + +#include + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API + +namespace simdjson { + +// VS2017 reports deprecated warnings when you define a deprecated class's methods. +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_DEPRECATED_WARNING + +// Because of template weirdness, the actual class definition is inline in the document class +simdjson_warn_unused bool dom::parser::Iterator::is_ok() const { + return location < tape_length; +} + +// useful for debugging purposes +size_t dom::parser::Iterator::get_tape_location() const { + return location; +} + +// useful for debugging purposes +size_t dom::parser::Iterator::get_tape_length() const { + return tape_length; +} + +// returns the current depth (start at 1 with 0 reserved for the fictitious root +// node) +size_t dom::parser::Iterator::get_depth() const { + return depth; +} + +// A scope is a series of nodes at the same depth, typically it is either an +// object ({) or an array ([). The root node has type 'r'. +uint8_t dom::parser::Iterator::get_scope_type() const { + return depth_index[depth].scope_type; +} + +bool dom::parser::Iterator::move_forward() { + if (location + 1 >= tape_length) { + return false; // we are at the end! + } + + if ((current_type == '[') || (current_type == '{')) { + // We are entering a new scope + depth++; + assert(depth < max_depth); + depth_index[depth].start_of_scope = location; + depth_index[depth].scope_type = current_type; + } else if ((current_type == ']') || (current_type == '}')) { + // Leaving a scope. + depth--; + } else if (is_number()) { + // these types use 2 locations on the tape, not just one. + location += 1; + } + + location += 1; + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); + return true; +} + +void dom::parser::Iterator::move_to_value() { + // assume that we are on a key, so move by 1. 
+ location += 1; + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); +} + +bool dom::parser::Iterator::move_to_key(const char *key) { + if (down()) { + do { + const bool right_key = (strcmp(get_string(), key) == 0); + move_to_value(); + if (right_key) { + return true; + } + } while (next()); + up(); + } + return false; +} + +bool dom::parser::Iterator::move_to_key_insensitive( + const char *key) { + if (down()) { + do { + const bool right_key = (simdjson_strcasecmp(get_string(), key) == 0); + move_to_value(); + if (right_key) { + return true; + } + } while (next()); + up(); + } + return false; +} + +bool dom::parser::Iterator::move_to_key(const char *key, + uint32_t length) { + if (down()) { + do { + bool right_key = ((get_string_length() == length) && + (memcmp(get_string(), key, length) == 0)); + move_to_value(); + if (right_key) { + return true; + } + } while (next()); + up(); + } + return false; +} + +bool dom::parser::Iterator::move_to_index(uint32_t index) { + if (down()) { + uint32_t i = 0; + for (; i < index; i++) { + if (!next()) { + break; + } + } + if (i == index) { + return true; + } + up(); + } + return false; +} + +bool dom::parser::Iterator::prev() { + size_t target_location = location; + to_start_scope(); + size_t npos = location; + if (target_location == npos) { + return false; // we were already at the start + } + size_t oldnpos; + // we have that npos < target_location here + do { + oldnpos = npos; + if ((current_type == '[') || (current_type == '{')) { + // we need to jump + npos = uint32_t(current_val); + } else { + npos = npos + ((current_type == 'd' || current_type == 'l') ? 2 : 1); + } + } while (npos < target_location); + location = oldnpos; + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); + return true; +} + +bool dom::parser::Iterator::up() { + if (depth == 1) { + return false; // don't allow moving back to root + } + to_start_scope(); + // next we just move to the previous value + depth--; + location -= 1; + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); + return true; +} + +bool dom::parser::Iterator::down() { + if (location + 1 >= tape_length) { + return false; + } + if ((current_type == '[') || (current_type == '{')) { + size_t npos = uint32_t(current_val); + if (npos == location + 2) { + return false; // we have an empty scope + } + depth++; + assert(depth < max_depth); + location = location + 1; + depth_index[depth].start_of_scope = location; + depth_index[depth].scope_type = current_type; + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); + return true; + } + return false; +} + +void dom::parser::Iterator::to_start_scope() { + location = depth_index[depth].start_of_scope; + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); +} + +bool dom::parser::Iterator::next() { + size_t npos; + if ((current_type == '[') || (current_type == '{')) { + // we need to jump + npos = uint32_t(current_val); + } else { + npos = location + (is_number() ? 
2 : 1); + } + uint64_t next_val = doc.tape[npos]; + uint8_t next_type = uint8_t(next_val >> 56); + if ((next_type == ']') || (next_type == '}')) { + return false; // we reached the end of the scope + } + location = npos; + current_val = next_val; + current_type = next_type; + return true; +} +dom::parser::Iterator::Iterator(const dom::parser &pj) noexcept(false) + : doc(pj.doc) +{ +#if SIMDJSON_EXCEPTIONS + if (!pj.valid) { throw simdjson_error(pj.error); } +#else + if (!pj.valid) { return; } // abort() usage is forbidden in the library +#endif + + max_depth = pj.max_depth(); + depth_index = new scopeindex_t[max_depth + 1]; + depth_index[0].start_of_scope = location; + current_val = doc.tape[location++]; + current_type = uint8_t(current_val >> 56); + depth_index[0].scope_type = current_type; + tape_length = size_t(current_val & internal::JSON_VALUE_MASK); + if (location < tape_length) { + // If we make it here, then depth_capacity must >=2, but the compiler + // may not know this. + current_val = doc.tape[location]; + current_type = uint8_t(current_val >> 56); + depth++; + assert(depth < max_depth); + depth_index[depth].start_of_scope = location; + depth_index[depth].scope_type = current_type; + } +} +dom::parser::Iterator::Iterator( + const dom::parser::Iterator &o) noexcept + : doc(o.doc), + max_depth(o.depth), + depth(o.depth), + location(o.location), + tape_length(o.tape_length), + current_type(o.current_type), + current_val(o.current_val) +{ + depth_index = new scopeindex_t[max_depth+1]; + std::memcpy(depth_index, o.depth_index, (depth + 1) * sizeof(depth_index[0])); +} + +dom::parser::Iterator::~Iterator() noexcept { + if (depth_index) { delete[] depth_index; } +} + +bool dom::parser::Iterator::print(std::ostream &os, bool escape_strings) const { + if (!is_ok()) { + return false; + } + switch (current_type) { + case '"': // we have a string + os << '"'; + if (escape_strings) { + os << internal::escape_json_string(std::string_view(get_string(), get_string_length())); + } else { + // was: os << get_string();, but given that we can include null chars, we + // have to do something crazier: + std::copy(get_string(), get_string() + get_string_length(), std::ostream_iterator(os)); + } + os << '"'; + break; + case 'l': // we have a long int + os << get_integer(); + break; + case 'u': + os << get_unsigned_integer(); + break; + case 'd': + os << get_double(); + break; + case 'n': // we have a null + os << "null"; + break; + case 't': // we have a true + os << "true"; + break; + case 'f': // we have a false + os << "false"; + break; + case '{': // we have an object + case '}': // we end an object + case '[': // we start an array + case ']': // we end an array + os << char(current_type); + break; + default: + return false; + } + return true; +} + +bool dom::parser::Iterator::move_to(const char *pointer, + uint32_t length) { + char *new_pointer = nullptr; + if (pointer[0] == '#') { + // Converting fragment representation to string representation + new_pointer = new char[length]; + uint32_t new_length = 0; + for (uint32_t i = 1; i < length; i++) { + if (pointer[i] == '%' && pointer[i + 1] == 'x') { +#if __cpp_exceptions + try { +#endif + int fragment = + std::stoi(std::string(&pointer[i + 2], 2), nullptr, 16); + if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) { + // escaping the character + new_pointer[new_length] = '\\'; + new_length++; + } + new_pointer[new_length] = char(fragment); + i += 3; +#if __cpp_exceptions + } catch (std::invalid_argument &) { + delete[] new_pointer; + 
return false; // the fragment is invalid + } +#endif + } else { + new_pointer[new_length] = pointer[i]; + } + new_length++; + } + length = new_length; + pointer = new_pointer; + } + + // saving the current state + size_t depth_s = depth; + size_t location_s = location; + uint8_t current_type_s = current_type; + uint64_t current_val_s = current_val; + + rewind(); // The json pointer is used from the root of the document. + + bool found = relative_move_to(pointer, length); + delete[] new_pointer; + + if (!found) { + // since the pointer has found nothing, we get back to the original + // position. + depth = depth_s; + location = location_s; + current_type = current_type_s; + current_val = current_val_s; + } + + return found; +} + +bool dom::parser::Iterator::relative_move_to(const char *pointer, + uint32_t length) { + if (length == 0) { + // returns the whole document + return true; + } + + if (pointer[0] != '/') { + // '/' must be the first character + return false; + } + + // finding the key in an object or the index in an array + std::string key_or_index; + uint32_t offset = 1; + + // checking for the "-" case + if (is_array() && pointer[1] == '-') { + if (length != 2) { + // the pointer must be exactly "/-" + // there can't be anything more after '-' as an index + return false; + } + key_or_index = '-'; + offset = length; // will skip the loop coming right after + } + + // We either transform the first reference token to a valid json key + // or we make sure it is a valid index in an array. + for (; offset < length; offset++) { + if (pointer[offset] == '/') { + // beginning of the next key or index + break; + } + if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) { + // the index of an array must be an integer + // we also make sure std::stoi won't discard whitespaces later + return false; + } + if (pointer[offset] == '~') { + // "~1" represents "/" + if (pointer[offset + 1] == '1') { + key_or_index += '/'; + offset++; + continue; + } + // "~0" represents "~" + if (pointer[offset + 1] == '0') { + key_or_index += '~'; + offset++; + continue; + } + } + if (pointer[offset] == '\\') { + if (pointer[offset + 1] == '\\' || pointer[offset + 1] == '"' || + (pointer[offset + 1] <= 0x1F)) { + key_or_index += pointer[offset + 1]; + offset++; + continue; + } + return false; // invalid escaped character + } + if (pointer[offset] == '\"') { + // unescaped quote character. this is an invalid case. + // lets do nothing and assume most pointers will be valid. + // it won't find any corresponding json key anyway. + // return false; + } + key_or_index += pointer[offset]; + } + + bool found = false; + if (is_object()) { + if (move_to_key(key_or_index.c_str(), uint32_t(key_or_index.length()))) { + found = relative_move_to(pointer + offset, length - offset); + } + } else if (is_array()) { + if (key_or_index == "-") { // handling "-" case first + if (down()) { + while (next()) + ; // moving to the end of the array + // moving to the nonexistent value right after... + size_t npos; + if ((current_type == '[') || (current_type == '{')) { + // we need to jump + npos = uint32_t(current_val); + } else { + npos = + location + ((current_type == 'd' || current_type == 'l') ? 2 : 1); + } + location = npos; + current_val = doc.tape[npos]; + current_type = uint8_t(current_val >> 56); + return true; // how could it fail ? 
+ } + } else { // regular numeric index + // The index can't have a leading '0' + if (key_or_index[0] == '0' && key_or_index.length() > 1) { + return false; + } + // it cannot be empty + if (key_or_index.length() == 0) { + return false; + } + // we already checked the index contains only valid digits + uint32_t index = std::stoi(key_or_index); + if (move_to_index(index)) { + found = relative_move_to(pointer + offset, length - offset); + } + } + } + + return found; +} + +SIMDJSON_POP_DISABLE_WARNINGS +} // namespace simdjson + +#endif // SIMDJSON_DISABLE_DEPRECATED_API + + +#endif // SIMDJSON_INLINE_PARSEDJSON_ITERATOR_H +/* end file include/simdjson/dom/parsedjson_iterator-inl.h */ +/* begin file include/simdjson/dom/parser-inl.h */ +#ifndef SIMDJSON_INLINE_PARSER_H +#define SIMDJSON_INLINE_PARSER_H + +#include +#include + +namespace simdjson { +namespace dom { + +// +// parser inline implementation +// +simdjson_inline parser::parser(size_t max_capacity) noexcept + : _max_capacity{max_capacity}, + loaded_bytes(nullptr) { +} +simdjson_inline parser::parser(parser &&other) noexcept = default; +simdjson_inline parser &parser::operator=(parser &&other) noexcept = default; + +inline bool parser::is_valid() const noexcept { return valid; } +inline int parser::get_error_code() const noexcept { return error; } +inline std::string parser::get_error_message() const noexcept { return error_message(error); } + +inline bool parser::dump_raw_tape(std::ostream &os) const noexcept { + return valid ? doc.dump_raw_tape(os) : false; +} + +inline simdjson_result parser::read_file(const std::string &path) noexcept { + // Open the file + SIMDJSON_PUSH_DISABLE_WARNINGS + SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + std::FILE *fp = std::fopen(path.c_str(), "rb"); + SIMDJSON_POP_DISABLE_WARNINGS + + if (fp == nullptr) { + return IO_ERROR; + } + + // Get the file size + int ret; +#if defined(SIMDJSON_VISUAL_STUDIO) && !SIMDJSON_IS_32BITS + ret = _fseeki64(fp, 0, SEEK_END); +#else + ret = std::fseek(fp, 0, SEEK_END); +#endif // _WIN64 + if(ret < 0) { + std::fclose(fp); + return IO_ERROR; + } +#if defined(SIMDJSON_VISUAL_STUDIO) && !SIMDJSON_IS_32BITS + __int64 len = _ftelli64(fp); + if(len == -1L) { + std::fclose(fp); + return IO_ERROR; + } +#else + long len = std::ftell(fp); + if((len < 0) || (len == LONG_MAX)) { + std::fclose(fp); + return IO_ERROR; + } +#endif + + // Make sure we have enough capacity to load the file + if (_loaded_bytes_capacity < size_t(len)) { + loaded_bytes.reset( internal::allocate_padded_buffer(len) ); + if (!loaded_bytes) { + std::fclose(fp); + return MEMALLOC; + } + _loaded_bytes_capacity = len; + } + + // Read the string + std::rewind(fp); + size_t bytes_read = std::fread(loaded_bytes.get(), 1, len, fp); + if (std::fclose(fp) != 0 || bytes_read != size_t(len)) { + return IO_ERROR; + } + + return bytes_read; +} + +inline simdjson_result parser::load(const std::string &path) & noexcept { + size_t len; + auto _error = read_file(path).get(len); + if (_error) { return _error; } + return parse(loaded_bytes.get(), len, false); +} + +inline simdjson_result parser::load_many(const std::string &path, size_t batch_size) noexcept { + size_t len; + auto _error = read_file(path).get(len); + if (_error) { return _error; } + if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + return document_stream(*this, reinterpret_cast(loaded_bytes.get()), len, batch_size); +} + +inline simdjson_result 
parser::parse_into_document(document& provided_doc, const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept { + // Important: we need to ensure that document has enough capacity. + // Important: It is possible that provided_doc is actually the internal 'doc' within the parser!!! + error_code _error = ensure_capacity(provided_doc, len); + if (_error) { return _error; } + if (realloc_if_needed) { + // Make sure we have enough capacity to copy len bytes + if (!loaded_bytes || _loaded_bytes_capacity < len) { + loaded_bytes.reset( internal::allocate_padded_buffer(len) ); + if (!loaded_bytes) { + return MEMALLOC; + } + _loaded_bytes_capacity = len; + } + std::memcpy(static_cast(loaded_bytes.get()), buf, len); + } + _error = implementation->parse(realloc_if_needed ? reinterpret_cast(loaded_bytes.get()): buf, len, provided_doc); + + if (_error) { return _error; } + + return provided_doc.root(); +} + +simdjson_inline simdjson_result parser::parse_into_document(document& provided_doc, const char *buf, size_t len, bool realloc_if_needed) & noexcept { + return parse_into_document(provided_doc, reinterpret_cast(buf), len, realloc_if_needed); +} +simdjson_inline simdjson_result parser::parse_into_document(document& provided_doc, const std::string &s) & noexcept { + return parse_into_document(provided_doc, s.data(), s.length(), s.capacity() - s.length() < SIMDJSON_PADDING); +} +simdjson_inline simdjson_result parser::parse_into_document(document& provided_doc, const padded_string &s) & noexcept { + return parse_into_document(provided_doc, s.data(), s.length(), false); +} + + +inline simdjson_result parser::parse(const uint8_t *buf, size_t len, bool realloc_if_needed) & noexcept { + return parse_into_document(doc, buf, len, realloc_if_needed); +} + +simdjson_inline simdjson_result parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept { + return parse(reinterpret_cast(buf), len, realloc_if_needed); +} +simdjson_inline simdjson_result parser::parse(const std::string &s) & noexcept { + return parse(s.data(), s.length(), s.capacity() - s.length() < SIMDJSON_PADDING); +} +simdjson_inline simdjson_result parser::parse(const padded_string &s) & noexcept { + return parse(s.data(), s.length(), false); +} + +inline simdjson_result parser::parse_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept { + if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + return document_stream(*this, buf, len, batch_size); +} +inline simdjson_result parser::parse_many(const char *buf, size_t len, size_t batch_size) noexcept { + return parse_many(reinterpret_cast(buf), len, batch_size); +} +inline simdjson_result parser::parse_many(const std::string &s, size_t batch_size) noexcept { + return parse_many(s.data(), s.length(), batch_size); +} +inline simdjson_result parser::parse_many(const padded_string &s, size_t batch_size) noexcept { + return parse_many(s.data(), s.length(), batch_size); +} + +simdjson_inline size_t parser::capacity() const noexcept { + return implementation ? implementation->capacity() : 0; +} +simdjson_inline size_t parser::max_capacity() const noexcept { + return _max_capacity; +} +simdjson_inline size_t parser::max_depth() const noexcept { + return implementation ? 
implementation->max_depth() : DEFAULT_MAX_DEPTH; +} + +simdjson_warn_unused +inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept { + // + // Reallocate implementation if needed + // + error_code err; + if (implementation) { + err = implementation->allocate(capacity, max_depth); + } else { + err = simdjson::get_active_implementation()->create_dom_parser_implementation(capacity, max_depth, implementation); + } + if (err) { return err; } + return SUCCESS; +} + +#ifndef SIMDJSON_DISABLE_DEPRECATED_API +simdjson_warn_unused +inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcept { + return !allocate(capacity, max_depth); +} +#endif // SIMDJSON_DISABLE_DEPRECATED_API + +inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept { + return ensure_capacity(doc, desired_capacity); +} + + +inline error_code parser::ensure_capacity(document& target_document, size_t desired_capacity) noexcept { + // 1. It is wasteful to allocate a document and a parser for documents spanning less than MINIMAL_DOCUMENT_CAPACITY bytes. + // 2. If we allow desired_capacity = 0 then it is possible to exit this function with implementation == nullptr. + if(desired_capacity < MINIMAL_DOCUMENT_CAPACITY) { desired_capacity = MINIMAL_DOCUMENT_CAPACITY; } + // If we don't have enough capacity, (try to) automatically bump it. + // If the document needs allocation, do it too. + // Both in one if statement to minimize unlikely branching. + // + // Note: we must make sure that this function is called if capacity() == 0. We do so because we + // ensure that desired_capacity > 0. + if (simdjson_unlikely(capacity() < desired_capacity || target_document.capacity() < desired_capacity)) { + if (desired_capacity > max_capacity()) { + return error = CAPACITY; + } + error_code err1 = target_document.capacity() < desired_capacity ? target_document.allocate(desired_capacity) : SUCCESS; + error_code err2 = capacity() < desired_capacity ? allocate(desired_capacity, max_depth()) : SUCCESS; + if(err1 != SUCCESS) { return error = err1; } + if(err2 != SUCCESS) { return error = err2; } + } + return SUCCESS; +} + +simdjson_inline void parser::set_max_capacity(size_t max_capacity) noexcept { + if(max_capacity < MINIMAL_DOCUMENT_CAPACITY) { + _max_capacity = max_capacity; + } else { + _max_capacity = MINIMAL_DOCUMENT_CAPACITY; + } +} + +} // namespace dom +} // namespace simdjson + +#endif // SIMDJSON_INLINE_PARSER_H +/* end file include/simdjson/dom/parser-inl.h */ +/* begin file include/simdjson/internal/tape_ref-inl.h */ +#ifndef SIMDJSON_INLINE_TAPE_REF_H +#define SIMDJSON_INLINE_TAPE_REF_H + +#include + +namespace simdjson { +namespace internal { + +// +// tape_ref inline implementation +// +simdjson_inline tape_ref::tape_ref() noexcept : doc{nullptr}, json_index{0} {} +simdjson_inline tape_ref::tape_ref(const dom::document *_doc, size_t _json_index) noexcept : doc{_doc}, json_index{_json_index} {} + + +simdjson_inline bool tape_ref::is_document_root() const noexcept { + return json_index == 1; // should we ever change the structure of the tape, this should get updated. +} +simdjson_inline bool tape_ref::usable() const noexcept { + return doc != nullptr; // when the document pointer is null, this tape_ref is uninitialized (should not be accessed). +} +// Some value types have a specific on-tape word value. It can be faster +// to check the type by doing a word-to-word comparison instead of extracting the +// most significant 8 bits. 
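// Illustrative sketch (standalone, not part of the library code in this diff): the
// tape stores one 64-bit word per element, with an 8-bit type tag in the most
// significant byte and a 56-bit payload below it. Types that carry no payload can
// therefore be recognized with a single word-to-word comparison, which is what the
// is_double()/is_int64()/... helpers below do. The tag values and mask here are
// chosen for the sketch.
#include <cstdint>
#include <cassert>

namespace tape_sketch {
enum class tag : uint8_t { int64_value = 'l', double_value = 'd', true_value = 't' };

constexpr uint64_t payload_mask = 0x00FFFFFFFFFFFFFFULL; // low 56 bits

constexpr uint64_t make_word(tag t, uint64_t payload) {
  return (uint64_t(t) << 56) | (payload & payload_mask);
}
} // namespace tape_sketch

int main() {
  using namespace tape_sketch;
  // Numbers keep their actual value in the following tape word, so the payload is 0
  // and an "is this an int64?" check is a single 64-bit equality test.
  uint64_t w = make_word(tag::int64_value, 0);
  assert(w == (uint64_t(tag::int64_value) << 56));
  assert((w >> 56) == uint64_t(tag::int64_value)); // extracting the tag the general way
  return 0;
}
// --- end of sketch ---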
+ +simdjson_inline bool tape_ref::is_double() const noexcept { + constexpr uint64_t tape_double = uint64_t(tape_type::DOUBLE)<<56; + return doc->tape[json_index] == tape_double; +} +simdjson_inline bool tape_ref::is_int64() const noexcept { + constexpr uint64_t tape_int64 = uint64_t(tape_type::INT64)<<56; + return doc->tape[json_index] == tape_int64; +} +simdjson_inline bool tape_ref::is_uint64() const noexcept { + constexpr uint64_t tape_uint64 = uint64_t(tape_type::UINT64)<<56; + return doc->tape[json_index] == tape_uint64; +} +simdjson_inline bool tape_ref::is_false() const noexcept { + constexpr uint64_t tape_false = uint64_t(tape_type::FALSE_VALUE)<<56; + return doc->tape[json_index] == tape_false; +} +simdjson_inline bool tape_ref::is_true() const noexcept { + constexpr uint64_t tape_true = uint64_t(tape_type::TRUE_VALUE)<<56; + return doc->tape[json_index] == tape_true; +} +simdjson_inline bool tape_ref::is_null_on_tape() const noexcept { + constexpr uint64_t tape_null = uint64_t(tape_type::NULL_VALUE)<<56; + return doc->tape[json_index] == tape_null; +} + +inline size_t tape_ref::after_element() const noexcept { + switch (tape_ref_type()) { + case tape_type::START_ARRAY: + case tape_type::START_OBJECT: + return matching_brace_index(); + case tape_type::UINT64: + case tape_type::INT64: + case tape_type::DOUBLE: + return json_index + 2; + default: + return json_index + 1; + } +} +simdjson_inline tape_type tape_ref::tape_ref_type() const noexcept { + return static_cast(doc->tape[json_index] >> 56); +} +simdjson_inline uint64_t internal::tape_ref::tape_value() const noexcept { + return doc->tape[json_index] & internal::JSON_VALUE_MASK; +} +simdjson_inline uint32_t internal::tape_ref::matching_brace_index() const noexcept { + return uint32_t(doc->tape[json_index]); +} +simdjson_inline uint32_t internal::tape_ref::scope_count() const noexcept { + return uint32_t((doc->tape[json_index] >> 32) & internal::JSON_COUNT_MASK); +} + +template +simdjson_inline T tape_ref::next_tape_value() const noexcept { + static_assert(sizeof(T) == sizeof(uint64_t), "next_tape_value() template parameter must be 64-bit"); + // Though the following is tempting... + // return *reinterpret_cast(&doc->tape[json_index + 1]); + // It is not generally safe. It is safer, and often faster to rely + // on memcpy. Yes, it is uglier, but it is also encapsulated. 
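// Illustrative sketch (standalone, not part of the library code): the memcpy idiom
// used just below is the portable way to reinterpret the 64-bit tape word as a
// double (or int64_t) without strict-aliasing undefined behavior; compilers
// typically lower the copy to a single register move. C++20 offers std::bit_cast
// for the same purpose, but this code targets older standards.
#include <cstdint>
#include <cstring>
#include <cassert>

static double bits_to_double(uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof(d)); // copy the bytes instead of casting pointers
  return d;
}

int main() {
  double x = 3.5;
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  assert(bits_to_double(bits) == 3.5);
  return 0;
}
// --- end of sketch ---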
+ T x; + std::memcpy(&x,&doc->tape[json_index + 1],sizeof(uint64_t)); + return x; +} + +simdjson_inline uint32_t internal::tape_ref::get_string_length() const noexcept { + size_t string_buf_index = size_t(tape_value()); + uint32_t len; + std::memcpy(&len, &doc->string_buf[string_buf_index], sizeof(len)); + return len; +} + +simdjson_inline const char * internal::tape_ref::get_c_str() const noexcept { + size_t string_buf_index = size_t(tape_value()); + return reinterpret_cast(&doc->string_buf[string_buf_index + sizeof(uint32_t)]); +} + +inline std::string_view internal::tape_ref::get_string_view() const noexcept { + return std::string_view( + get_c_str(), + get_string_length() + ); +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INLINE_TAPE_REF_H +/* end file include/simdjson/internal/tape_ref-inl.h */ +/* begin file include/simdjson/dom/serialization-inl.h */ + +#ifndef SIMDJSON_SERIALIZATION_INL_H +#define SIMDJSON_SERIALIZATION_INL_H + + +#include +#include + +namespace simdjson { +namespace dom { +inline bool parser::print_json(std::ostream &os) const noexcept { + if (!valid) { return false; } + simdjson::internal::string_builder<> sb; + sb.append(doc.root()); + std::string_view answer = sb.str(); + os << answer; + return true; +} +} +/*** + * Number utility functions + **/ + + +namespace { +/**@private + * Escape sequence like \b or \u0001 + * We expect that most compilers will use 8 bytes for this data structure. + **/ +struct escape_sequence { + uint8_t length; + const char string[7]; // technically, we only ever need 6 characters, we pad to 8 +}; +/**@private + * This converts a signed integer into a character sequence. + * The caller is responsible for providing enough memory (at least + * 20 characters.) + * Though various runtime libraries provide itoa functions, + * it is not part of the C++ standard. The C++17 standard + * adds the to_chars functions which would do as well, but + * we want to support C++11. + */ +char *fast_itoa(char *output, int64_t value) noexcept { + // This is a standard implementation of itoa. + char buffer[20]; + uint64_t value_positive; + // In general, negating a signed integer is unsafe. + if(value < 0) { + *output++ = '-'; + // Doing value_positive = -value; while avoiding + // undefined behavior warnings. + // It assumes two complement's which is universal at this + // point in time. + std::memcpy(&value_positive, &value, sizeof(value)); + value_positive = (~value_positive) + 1; // this is a negation + } else { + value_positive = value; + } + // We work solely with value_positive. It *might* be easier + // for an optimizing compiler to deal with an unsigned variable + // as far as performance goes. + const char *const end_buffer = buffer + 20; + char *write_pointer = buffer + 19; + // A faster approach is possible if we expect large integers: + // unroll the loop (work in 100s, 1000s) and use some kind of + // memoization. + while(value_positive >= 10) { + *write_pointer-- = char('0' + (value_positive % 10)); + value_positive /= 10; + } + *write_pointer = char('0' + value_positive); + size_t len = end_buffer - write_pointer; + std::memcpy(output, write_pointer, len); + return output + len; +} +/**@private + * This converts an unsigned integer into a character sequence. + * The caller is responsible for providing enough memory (at least + * 19 characters.) + * Though various runtime libraries provide itoa functions, + * it is not part of the C++ standard. 
The C++17 standard + * adds the to_chars functions which would do as well, but + * we want to support C++11. + */ +char *fast_itoa(char *output, uint64_t value) noexcept { + // This is a standard implementation of itoa. + char buffer[20]; + const char *const end_buffer = buffer + 20; + char *write_pointer = buffer + 19; + // A faster approach is possible if we expect large integers: + // unroll the loop (work in 100s, 1000s) and use some kind of + // memoization. + while(value >= 10) { + *write_pointer-- = char('0' + (value % 10)); + value /= 10; + }; + *write_pointer = char('0' + value); + size_t len = end_buffer - write_pointer; + std::memcpy(output, write_pointer, len); + return output + len; +} +} // anonymous namespace +namespace internal { + +/*** + * Minifier/formatter code. + **/ + +simdjson_inline void mini_formatter::number(uint64_t x) { + char number_buffer[24]; + char *newp = fast_itoa(number_buffer, x); + buffer.insert(buffer.end(), number_buffer, newp); +} + +simdjson_inline void mini_formatter::number(int64_t x) { + char number_buffer[24]; + char *newp = fast_itoa(number_buffer, x); + buffer.insert(buffer.end(), number_buffer, newp); +} + +simdjson_inline void mini_formatter::number(double x) { + char number_buffer[24]; + // Currently, passing the nullptr to the second argument is + // safe because our implementation does not check the second + // argument. + char *newp = internal::to_chars(number_buffer, nullptr, x); + buffer.insert(buffer.end(), number_buffer, newp); +} + +simdjson_inline void mini_formatter::start_array() { one_char('['); } +simdjson_inline void mini_formatter::end_array() { one_char(']'); } +simdjson_inline void mini_formatter::start_object() { one_char('{'); } +simdjson_inline void mini_formatter::end_object() { one_char('}'); } +simdjson_inline void mini_formatter::comma() { one_char(','); } + + +simdjson_inline void mini_formatter::true_atom() { + const char * s = "true"; + buffer.insert(buffer.end(), s, s + 4); +} +simdjson_inline void mini_formatter::false_atom() { + const char * s = "false"; + buffer.insert(buffer.end(), s, s + 5); +} +simdjson_inline void mini_formatter::null_atom() { + const char * s = "null"; + buffer.insert(buffer.end(), s, s + 4); +} +simdjson_inline void mini_formatter::one_char(char c) { buffer.push_back(c); } +simdjson_inline void mini_formatter::key(std::string_view unescaped) { + string(unescaped); + one_char(':'); +} +simdjson_inline void mini_formatter::string(std::string_view unescaped) { + one_char('\"'); + size_t i = 0; + // Fast path for the case where we have no control character, no ", and no backslash. + // This should include most keys. + // + // We would like to use 'bool' but some compilers take offense to bitwise operation + // with bool types. 
+ constexpr static char needs_escaping[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for(;i + 8 <= unescaped.length(); i += 8) { + // Poor's man vectorization. This could get much faster if we used SIMD. + // + // It is not the case that replacing '|' with '||' would be neutral performance-wise. + if(needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i+1])] + | needs_escaping[uint8_t(unescaped[i+2])] | needs_escaping[uint8_t(unescaped[i+3])] + | needs_escaping[uint8_t(unescaped[i+4])] | needs_escaping[uint8_t(unescaped[i+5])] + | needs_escaping[uint8_t(unescaped[i+6])] | needs_escaping[uint8_t(unescaped[i+7])] + ) { break; } + } + for(;i < unescaped.length(); i++) { + if(needs_escaping[uint8_t(unescaped[i])]) { break; } + } + // The following is also possible and omits a 256-byte table, but it is slower: + // for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F) + // && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {} + + // At least for long strings, the following should be fast. We could + // do better by integrating the checks and the insertion. + buffer.insert(buffer.end(), unescaped.data(), unescaped.data() + i); + // We caught a control character if we enter this loop (slow). + // Note that we are do not restart from the beginning, but rather we continue + // from the point where we encountered something that requires escaping. + for (; i < unescaped.length(); i++) { + switch (unescaped[i]) { + case '\"': + { + const char * s = "\\\""; + buffer.insert(buffer.end(), s, s + 2); + } + break; + case '\\': + { + const char * s = "\\\\"; + buffer.insert(buffer.end(), s, s + 2); + } + break; + default: + if (uint8_t(unescaped[i]) <= 0x1F) { + // If packed, this uses 8 * 32 bytes. + // Note that we expect most compilers to embed this code in the data + // section. 
+ constexpr static escape_sequence escaped[32] = { + {6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"}, + {6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"}, {6, "\\u0007"}, + {2, "\\b"}, {2, "\\t"}, {2, "\\n"}, {6, "\\u000b"}, + {2, "\\f"}, {2, "\\r"}, {6, "\\u000e"}, {6, "\\u000f"}, + {6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"}, + {6, "\\u0014"}, {6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"}, + {6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"}, + {6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}}; + auto u = escaped[uint8_t(unescaped[i])]; + buffer.insert(buffer.end(), u.string, u.string + u.length); + } else { + one_char(unescaped[i]); + } + } // switch + } // for + one_char('\"'); +} + +inline void mini_formatter::clear() { + buffer.clear(); +} + +simdjson_inline std::string_view mini_formatter::str() const { + return std::string_view(buffer.data(), buffer.size()); +} + + +/*** + * String building code. + **/ + +template +inline void string_builder::append(simdjson::dom::element value) { + // using tape_type = simdjson::internal::tape_type; + size_t depth = 0; + constexpr size_t MAX_DEPTH = 16; + bool is_object[MAX_DEPTH]; + is_object[0] = false; + bool after_value = false; + + internal::tape_ref iter(value.tape); + do { + // print commas after each value + if (after_value) { + format.comma(); + } + // If we are in an object, print the next key and :, and skip to the next + // value. + if (is_object[depth]) { + format.key(iter.get_string_view()); + iter.json_index++; + } + switch (iter.tape_ref_type()) { + + // Arrays + case tape_type::START_ARRAY: { + // If we're too deep, we need to recurse to go deeper. + depth++; + if (simdjson_unlikely(depth >= MAX_DEPTH)) { + append(simdjson::dom::array(iter)); + iter.json_index = iter.matching_brace_index() - 1; // Jump to the ] + depth--; + break; + } + + // Output start [ + format.start_array(); + iter.json_index++; + + // Handle empty [] (we don't want to come back around and print commas) + if (iter.tape_ref_type() == tape_type::END_ARRAY) { + format.end_array(); + depth--; + break; + } + + is_object[depth] = false; + after_value = false; + continue; + } + + // Objects + case tape_type::START_OBJECT: { + // If we're too deep, we need to recurse to go deeper. 
+ depth++; + if (simdjson_unlikely(depth >= MAX_DEPTH)) { + append(simdjson::dom::object(iter)); + iter.json_index = iter.matching_brace_index() - 1; // Jump to the } + depth--; + break; + } + + // Output start { + format.start_object(); + iter.json_index++; + + // Handle empty {} (we don't want to come back around and print commas) + if (iter.tape_ref_type() == tape_type::END_OBJECT) { + format.end_object(); + depth--; + break; + } + + is_object[depth] = true; + after_value = false; + continue; + } + + // Scalars + case tape_type::STRING: + format.string(iter.get_string_view()); + break; + case tape_type::INT64: + format.number(iter.next_tape_value()); + iter.json_index++; // numbers take up 2 spots, so we need to increment + // extra + break; + case tape_type::UINT64: + format.number(iter.next_tape_value()); + iter.json_index++; // numbers take up 2 spots, so we need to increment + // extra + break; + case tape_type::DOUBLE: + format.number(iter.next_tape_value()); + iter.json_index++; // numbers take up 2 spots, so we need to increment + // extra + break; + case tape_type::TRUE_VALUE: + format.true_atom(); + break; + case tape_type::FALSE_VALUE: + format.false_atom(); + break; + case tape_type::NULL_VALUE: + format.null_atom(); + break; + + // These are impossible + case tape_type::END_ARRAY: + case tape_type::END_OBJECT: + case tape_type::ROOT: + SIMDJSON_UNREACHABLE(); + } + iter.json_index++; + after_value = true; + + // Handle multiple ends in a row + while (depth != 0 && (iter.tape_ref_type() == tape_type::END_ARRAY || + iter.tape_ref_type() == tape_type::END_OBJECT)) { + if (iter.tape_ref_type() == tape_type::END_ARRAY) { + format.end_array(); + } else { + format.end_object(); + } + depth--; + iter.json_index++; + } + + // Stop when we're at depth 0 + } while (depth != 0); +} + +template +inline void string_builder::append(simdjson::dom::object value) { + format.start_object(); + auto pair = value.begin(); + auto end = value.end(); + if (pair != end) { + append(*pair); + for (++pair; pair != end; ++pair) { + format.comma(); + append(*pair); + } + } + format.end_object(); +} + +template +inline void string_builder::append(simdjson::dom::array value) { + format.start_array(); + auto iter = value.begin(); + auto end = value.end(); + if (iter != end) { + append(*iter); + for (++iter; iter != end; ++iter) { + format.comma(); + append(*iter); + } + } + format.end_array(); +} + +template +simdjson_inline void string_builder::append(simdjson::dom::key_value_pair kv) { + format.key(kv.key); + append(kv.value); +} + +template +simdjson_inline void string_builder::clear() { + format.clear(); +} + +template +simdjson_inline std::string_view string_builder::str() const { + return format.str(); +} + + +} // namespace internal +} // namespace simdjson + +#endif +/* end file include/simdjson/dom/serialization-inl.h */ + +SIMDJSON_POP_DISABLE_WARNINGS + +#endif // SIMDJSON_DOM_H +/* end file include/simdjson/dom.h */ +/* begin file include/simdjson/builtin.h */ +#ifndef SIMDJSON_BUILTIN_H +#define SIMDJSON_BUILTIN_H + +/* begin file include/simdjson/implementations.h */ +#ifndef SIMDJSON_IMPLEMENTATIONS_H +#define SIMDJSON_IMPLEMENTATIONS_H + +/* begin file include/simdjson/implementation-base.h */ +#ifndef SIMDJSON_IMPLEMENTATION_BASE_H +#define SIMDJSON_IMPLEMENTATION_BASE_H + +/** + * @file + * + * Includes common stuff needed for implementations. 
+ */ + + +// Implementation-internal files (must be included before the implementations themselves, to keep +// amalgamation working--otherwise, the first time a file is included, it might be put inside the +// #ifdef SIMDJSON_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't +// compile unless that implementation is turned on). +/* begin file include/simdjson/internal/jsoncharutils_tables.h */ +#ifndef SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H +#define SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H + + +#ifdef JSON_TEST_STRINGS +void found_string(const uint8_t *buf, const uint8_t *parsed_begin, + const uint8_t *parsed_end); +void found_bad_string(const uint8_t *buf); +#endif + +namespace simdjson { +namespace internal { +// structural chars here are +// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL) +// we are also interested in the four whitespace characters +// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d + +extern SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256]; +extern SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256]; +extern SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886]; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H +/* end file include/simdjson/internal/jsoncharutils_tables.h */ +/* begin file include/simdjson/internal/numberparsing_tables.h */ +#ifndef SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H +#define SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H + + +namespace simdjson { +namespace internal { +/** + * The smallest non-zero float (binary64) is 2^-1074. + * We take as input numbers of the form w x 10^q where w < 2^64. + * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076. + * However, we have that + * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074. + * Thus it is possible for a number of the form w * 10^-342 where + * w is a 64-bit value to be a non-zero floating-point number. + ********* + * Any number of form w * 10^309 where w>= 1 is going to be + * infinite in binary64 so we never need to worry about powers + * of 5 greater than 308. + */ +constexpr int smallest_power = -342; +constexpr int largest_power = 308; + +/** + * Represents a 128-bit value. + * low: least significant 64 bits. + * high: most significant 64 bits. + */ +struct value128 { + uint64_t low; + uint64_t high; +}; + + +// Precomputed powers of ten from 10^0 to 10^22. These +// can be represented exactly using the double type. +extern SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[]; + + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + + +// The truncated powers of five from 5^-342 all the way to 5^308 +// The mantissa is truncated to 128 bits, and +// never rounded up. Uses about 10KB. 
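// Illustrative sketch (standalone, not part of the library code): the exact double
// powers of ten declared above enable the classic fast path for w * 10^q. When w
// fits in 53 bits and |q| <= 22, both w and 10^|q| are exactly representable, so a
// single multiply or divide yields the correctly rounded result; anything larger
// falls back to the 128-bit truncated powers of five declared below.
#include <cstdint>
#include <cassert>

static bool fast_path_to_double(uint64_t w, int q, double *out) {
  static const double pow10[] = {
      1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,  1e10, 1e11,
      1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
  if (w > (uint64_t(1) << 53) || q < -22 || q > 22) { return false; } // slow path needed
  double dw = double(w); // exact, since w <= 2^53
  *out = (q >= 0) ? dw * pow10[q] : dw / pow10[-q];
  return true;
}

int main() {
  double d;
  assert(fast_path_to_double(123456, -3, &d) && d == 123.456);
  assert(!fast_path_to_double(1, 300, &d)); // needs the power-of-five tables
  return 0;
}
// --- end of sketch ---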
+extern SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[]; +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H +/* end file include/simdjson/internal/numberparsing_tables.h */ +/* begin file include/simdjson/internal/simdprune_tables.h */ +#ifndef SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H +#define SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H + +#include + +namespace simdjson { // table modified and copied from +namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable + +extern SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256]; + +extern SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272]; + +// 256 * 8 bytes = 2kB, easily fits in cache. +extern SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256]; + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H +/* end file include/simdjson/internal/simdprune_tables.h */ + +#endif // SIMDJSON_IMPLEMENTATION_BASE_H +/* end file include/simdjson/implementation-base.h */ + +// +// First, figure out which implementations can be run. Doing it here makes it so we don't have to worry about the order +// in which we include them. +// + +#ifndef SIMDJSON_IMPLEMENTATION_ARM64 +#define SIMDJSON_IMPLEMENTATION_ARM64 (SIMDJSON_IS_ARM64) +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_ARM64 SIMDJSON_IMPLEMENTATION_ARM64 && SIMDJSON_IS_ARM64 + +#ifdef __has_include +// How do we detect that a compiler supports vbmi2? +// For sure if the following header is found, we are ok? +#if __has_include() +#define SIMDJSON_COMPILER_SUPPORTS_VBMI2 1 +#endif +#endif + +#ifdef _MSC_VER +#if _MSC_VER >= 1920 +// Visual Studio 2019 and up support VBMI2 under x64 even if the header +// avx512vbmi2intrin.h is not found. +#define SIMDJSON_COMPILER_SUPPORTS_VBMI2 1 +#endif +#endif + +// By default, we allow AVX512. +#ifndef SIMDJSON_AVX512_ALLOWED +#define SIMDJSON_AVX512_ALLOWED 1 +#endif + +// Default Icelake to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. +#ifndef SIMDJSON_IMPLEMENTATION_ICELAKE +#define SIMDJSON_IMPLEMENTATION_ICELAKE ((SIMDJSON_IS_X86_64) && (SIMDJSON_AVX512_ALLOWED) && (SIMDJSON_COMPILER_SUPPORTS_VBMI2)) +#endif + +#ifdef _MSC_VER +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdjson/simdjson/issues/1247 +#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE ((SIMDJSON_IMPLEMENTATION_ICELAKE) && (__AVX2__) && (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__)) +#else +#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE ((SIMDJSON_IMPLEMENTATION_ICELAKE) && (__AVX2__) && (__BMI__) && (__PCLMUL__) && (__LZCNT__) && (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__)) +#endif + +// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. 
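// Illustrative sketch (standalone, not part of the library code): the split between
// SIMDJSON_IMPLEMENTATION_* ("the kernel is compiled in") and
// SIMDJSON_CAN_ALWAYS_RUN_* ("the build flags guarantee the CPU features, so no
// runtime check is needed") follows the same pattern above for icelake and below
// for haswell. When a kernel is compiled in but not guaranteed, the choice happens
// at runtime. A simplified version of that decision for an AVX2 (haswell-class)
// kernel on GCC/Clang might look like this:
#include <cstdio>

static bool can_use_avx2_kernel() {
#if defined(__AVX2__)
  return true;                           // target flags already guarantee AVX2
#elif defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
  return __builtin_cpu_supports("avx2"); // compiled in, but decided at runtime
#else
  return false;                          // not an x86-64 build
#endif
}

int main() {
  std::printf("avx2 kernel usable: %s\n", can_use_avx2_kernel() ? "yes" : "no");
  return 0;
}
// --- end of sketch ---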
+#ifndef SIMDJSON_IMPLEMENTATION_HASWELL +#define SIMDJSON_IMPLEMENTATION_HASWELL SIMDJSON_IS_X86_64 +#endif +#ifdef _MSC_VER +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdjson/simdjson/issues/1247 +#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL ((SIMDJSON_IMPLEMENTATION_HASWELL) && (SIMDJSON_IS_X86_64) && (__AVX2__)) +#else +#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL ((SIMDJSON_IMPLEMENTATION_HASWELL) && (SIMDJSON_IS_X86_64) && (__AVX2__) && (__BMI__) && (__PCLMUL__) && (__LZCNT__)) +#endif + +// Default Westmere to on if this is x86-64. Note that the macro SIMDJSON_REQUIRES_HASWELL appears unused. +#ifndef SIMDJSON_IMPLEMENTATION_WESTMERE +#define SIMDJSON_IMPLEMENTATION_WESTMERE (SIMDJSON_IS_X86_64 && !SIMDJSON_REQUIRES_HASWELL) +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_WESTMERE (SIMDJSON_IMPLEMENTATION_WESTMERE && SIMDJSON_IS_X86_64 && __SSE4_2__ && __PCLMUL__) + +#ifndef SIMDJSON_IMPLEMENTATION_PPC64 +#define SIMDJSON_IMPLEMENTATION_PPC64 (SIMDJSON_IS_PPC64) +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_PPC64 SIMDJSON_IMPLEMENTATION_PPC64 && SIMDJSON_IS_PPC64 + +// Default Fallback to on unless a builtin implementation has already been selected. +#ifndef SIMDJSON_IMPLEMENTATION_FALLBACK +#define SIMDJSON_IMPLEMENTATION_FALLBACK 1 // (!SIMDJSON_CAN_ALWAYS_RUN_ARM64 && !SIMDJSON_CAN_ALWAYS_RUN_HASWELL && !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE && !SIMDJSON_CAN_ALWAYS_RUN_PPC64) +#endif +#define SIMDJSON_CAN_ALWAYS_RUN_FALLBACK SIMDJSON_IMPLEMENTATION_FALLBACK + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_UNDESIRED_WARNINGS + +// Implementations +/* begin file include/simdjson/arm64.h */ +#ifndef SIMDJSON_ARM64_H +#define SIMDJSON_ARM64_H + + +#if SIMDJSON_IMPLEMENTATION_ARM64 + +namespace simdjson { +/** + * Implementation for NEON (ARMv8). + */ +namespace arm64 { +} // namespace arm64 +} // namespace simdjson + +/* begin file include/simdjson/arm64/implementation.h */ +#ifndef SIMDJSON_ARM64_IMPLEMENTATION_H +#define SIMDJSON_ARM64_IMPLEMENTATION_H + + +namespace simdjson { +namespace arm64 { + +namespace { +using namespace simdjson; +using namespace simdjson::dom; +} + +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_IMPLEMENTATION_H +/* end file include/simdjson/arm64/implementation.h */ + +/* begin file include/simdjson/arm64/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "arm64" +// #define SIMDJSON_IMPLEMENTATION arm64 +/* end file include/simdjson/arm64/begin.h */ + +// Declarations +/* begin file include/simdjson/generic/dom_parser_implementation.h */ + +namespace simdjson { +namespace arm64 { + +// expectation: sizeof(open_container) = 64/8. 
+struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace arm64 +} // namespace simdjson + +namespace simdjson { +namespace arm64 { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace arm64 +} // namespace simdjson +/* end file include/simdjson/generic/dom_parser_implementation.h */ +/* begin file include/simdjson/arm64/intrinsics.h */ +#ifndef SIMDJSON_ARM64_INTRINSICS_H +#define SIMDJSON_ARM64_INTRINSICS_H + +// This should be the correct header whether +// you use visual studio or other compilers. 
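// Illustrative sketch (standalone, not part of the library code): memory math for
// the buffers sized by set_capacity() and set_max_depth() above. Stage 1 reserves
// roughly one 32-bit structural index per input byte (worst case: every byte is a
// structural character), and stage 2 keeps one 8-byte open_container plus one bool
// per level of nesting.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static size_t stage1_index_bytes(size_t capacity) {
  size_t rounded = (capacity + 63) / 64 * 64; // SIMDJSON_ROUNDUP_N(capacity, 64)
  size_t max_structures = rounded + 2 + 7;    // mirrors the formula in set_capacity()
  return max_structures * sizeof(uint32_t);
}

static size_t stage2_stack_bytes(size_t max_depth) {
  // open_container is 8 bytes (see the static_assert above), plus one is_array flag
  return max_depth * (sizeof(uint64_t) + sizeof(bool));
}

int main() {
  std::printf("1 MiB capacity -> %zu bytes of structural indexes\n",
              stage1_index_bytes(size_t(1) << 20));
  std::printf("depth 1024     -> %zu bytes of stage-2 stack\n",
              stage2_stack_bytes(1024));
  return 0;
}
// --- end of sketch ---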
+#include + +static_assert(sizeof(uint8x16_t) <= simdjson::SIMDJSON_PADDING, "insufficient padding for arm64"); + +#endif // SIMDJSON_ARM64_INTRINSICS_H +/* end file include/simdjson/arm64/intrinsics.h */ +/* begin file include/simdjson/arm64/bitmanipulation.h */ +#ifndef SIMDJSON_ARM64_BITMANIPULATION_H +#define SIMDJSON_ARM64_BITMANIPULATION_H + +namespace simdjson { +namespace arm64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int count_ones(uint64_t input_num) { + return vaddv_u8(vcnt_u8(vcreate_u8(input_num))); +} + + +#if defined(__GNUC__) // catches clang and gcc +/** + * ARM has a fast 64-bit "bit reversal function" that is handy. However, + * it is not generally available as an intrinsic function under Visual + * Studio (though this might be changing). Even under clang/gcc, we + * apparently need to invoke inline assembly. + */ +/* + * We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that + * work well with bit reversal may use it. + */ +#define SIMDJSON_PREFER_REVERSE_BITS 1 + +/* reverse the bits */ +simdjson_inline uint64_t reverse_bits(uint64_t input_num) { + uint64_t rev_bits; + __asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num)); + return rev_bits; +} + +/** + * Flips bit at index 63 - lz. Thus if you have 'leading_zeroes' leading zeroes, + * then this will set to zero the leading bit. It is possible for leading_zeroes to be + * greating or equal to 63 in which case we trigger undefined behavior, but the output + * of such undefined behavior is never used. 
+ **/ +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) { + return rev_bits ^ (uint64_t(0x8000000000000000) >> leading_zeroes); +} + +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_BITMANIPULATION_H +/* end file include/simdjson/arm64/bitmanipulation.h */ +/* begin file include/simdjson/arm64/bitmask.h */ +#ifndef SIMDJSON_ARM64_BITMASK_H +#define SIMDJSON_ARM64_BITMASK_H + +namespace simdjson { +namespace arm64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(uint64_t bitmask) { + ///////////// + // We could do this with PMULL, but it is apparently slow. + // + //#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension + //return vmull_p64(-1ULL, bitmask); + //#else + // Analysis by @sebpop: + // When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out + // in between other vector code, so effectively the extra cycles of the sequence do not matter + // because the GPR units are idle otherwise and the critical path is on the FP side. + // Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 ) + // and FP->GPR (2 cycles on N1 and 5 cycles on A72.) + /////////// + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif +/* end file include/simdjson/arm64/bitmask.h */ +/* begin file include/simdjson/arm64/simd.h */ +#ifndef SIMDJSON_ARM64_SIMD_H +#define SIMDJSON_ARM64_SIMD_H + +#include + + +namespace simdjson { +namespace arm64 { +namespace { +namespace simd { + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +namespace { +// Start of private section with Visual Studio workaround + + +/** + * make_uint8x16_t initializes a SIMD register (uint8x16_t). + * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...} + * is not recognized under Visual Studio! This is a workaround. + * Using a std::initializer_list as a parameter resulted in + * inefficient code. With the current approach, if the parameters are + * compile-time constants, + * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}. + * You should not use this function except for compile-time constants: + * it is not efficient. + */ +simdjson_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, + uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8, + uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12, + uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) { + // Doing a load like so end ups generating worse code. 
+ // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_u8(array); + uint8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_u8(x1, x, 0); + x = vsetq_lane_u8(x2, x, 1); + x = vsetq_lane_u8(x3, x, 2); + x = vsetq_lane_u8(x4, x, 3); + x = vsetq_lane_u8(x5, x, 4); + x = vsetq_lane_u8(x6, x, 5); + x = vsetq_lane_u8(x7, x, 6); + x = vsetq_lane_u8(x8, x, 7); + x = vsetq_lane_u8(x9, x, 8); + x = vsetq_lane_u8(x10, x, 9); + x = vsetq_lane_u8(x11, x, 10); + x = vsetq_lane_u8(x12, x, 11); + x = vsetq_lane_u8(x13, x, 12); + x = vsetq_lane_u8(x14, x, 13); + x = vsetq_lane_u8(x15, x, 14); + x = vsetq_lane_u8(x16, x, 15); + return x; +} + +simdjson_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, + uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8) { + uint8x8_t x{}; + x = vset_lane_u8(x1, x, 0); + x = vset_lane_u8(x2, x, 1); + x = vset_lane_u8(x3, x, 2); + x = vset_lane_u8(x4, x, 3); + x = vset_lane_u8(x5, x, 4); + x = vset_lane_u8(x6, x, 5); + x = vset_lane_u8(x7, x, 6); + x = vset_lane_u8(x8, x, 7); + return x; +} + +// We have to do the same work for make_int8x16_t +simdjson_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4, + int8_t x5, int8_t x6, int8_t x7, int8_t x8, + int8_t x9, int8_t x10, int8_t x11, int8_t x12, + int8_t x13, int8_t x14, int8_t x15, int8_t x16) { + // Doing a load like so end ups generating worse code. + // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_s8(array); + int8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_s8(x1, x, 0); + x = vsetq_lane_s8(x2, x, 1); + x = vsetq_lane_s8(x3, x, 2); + x = vsetq_lane_s8(x4, x, 3); + x = vsetq_lane_s8(x5, x, 4); + x = vsetq_lane_s8(x6, x, 5); + x = vsetq_lane_s8(x7, x, 6); + x = vsetq_lane_s8(x8, x, 7); + x = vsetq_lane_s8(x9, x, 8); + x = vsetq_lane_s8(x10, x, 9); + x = vsetq_lane_s8(x11, x, 10); + x = vsetq_lane_s8(x12, x, 11); + x = vsetq_lane_s8(x13, x, 12); + x = vsetq_lane_s8(x14, x, 13); + x = vsetq_lane_s8(x15, x, 14); + x = vsetq_lane_s8(x16, x, 15); + return x; +} + +// End of private section with Visual Studio workaround +} // namespace +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO + + + template + struct simd8; + + // + // Base class of simd8 and simd8, both of which use uint8x16_t internally. 
+ // + template> + struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {} + simdjson_inline operator const uint8x16_t&() const { return this->value; } + simdjson_inline operator uint8x16_t&() { return this->value; } + + // Bit operations + simdjson_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } + simdjson_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } + simdjson_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } + simdjson_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdjson_inline simd8& operator|=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline simd8& operator&=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline simd8& operator^=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + friend simdjson_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return vceqq_u8(lhs, rhs); } + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + static simdjson_inline simd8 splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } + + simdjson_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // False constructor + simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + simdjson_inline simd8(bool _value) : simd8(splat(_value)) {} + + // We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). 
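// Illustrative sketch (standalone, not part of the library code): a scalar model of
// what to_bitmask() below computes. Bit i of the result is set exactly when byte
// lane i of the 16-byte comparison mask is non-zero (on ARM a "true" lane is all
// ones); the NEON version reaches the same result with a bit_mask AND followed by
// three pairwise adds.
#include <cstdint>
#include <cassert>

static uint32_t to_bitmask_scalar(const uint8_t lanes[16]) {
  uint32_t bits = 0;
  for (int i = 0; i < 16; i++) {
    bits |= uint32_t(lanes[i] != 0) << i;
  }
  return bits;
}

int main() {
  uint8_t lanes[16] = {0xFF, 0, 0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF};
  assert(to_bitmask_scalar(lanes) == ((1u << 0) | (1u << 3) | (1u << 15)));
  return 0;
}
// --- end of sketch ---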
+ simdjson_inline uint32_t to_bitmask() const { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + simdjson_inline bool any() const { return vmaxvq_u8(*this) != 0; } + }; + + // Unsigned bytes + template<> + struct simd8: base_u8 { + static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); } + static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); } + static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); } + + simdjson_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // Zero constructor + simdjson_inline simd8() : simd8(zero()) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(make_uint8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-specific operations + simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); } + simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); } + simdjson_inline simd8 max_val(const simd8 other) const { return vmaxq_u8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return 
vminq_u8(*this, other); } + simdjson_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + simdjson_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return vcltq_u8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdjson_inline simd8 gt_bits(const simd8 other) const { return simd8(*this > other); } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdjson_inline simd8 lt_bits(const simd8 other) const { return simd8(*this < other); } + + // Bit-specific operations + simdjson_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } + simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } + template + simdjson_inline simd8 shr() const { return vshrq_n_u8(*this, N); } + template + simdjson_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint16_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template + simdjson_inline void compress(uint16_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]}; + uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64); + // we increment by 0x08 the second half of the mask +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08); +#else + uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; +#endif + shufmask = vaddq_u8(shufmask, inc); + // this is the version "nearly pruned" + uint8x16_t pruned = vqtbl1q_u8(*this, shufmask); + // we still need to put the two halves together. 
+ // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + uint8x16_t compactmask = vld1q_u8(reinterpret_cast(pshufb_combine_table + pop1 * 8)); + uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); + vst1q_u8(reinterpret_cast(output), answer); + } + + // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a + // bitset) to output1, then those corresponding to a 0 in the high half to output2. + template + simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const { + using internal::thintable_epi8; + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]); + uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]); + // we increment by 0x08 the second half of the mask +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + uint8x8_t inc = make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08); +#else + uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08}; +#endif + compactmask2 = vadd_u8(compactmask2, inc); + // store each result (with the second store possibly overlapping the first) + vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1)); + vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2)); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdjson_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_u8(*this, simd8(original)); + } + }; + + // Signed bytes + template<> + struct simd8 { + int8x16_t value; + + static simdjson_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } + static simdjson_inline simd8 zero() { return vdupq_n_s8(0); } + static simdjson_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + + // Conversion from/to SIMD register + simdjson_inline simd8(const int8x16_t _value) : value{_value} {} + simdjson_inline operator const int8x16_t&() const { return this->value; } + simdjson_inline operator int8x16_t&() { return this->value; } + + // Zero constructor + simdjson_inline simd8() : simd8(zero()) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(make_int8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t 
v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); } + + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. + // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14 + // and relatively ugly and hard to read. +#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO + simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} +#endif + simdjson_inline explicit operator simd8() const { return vreinterpretq_u8_s8(this->value); } + + // Math + simdjson_inline simd8 operator+(const simd8 other) const { return vaddq_s8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return vsubq_s8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return vmaxq_s8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return vminq_s8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return vcltq_s8(*this, other); } + simdjson_inline simd8 operator==(const simd8 other) const { return vceqq_s8(*this, other); } + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_s8(prev_chunk, *this, 16 - N); + } + + // Perform a lookup assuming no value is larger than 16 + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdjson_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_s8(*this, simd8(original)); + } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} 
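// Illustrative sketch (standalone, not part of the library code): a scalar model of
// simd8x64<uint8_t>::eq(c), defined further below. Bit i of the result marks
// input[i] == c within a 64-byte block; the ARM kernel gets there with four 16-byte
// vector compares whose per-chunk masks are concatenated by to_bitmask().
#include <cstdint>
#include <cstring>
#include <cassert>

static uint64_t eq_mask_scalar(const uint8_t block[64], uint8_t c) {
  uint64_t mask = 0;
  for (int i = 0; i < 64; i++) {
    mask |= uint64_t(block[i] == c) << i;
  }
  return mask;
}

int main() {
  uint8_t block[64];
  std::memset(block, ' ', sizeof(block));
  block[7] = '"';
  block[12] = '"';
  assert(eq_mask_scalar(block, '"') == ((uint64_t(1) << 7) | (uint64_t(1) << 12)));
  return 0;
}
// --- end of sketch ---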
+ simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + this->chunks[2].store(ptr+sizeof(simd8)*2); + this->chunks[3].store(ptr+sizeof(simd8)*3); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0); + // compute the prefix sum of the popcounts of each byte + uint64_t offsets = popcounts * 0x0101010101010101; + this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]); + this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]); + this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]); + this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]); + return offsets >> 56; + } + + simdjson_inline uint64_t to_bitmask() const { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask); + uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_SIMD_H +/* end file include/simdjson/arm64/simd.h */ +/* begin file include/simdjson/generic/jsoncharutils.h */ + +namespace simdjson { +namespace arm64 { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly 
speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. +// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r +} + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +using internal::value128; + +simdjson_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file include/simdjson/generic/jsoncharutils.h */ +/* begin file include/simdjson/generic/atomparsing.h */ +namespace simdjson { +namespace arm64 { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. 
+// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. +simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. +simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file include/simdjson/generic/atomparsing.h */ +/* begin file include/simdjson/arm64/stringparsing.h */ +#ifndef SIMDJSON_ARM64_STRINGPARSING_H +#define SIMDJSON_ARM64_STRINGPARSING_H + + +namespace simdjson { +namespace arm64 { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
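// Illustrative example (hand-picked input, not from the simdjson sources):
// if the 32 bytes examined by copy_and_find() start with  a \ " b ...  then
//   bs_bits    = 0b010   (bit 1: the backslash at index 1)
//   quote_bits = 0b100   (bit 2: the quote at index 2)
// so has_backslash() is true and has_quote_first() is false: the backslash at
// index 1 precedes the quote, so the string parser treats that quote as part of
// an escape sequence rather than as the end of the string.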
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); + + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we + // smash them together into a 64-byte mask and get the bitmask from there. + uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_STRINGPARSING_H +/* end file include/simdjson/arm64/stringparsing.h */ +/* begin file include/simdjson/arm64/numberparsing.h */ +#ifndef SIMDJSON_ARM64_NUMBERPARSING_H +#define SIMDJSON_ARM64_NUMBERPARSING_H + +namespace simdjson { +namespace arm64 { +namespace { + +// we don't have SSE, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + uint64_t val; + std::memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +/* begin file include/simdjson/generic/numberparsing.h */ +#include + +namespace simdjson { +namespace arm64 { + +namespace ondemand { +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer /// a positive integer larger or equal to 1<<63 +}; +} + +namespace { +/// @private +namespace numberparsing { + + + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) +#endif + +namespace { +// Convert a mantissa, an exponent and a 
sign bit into an ieee64 double. +// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). +// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. +simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { + double d; + mantissa &= ~(1ULL << 52); + mantissa |= real_exponent << 52; + mantissa |= ((static_cast(negative)) << 63); + std::memcpy(&d, &mantissa, sizeof(d)); + return d; +} +} +// Attempts to compute i * 10^(power) exactly; and if "negative" is +// true, negate the result. +// This function will only work in some cases, when it does not work, success is +// set to false. This should work *most of the time* (like 99% of the time). +// We assume that power is in the [smallest_power, +// largest_power] interval: the caller is responsible for this check. +simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) { + d = d / simdjson::internal::power_of_ten[-power]; + } else { + d = d * simdjson::internal::power_of_ten[power]; + } + if (negative) { + d = -d; + } + return true; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It was + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + + // The fast path has now failed, so we are failing back on the slower path. + + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). 
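  //  (Worked instance: for power = 10, floor(log(5**10)/log(2)) = floor(23.25) = 23,
  //   so the exponent term is 1024 + 63 + 10 + 23 = 1120; the lines below show how
  //   this floor is computed without calling log().)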
+ // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). + // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. + // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". + // We take the most significant 64-bits, and we put them in + // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q + // to the desired approximation using one multiplication. Sometimes it does not suffice. 
+ // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and + // then we get a better approximation to i * 5^q. In very rare cases, even that + // will not suffice, though it is seemingly very hard to find such a scenario. + // + // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat + // more complicated. + // + // There is an extra layer of complexity in that we need more than 55 bits of + // accuracy in the round-to-even scenario. + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { firstproduct.high++; } + // At this point, we might need to add at most one to firstproduct, but this + // can only change the value of firstproduct.high if firstproduct.low is maximal. + if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { + // This is very unlikely, but if so, we need to do much more work! + return false; + } + } + uint64_t lower = firstproduct.low; + uint64_t upper = firstproduct.high; + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + + // Here we have mantissa < (1<<54). + int64_t real_exponent = exponent - lz; + if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? + // Here have that real_exponent <= 0 so -real_exponent >= 0 + if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + d = negative ? -0.0 : 0.0; + return true; + } + // next line is safe because -real_exponent + 1 < 0 + mantissa >>= -real_exponent + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + mantissa += (mantissa & 1); // round up + mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; + d = to_double(mantissa, real_exponent, negative); + return true; + } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + // + // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] + // times a power of two. That is, it is right between a number with binary significand + // m and another number with binary significand m+1; and it must be the case + // that it cannot be represented by a float itself. 
+ // + // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. + // Recall that 10^q = 5^q * 2^q. + // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that + // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. + // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so + // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have + // 2^{53} x 5^{-q} < 2^{64}. + // Hence we have 5^{-q} < 2^{11}$ or q>= -4. + // + // We require lower <= 1 and not lower == 0 because we could not prove that + // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. + if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { + if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { + mantissa &= ~1; // flip it so that we do not round up + } + } + + mantissa += mantissa & 1; + mantissa >>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + real_exponent++; + } + mantissa &= ~(1ULL << 52); + // we have to check that real_exponent is in range, otherwise we bail out + if (simdjson_unlikely(real_exponent > 2046)) { + // We have an infinite value!!! We could actually throw an error here if we could. + return false; + } + d = to_double(mantissa, real_exponent, negative); + return true; +} + +// We call a fallback floating-point parser that might be slow. Note +// it will accept JSON numbers, but the JSON spec. is more restrictive so +// before you call parse_float_fallback, you need to have validated the input +// string with the JSON grammar. +// It will return an error (false) if the parsed number is infinite. +// The string parsing itself always succeeds. We know that there is at least +// one digit. +static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). 
+ return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { + double d; + if (parse_float_fallback(src, &d)) { + writer.append_double(d); + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + return true; +} + +simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. + // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. + + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. 
It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? -exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! + if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens + // because slow_float_parsing is a non-inlined function. If we passed our writer reference to + // it, it would force it to be stored in memory, preventing the compiler from picking it apart + // and putting into registers. i.e. if we pass it as reference, it gets slow. 
+ // This is what forces the skip_double, as well. + error_code error = slow_float_parsing(src, writer); + writer.skip_double(); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. + return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. + if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. 
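//
// For example (illustrative only; the buffer and writer names are hypothetical):
// to parse the stand-alone document "123", which has no trailing structural
// character, copy it into a padded buffer and append a space first:
//
//   uint8_t buf[3 + SIMDJSON_PADDING];
//   std::memcpy(buf, "123", 3);
//   buf[3] = ' ';                  // the parser now sees whitespace after the digits
//   // ... then call parse_number(buf, writer) on the copy.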
+template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. 
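// For example, per the table below: the ',' after "123," maps to SUCCESS, the '.'
// after "123." maps to INCORRECT_TYPE (the token continues as a float), and the
// 'x' after "123x" maps to NUMBER_ERROR.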
+// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. +// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. 
+ // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". 
+ // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + // We have an integer. + // If the number is negative and valid, it must be a signed integer. + if(negative) { return ondemand::number_type::signed_integer; } + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + int digit_count = int(p - src); + if(digit_count >= 19) { + const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + if((digit_count >= 20) || (memcmp(src, smaller_big_integer, 19) >= 0)) { + return ondemand::number_type::unsigned_integer; + } + } + return ondemand::number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. 
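To make the classification in get_number_type above concrete: once a leading minus has been ruled out, a digit run is classified as unsigned only when it has 20 or more digits, or exactly 19 digits that compare greater than or equal to "9223372036854775808" (INT64_MAX + 1); because the strings have equal length, memcmp acts as a numeric comparison. A small illustration (the helper name is made up for this sketch and is not simdjson API):

#include <cassert>
#include <cstddef>
#include <cstring>

// Mirrors the digit_count / memcmp rule of get_number_type for non-negative integers.
static bool classified_as_unsigned(const char *digits, std::size_t digit_count) {
    if (digit_count < 19) { return false; }
    if (digit_count >= 20) { return true; }
    return std::memcmp(digits, "9223372036854775808", 19) >= 0;
}

int main() {
    assert(!classified_as_unsigned("9223372036854775807", 19)); // INT64_MAX still fits int64_t
    assert(classified_as_unsigned("9223372036854775808", 19));  // INT64_MAX + 1 does not
    assert(classified_as_unsigned("18446744073709551615", 20)); // UINT64_MAX
    assert(!classified_as_unsigned("42", 2));
    return 0;
}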
+ return ondemand::number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
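A brief aside on the decimal handling above: the parser never accumulates a float digit by digit. Integer and fractional digits all go into the single uint64_t accumulator `i`, and each fractional digit pushes the decimal exponent down by one, so the value is conceptually i * 10^exponent (with any explicit e/E exponent added on top). A toy sketch of that decomposition, using std::pow purely for illustration (the real code goes through compute_float_64 or the correctly rounded fallback):

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
    // "3.1415" accumulates as i = 31415 with exponent = -4, i.e. 31415 * 10^-4.
    uint64_t i = 31415;
    int64_t exponent = -4;
    double d = double(i) * std::pow(10.0, double(exponent));
    assert(std::fabs(d - 3.1415) < 1e-12);
    return 0;
}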
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} +} //namespace {} +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing +} // unnamed namespace +} // namespace arm64 +} // namespace simdjson +/* end file include/simdjson/generic/numberparsing.h */ + +#endif // SIMDJSON_ARM64_NUMBERPARSING_H +/* end file include/simdjson/arm64/numberparsing.h */ +/* begin file include/simdjson/arm64/end.h */ +/* end file include/simdjson/arm64/end.h */ + +#endif // SIMDJSON_IMPLEMENTATION_ARM64 + +#endif // SIMDJSON_ARM64_H +/* end file include/simdjson/arm64.h */ +/* begin file include/simdjson/fallback.h */ +#ifndef SIMDJSON_FALLBACK_H +#define SIMDJSON_FALLBACK_H + + +#if SIMDJSON_IMPLEMENTATION_FALLBACK + +namespace simdjson { +/** + * Fallback implementation (runs on any machine). + */ +namespace fallback { +} // namespace fallback +} // namespace simdjson + +/* begin file include/simdjson/fallback/implementation.h */ +#ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H +#define SIMDJSON_FALLBACK_IMPLEMENTATION_H + + +namespace simdjson { +namespace fallback { + +namespace { +using namespace simdjson; +using namespace simdjson::dom; +} + +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation( + "fallback", + "Generic fallback implementation", + 0 + ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H +/* end file include/simdjson/fallback/implementation.h */ + +/* begin file include/simdjson/fallback/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "fallback" +// #define SIMDJSON_IMPLEMENTATION fallback +/* end file include/simdjson/fallback/begin.h */ + +// Declarations +/* begin file include/simdjson/generic/dom_parser_implementation.h */ + +namespace simdjson { +namespace fallback { + +// expectation: sizeof(open_container) = 64/8. 
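A short note on the *_in_string helpers that closed out the arm64 number parsing above (parse_unsigned_in_string, parse_integer_in_string, parse_double_in_string): they expect src to point at the opening quote of a JSON string whose entire content is the number, which is why they start reading at src + 1 and require a closing '"' rather than a structural or whitespace terminator. A sketch of the input shape they assume (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
    // e.g. the value in {"price":"19.99"}; internally src points at the opening quote.
    const char json[] = "\"19.99\"";
    const uint8_t *src = reinterpret_cast<const uint8_t *>(json); // src[0] == '"'
    bool negative = (src[1] == '-'); // the sign check skips the quote, as in the code above
    std::printf("negative=%d, first digit=%c\n", int(negative), src[1]);
    return 0;
}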
+struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace fallback +} // namespace simdjson + +namespace simdjson { +namespace fallback { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace fallback +} // namespace simdjson +/* end file include/simdjson/generic/dom_parser_implementation.h */ +/* begin file include/simdjson/fallback/bitmanipulation.h */ +#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H +#define SIMDJSON_FALLBACK_BITMANIPULATION_H + +#include + +namespace simdjson { +namespace fallback { +namespace { + +#if 
defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64) +static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { + unsigned long x0 = (unsigned long)x, top, bottom; + _BitScanForward(&top, (unsigned long)(x >> 32)); + _BitScanForward(&bottom, x0); + *ret = x0 ? bottom : 32 + top; + return x != 0; +} +static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { + unsigned long x1 = (unsigned long)(x >> 32), top, bottom; + _BitScanReverse(&top, x1); + _BitScanReverse(&bottom, (unsigned long)x); + *ret = x1 ? top + 32 : bottom; + return x != 0; +} +#endif + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#ifdef _MSC_VER + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// _MSC_VER +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H +/* end file include/simdjson/fallback/bitmanipulation.h */ +/* begin file include/simdjson/generic/jsoncharutils.h */ + +namespace simdjson { +namespace fallback { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. +// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... 
+ //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r +} + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +using internal::value128; + +simdjson_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file include/simdjson/generic/jsoncharutils.h */ +/* begin file include/simdjson/generic/atomparsing.h */ +namespace simdjson { +namespace fallback { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. +// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. +simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. 
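The two comments just above are easy to verify: an unaligned 4-byte load done with memcpy is well defined, typically compiles to a single 32-bit load, and when applied to a string literal folds to a compile-time constant (1936482662 for "fals" on a little-endian target, as quoted). A standalone sketch of the same XOR-compare idea that str4ncmp (defined next) uses, with a hypothetical helper name:

#include <cassert>
#include <cstdint>
#include <cstring>

// Unaligned 4-byte load, same trick as string_to_uint32 / str4ncmp.
static uint32_t load4(const void *p) {
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));
    return v;
}

int main() {
    const char json[] = "false}"; // an atom followed by a structural character
    // XOR of the two 4-byte loads is zero exactly when the four bytes match.
    assert((load4(json) ^ load4("fals")) == 0);
    assert((load4("true,") ^ load4("fals")) != 0);
    return 0;
}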
+simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file include/simdjson/generic/atomparsing.h */ +/* begin file include/simdjson/fallback/stringparsing.h */ +#ifndef SIMDJSON_FALLBACK_STRINGPARSING_H +#define SIMDJSON_FALLBACK_STRINGPARSING_H + + +namespace simdjson { +namespace fallback { +namespace { + +// Holds backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 1; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return c == '"'; } + simdjson_inline bool has_backslash() { return c == '\\'; } + simdjson_inline int quote_index() { return c == '"' ? 0 : 1; } + simdjson_inline int backslash_index() { return c == '\\' ? 
0 : 1; } + + uint8_t c; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // store to dest unconditionally - we can overwrite the bits we don't like later + dst[0] = src[0]; + return { src[0] }; +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_STRINGPARSING_H +/* end file include/simdjson/fallback/stringparsing.h */ +/* begin file include/simdjson/fallback/numberparsing.h */ +#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_H +#define SIMDJSON_FALLBACK_NUMBERPARSING_H + +#ifdef JSON_TEST_NUMBERS // for unit testing +void found_invalid_number(const uint8_t *buf); +void found_integer(int64_t result, const uint8_t *buf); +void found_unsigned_integer(uint64_t result, const uint8_t *buf); +void found_float(double result, const uint8_t *buf); +#endif + +namespace simdjson { +namespace fallback { +namespace { +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +static simdjson_inline uint32_t parse_eight_digits_unrolled(const char *chars) { + uint64_t val; + memcpy(&val, chars, sizeof(uint64_t)); + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + return parse_eight_digits_unrolled(reinterpret_cast(chars)); +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +/* begin file include/simdjson/generic/numberparsing.h */ +#include + +namespace simdjson { +namespace fallback { + +namespace ondemand { +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer /// a positive integer larger or equal to 1<<63 +}; +} + +namespace { +/// @private +namespace numberparsing { + + + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) +#endif + +namespace { +// Convert a mantissa, an exponent and a sign bit into an ieee64 double. +// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). +// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. +simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { + double d; + mantissa &= ~(1ULL << 52); + mantissa |= real_exponent << 52; + mantissa |= ((static_cast(negative)) << 63); + std::memcpy(&d, &mantissa, sizeof(d)); + return d; +} +} +// Attempts to compute i * 10^(power) exactly; and if "negative" is +// true, negate the result. +// This function will only work in some cases, when it does not work, success is +// set to false. 
This should work *most of the time* (like 99% of the time). +// We assume that power is in the [smallest_power, +// largest_power] interval: the caller is responsible for this check. +simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) { + d = d / simdjson::internal::power_of_ten[-power]; + } else { + d = d * simdjson::internal::power_of_ten[power]; + } + if (negative) { + d = -d; + } + return true; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It was + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + + // The fast path has now failed, so we are failing back on the slower path. + + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). + // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). 
+ // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. + // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". + // We take the most significant 64-bits, and we put them in + // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q + // to the desired approximation using one multiplication. Sometimes it does not suffice. + // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and + // then we get a better approximation to i * 5^q. In very rare cases, even that + // will not suffice, though it is seemingly very hard to find such a scenario. + // + // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat + // more complicated. + // + // There is an extra layer of complexity in that we need more than 55 bits of + // accuracy in the round-to-even scenario. 
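An aside on the fast path near the top of compute_float_64: when the decimal significand fits in 53 bits and |power| <= 22, both the significand and 10^|power| are exact doubles, so a single multiply or divide is already correctly rounded and none of the 128-bit machinery below is needed. A minimal worked example, not the library code:

#include <cassert>
#include <cstdint>

int main() {
    // "1.2345e2" accumulates to i = 12345 with a net power of ten of -2.
    uint64_t i = 12345;
    int power = -2;
    assert(i <= 9007199254740991ULL);    // the 2^53 - 1 bound used above
    assert(-22 <= power && power <= 22); // 10^2 is an exact double
    double d = double(i) / 100.0;        // one correctly rounded division
    assert(d == 123.45);                 // equal to the rounded double literal
    return 0;
}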
+ // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { firstproduct.high++; } + // At this point, we might need to add at most one to firstproduct, but this + // can only change the value of firstproduct.high if firstproduct.low is maximal. + if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { + // This is very unlikely, but if so, we need to do much more work! + return false; + } + } + uint64_t lower = firstproduct.low; + uint64_t upper = firstproduct.high; + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + + // Here we have mantissa < (1<<54). + int64_t real_exponent = exponent - lz; + if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? + // Here have that real_exponent <= 0 so -real_exponent >= 0 + if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + d = negative ? -0.0 : 0.0; + return true; + } + // next line is safe because -real_exponent + 1 < 0 + mantissa >>= -real_exponent + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + mantissa += (mantissa & 1); // round up + mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; + d = to_double(mantissa, real_exponent, negative); + return true; + } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + // + // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] + // times a power of two. That is, it is right between a number with binary significand + // m and another number with binary significand m+1; and it must be the case + // that it cannot be represented by a float itself. + // + // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. + // Recall that 10^q = 5^q * 2^q. + // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that + // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. + // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so + // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have + // 2^{53} x 5^{-q} < 2^{64}. + // Hence we have 5^{-q} < 2^{11}$ or q>= -4. 
+ // + // We require lower <= 1 and not lower == 0 because we could not prove that + // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. + if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { + if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { + mantissa &= ~1; // flip it so that we do not round up + } + } + + mantissa += mantissa & 1; + mantissa >>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + real_exponent++; + } + mantissa &= ~(1ULL << 52); + // we have to check that real_exponent is in range, otherwise we bail out + if (simdjson_unlikely(real_exponent > 2046)) { + // We have an infinite value!!! We could actually throw an error here if we could. + return false; + } + d = to_double(mantissa, real_exponent, negative); + return true; +} + +// We call a fallback floating-point parser that might be slow. Note +// it will accept JSON numbers, but the JSON spec. is more restrictive so +// before you call parse_float_fallback, you need to have validated the input +// string with the JSON grammar. +// It will return an error (false) if the parsed number is infinite. +// The string parsing itself always succeeds. We know that there is at least +// one digit. +static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). 
+ return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { + double d; + if (parse_float_fallback(src, &d)) { + writer.append_double(d); + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + return true; +} + +simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. + // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. + + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. 
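A side note on the SWAR helpers used earlier in this section (is_made_of_eight_digits_fast and parse_eight_digits_unrolled, which parse_decimal uses when a number has a long run of fractional digits): eight ASCII digits are converted in three multiply-and-shift steps, pairing digits into 2, then 4, then 8 digit groups. The arithmetic below is transcribed only to illustrate it and assumes a little-endian byte order, which the trick relies on:

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t eight_digits(const char *chars) {
    uint64_t val;
    std::memcpy(&val, chars, sizeof(val));
    val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;     // 8 x 1 digit -> 4 x 2 digits
    val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; // 4 x 2 digits -> 2 x 4 digits
    return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); // -> 8 digits
}

int main() {
    assert(eight_digits("12345678") == 12345678u);
    assert(eight_digits("00000042") == 42u);
    return 0;
}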
It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? -exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! + if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens + // because slow_float_parsing is a non-inlined function. If we passed our writer reference to + // it, it would force it to be stored in memory, preventing the compiler from picking it apart + // and putting into registers. i.e. if we pass it as reference, it gets slow. 
+ // This is what forces the skip_double, as well. + error_code error = slow_float_parsing(src, writer); + writer.skip_double(); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. + return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. + if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. 
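One practical consequence of the comment block above: because parse_number insists that the digits are followed by a structural or whitespace byte, a caller holding a bare number that ends exactly at the end of its buffer is expected to re-buffer it with an appended space before invoking the parser. A caller-side sketch of that convention (illustrative, not the library's own padding machinery):

#include <cstring>
#include <string>

int main() {
    const char *raw = "123"; // a document that is nothing but a number
    std::string padded(raw, std::strlen(raw));
    padded.push_back(' ');   // guarantees the byte after the last digit is whitespace
    // padded.data() can now be handed to the number parser safely.
    return 0;
}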
+template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. 
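For reference, the table defined below is a 256-entry memoization of the branchy classification quoted in the comments above; both decide what the byte terminating a digit run means. A sketch of the equivalent branchy form, with a hypothetical stand-in for the jsoncharutils helper:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in approximating jsoncharutils::is_structural_or_whitespace.
static bool structural_or_whitespace(uint8_t c) {
    switch (c) {
    case ' ': case '\t': case '\n': case '\r':
    case ',': case ':': case '[': case ']': case '{': case '}':
        return true;
    default:
        return false;
    }
}

enum finish_result { finish_success, finish_incorrect_type, finish_number_error };

// Branchy equivalent of one integer_string_finisher[*p] lookup.
static finish_result classify_terminator(uint8_t c) {
    if (structural_or_whitespace(c)) { return finish_success; }
    return (c == '.' || c == 'e' || c == 'E') ? finish_incorrect_type : finish_number_error;
}

int main() {
    assert(classify_terminator(',') == finish_success);
    assert(classify_terminator('e') == finish_incorrect_type);
    assert(classify_terminator('x') == finish_number_error);
    return 0;
}

The table folds the probe and the three comparisons into a single load, at the cost of 256 bytes, which is why the entries must fit in a uint8_t as the static_asserts below verify.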
+// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. +// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. 
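The constant 1,553,255,926,290,448,384 in the overflow bullets above is not magic: the largest 20-digit value that starts with '1' is 19,999,999,999,999,999,999, so if the i = 10*i + digit accumulation wrapped past 2^64 the stored result is at most 19,999,999,999,999,999,999 - 2^64 = 1,553,255,926,290,448,383, comfortably below any genuine 20-digit value (which is at least 10^19). A quick standalone check of that wrap-around bound:

#include <cassert>
#include <cstdint>

static uint64_t accumulate(const char *digits) {
    uint64_t i = 0;
    while (*digits) { i = 10 * i + uint64_t(*digits++ - '0'); } // wrapping is deliberate
    return i;
}

int main() {
    // 2^64 itself ("18446744073709551616", 20 digits starting with '1') wraps to 0.
    assert(accumulate("18446744073709551616") == 0);
    // The largest 20-digit number starting with '1' wraps to the bound quoted above.
    assert(accumulate("19999999999999999999") == 1553255926290448383ULL);
    return 0;
}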
+ // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". 
+ // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
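+ // Note that digit_count excludes any leading minus sign: both p and start_digits
+ // already point past it.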
+ size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + // We have an integer. + // If the number is negative and valid, it must be a signed integer. + if(negative) { return ondemand::number_type::signed_integer; } + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + int digit_count = int(p - src); + if(digit_count >= 19) { + const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + if((digit_count >= 20) || (memcmp(src, smaller_big_integer, 19) >= 0)) { + return ondemand::number_type::unsigned_integer; + } + } + return ondemand::number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. 
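+ // For example, "1.0", "1e2" and "-1.5" land here, whereas "-1" and
+ // "9223372036854775808" were classified above as signed and unsigned integers.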
+ return ondemand::number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
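+ // Here (p - src - 1) is the total digit count: the span from src to p covers the
+ // integer digits, the '.' and the decimal digits, and the -1 discounts the '.'.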
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} +} //namespace {} +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing +} // unnamed namespace +} // namespace fallback +} // namespace simdjson +/* end file include/simdjson/generic/numberparsing.h */ + +#endif // SIMDJSON_FALLBACK_NUMBERPARSING_H +/* end file include/simdjson/fallback/numberparsing.h */ +/* begin file include/simdjson/fallback/end.h */ +/* end file include/simdjson/fallback/end.h */ + +#endif // SIMDJSON_IMPLEMENTATION_FALLBACK +#endif // SIMDJSON_FALLBACK_H +/* end file include/simdjson/fallback.h */ +/* begin file include/simdjson/icelake.h */ +#ifndef SIMDJSON_ICELAKE_H +#define SIMDJSON_ICELAKE_H + + +#if SIMDJSON_IMPLEMENTATION_ICELAKE + +#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE +#define SIMDJSON_TARGET_ICELAKE +#define SIMDJSON_UNTARGET_ICELAKE +#else +#define SIMDJSON_TARGET_ICELAKE SIMDJSON_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,pclmul,lzcnt") +#define SIMDJSON_UNTARGET_ICELAKE SIMDJSON_UNTARGET_REGION +#endif + +namespace simdjson { +/** + * Implementation for Icelake (Intel AVX512). 
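+ * It is only selected at runtime when the CPU reports support for the AVX-512
+ * subsets listed in the implementation's instruction_set flags.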
+ */ +namespace icelake { +} // namespace icelake +} // namespace simdjson + +// +// These two need to be included outside SIMDJSON_TARGET_ICELAKE +// +/* begin file include/simdjson/icelake/implementation.h */ +#ifndef SIMDJSON_ICELAKE_IMPLEMENTATION_H +#define SIMDJSON_ICELAKE_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE +namespace simdjson { +namespace icelake { + +using namespace simdjson; + +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation( + "icelake", + "Intel/AMD AVX512", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512F | internal::instruction_set::AVX512DQ | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 + ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_IMPLEMENTATION_H +/* end file include/simdjson/icelake/implementation.h */ +/* begin file include/simdjson/icelake/intrinsics.h */ +#ifndef SIMDJSON_ICELAKE_INTRINSICS_H +#define SIMDJSON_ICELAKE_INTRINSICS_H + + +#ifdef SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO + +#ifdef SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdjson, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// Important: we need the AVX-512 headers: +#include +#include +#include +#include +#include +#include +#include +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. 
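+// (_blsr_u64 clears the lowest set bit, e.g. _blsr_u64(0b1100) == 0b1000.)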
+#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDJSON_CLANG_VISUAL_STUDIO + +static_assert(sizeof(__m512i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for icelake"); + +#endif // SIMDJSON_ICELAKE_INTRINSICS_H +/* end file include/simdjson/icelake/intrinsics.h */ + +// +// The rest need to be inside the region +// +/* begin file include/simdjson/icelake/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "icelake" +// #define SIMDJSON_IMPLEMENTATION icelake +SIMDJSON_TARGET_ICELAKE +/* end file include/simdjson/icelake/begin.h */ + +// Declarations +/* begin file include/simdjson/generic/dom_parser_implementation.h */ + +namespace simdjson { +namespace icelake { + +// expectation: sizeof(open_container) = 64/8. +struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace icelake +} // namespace simdjson + +namespace simdjson { +namespace icelake { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + 
n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace icelake +} // namespace simdjson +/* end file include/simdjson/generic/dom_parser_implementation.h */ +/* begin file include/simdjson/icelake/bitmanipulation.h */ +#ifndef SIMDJSON_ICELAKE_BITMANIPULATION_H +#define SIMDJSON_ICELAKE_BITMANIPULATION_H + +namespace simdjson { +namespace icelake { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return _blsr_u64(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { + return int(_lzcnt_u64(input_num)); +} + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdjson_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_BITMANIPULATION_H +/* end file include/simdjson/icelake/bitmanipulation.h */ +/* begin file include/simdjson/icelake/bitmask.h */ +#ifndef SIMDJSON_ICELAKE_BITMASK_H +#define SIMDJSON_ICELAKE_BITMASK_H + +namespace simdjson { +namespace icelake { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processor supporting avx2 + // but not clmul. 
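+ // A carry-less multiply by an all-ones constant makes each output bit the xor of
+ // all input bits at or below it, which is exactly the prefix xor we want.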
+ __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_BITMASK_H +/* end file include/simdjson/icelake/bitmask.h */ +/* begin file include/simdjson/icelake/simd.h */ +#ifndef SIMDJSON_ICELAKE_SIMD_H +#define SIMDJSON_ICELAKE_SIMD_H + + + + +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ == 8 +#define SIMDJSON_GCC8 1 +#endif // __GNUC__ == 8 +#endif // defined(__GNUC__) && !defined(__clang__) + +#if SIMDJSON_GCC8 +/** + * GCC 8 fails to provide _mm512_set_epi8. We roll our own. + */ +inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) { + return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56), + uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56), + uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56), + uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56), + uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56), + uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56), + uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56), + uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56)); +} +#endif // SIMDJSON_GCC8 + + + +namespace simdjson { +namespace icelake { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. 
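+ // Each of the simd8 types below wraps one 64-byte __m512i register, so a single
+ // register covers a full 64-byte block of input (see the static_assert in simd8x64).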
+ template + struct base { + __m512i value; + + // Zero constructor + simdjson_inline base() : value{__m512i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m512i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m512i&() const { return this->value; } + simdjson_inline operator __m512i&() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return _mm512_or_si512(*this, other); } + simdjson_inline Child operator&(const Child other) const { return _mm512_and_si512(*this, other); } + simdjson_inline Child operator^(const Child other) const { return _mm512_xor_si512(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return _mm512_andnot_si512(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m512i _value) : base>(_value) {} + + friend simdjson_really_inline uint64_t operator==(const simd8 lhs, const simd8 rhs) { + return _mm512_cmpeq_epi8_mask(lhs, rhs); + } + + static const int SIZE = sizeof(base::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { +#if SIMDJSON_GCC8 + // workaround for compilers unable to figure out that 16 - N is a constant (GCC 8) + constexpr int shift = 16 - N; + return _mm512_alignr_epi8(*this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *this), shift); +#else + return _mm512_alignr_epi8(*this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *this), 16 - N); +#endif + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return _mm512_set1_epi8(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m512i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + simdjson_inline bool any() const { return !!_mm512_test_epi8_mask (*this, *this); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return _mm512_set1_epi8(_value); } + static simdjson_inline simd8 zero() { return _mm512_setzero_si512(); } + static simdjson_inline simd8 load(const T values[64]) { + return _mm512_loadu_si512(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, 
v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m512i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[64]) const { return _mm512_storeu_si512(reinterpret_cast<__m512i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return _mm512_add_epi8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return _mm512_sub_epi8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm512_shuffle_epi8(lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. + template + simdjson_inline void compress(uint64_t mask, L * output) const { + _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m512i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t values[64]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31, + int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39, + int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47, + int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55, + int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63 + ) : simd8(_mm512_set_epi8( + v63, 
v62, v61, v60, v59, v58, v57, v56, + v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, + v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, + v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 + )) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return _mm512_max_epi8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm512_min_epi8(*this, other); } + + simdjson_inline simd8 operator>(const simd8 other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(*this, other),_mm512_set1_epi8(uint8_t(0x80))); } + simdjson_inline simd8 operator<(const simd8 other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(other, *this),_mm512_set1_epi8(uint8_t(0x80))); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m512i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[64]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31, + uint8_t v32, uint8_t v33, uint8_t v34, uint8_t v35, uint8_t v36, uint8_t v37, uint8_t v38, uint8_t v39, + uint8_t v40, uint8_t v41, uint8_t v42, uint8_t v43, uint8_t v44, uint8_t v45, uint8_t v46, uint8_t v47, + uint8_t v48, uint8_t v49, uint8_t v50, uint8_t v51, uint8_t v52, uint8_t v53, uint8_t v54, uint8_t v55, + uint8_t v56, uint8_t v57, uint8_t v58, uint8_t v59, uint8_t v60, uint8_t v61, uint8_t v62, uint8_t v63 + ) : simd8(_mm512_set_epi8( + v63, v62, v61, v60, v59, v58, v57, v56, + v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, + v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, + v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 + )) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + 
v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return _mm512_adds_epu8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return _mm512_subs_epu8(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return _mm512_max_epu8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm512_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline uint64_t operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline uint64_t operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return _mm512_mask_blend_epi8(*this == uint8_t(0), _mm512_set1_epi8(0), _mm512_set1_epi8(-1)); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + + simdjson_inline bool is_ascii() const { return _mm512_movepi8_mask(*this) == 0; } + simdjson_inline bool bits_not_set_anywhere() const { + return !_mm512_test_epi8_mask(*this, *this); + } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { return !_mm512_test_epi8_mask(*this, bits); } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(_mm512_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdjson_inline simd8 shl() const { return simd8(_mm512_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template + simdjson_inline uint64_t get_bit() const { return _mm512_movepi8_mask(_mm512_slli_epi16(*this, 7-N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 1, "Icelake kernel should use one register per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdjson_inline simd8x64(const simd8 chunk0) : chunks{chunk0} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr)} {} + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + this->chunks[0].compress(mask, output); + return 64 - count_ones(mask); + } + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + } + + simdjson_inline simd8 reduce_or() const { + return this->chunks[0]; + } + + simdjson_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] | mask + ); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->chunks[0] == mask; + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return this->chunks[0] == other.chunks[0]; + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return this->chunks[0] <= mask; + } + }; // struct simd8x64 + +} // namespace simd + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_SIMD_H +/* end file include/simdjson/icelake/simd.h */ +/* begin file include/simdjson/generic/jsoncharutils.h */ + +namespace simdjson { +namespace icelake { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. 
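+// For example, U+0041 ('A') encodes as 1 byte, U+00E9 as 2 bytes, U+20AC as 3 bytes
+// and U+1F600 as 4 bytes.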
+// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r +} + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +using internal::value128; + +simdjson_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file include/simdjson/generic/jsoncharutils.h */ +/* begin file include/simdjson/generic/atomparsing.h */ +namespace simdjson { +namespace icelake { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. +// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. +simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. 
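+// str4ncmp returns zero when the first four bytes of src match the atom and a
+// non-zero value otherwise, so the callers below can combine it with | and !.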
+simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file include/simdjson/generic/atomparsing.h */ +/* begin file include/simdjson/icelake/stringparsing.h */ +#ifndef SIMDJSON_ICELAKE_STRINGPARSING_H +#define SIMDJSON_ICELAKE_STRINGPARSING_H + + +namespace simdjson { +namespace icelake { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
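+// Each bit of bs_bits and quote_bits flags one input byte; has_quote_first() reports
+// whether the closing quote shows up before any backslash in the block.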
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint64_t bs_bits; + uint64_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 15 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); + return { + static_cast(v == '\\'), // bs_bits + static_cast(v == '"'), // quote_bits + }; +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#endif // SIMDJSON_ICELAKE_STRINGPARSING_H +/* end file include/simdjson/icelake/stringparsing.h */ +/* begin file include/simdjson/icelake/numberparsing.h */ +#ifndef SIMDJSON_ICELAKE_NUMBERPARSING_H +#define SIMDJSON_ICELAKE_NUMBERPARSING_H + +namespace simdjson { +namespace icelake { +namespace { + +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + // this actually computes *16* values so we are being wasteful. + const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest +} + +} // unnamed namespace +} // namespace icelake +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +/* begin file include/simdjson/generic/numberparsing.h */ +#include + +namespace simdjson { +namespace icelake { + +namespace ondemand { +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer /// a positive integer larger or equal to 1<<63 +}; +} + +namespace { +/// @private +namespace numberparsing { + + + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE)) +#define 
+#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
+#endif
+
+namespace {
+// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
+// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
+// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) will be zeroed.
+simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
+  double d;
+  mantissa &= ~(1ULL << 52);
+  mantissa |= real_exponent << 52;
+  mantissa |= ((static_cast<uint64_t>(negative)) << 63);
+  std::memcpy(&d, &mantissa, sizeof(d));
+  return d;
+}
+}
+// Attempts to compute i * 10^(power) exactly; and if "negative" is
+// true, negate the result.
+// This function will only work in some cases; when it does not work, it
+// returns false. This should work *most of the time* (like 99% of the time).
+// We assume that power is in the [smallest_power,
+// largest_power] interval: the caller is responsible for this check.
+simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
+  // we start with a fast path
+  // It was described in
+  // Clinger WD. How to read floating point numbers accurately.
+  // ACM SIGPLAN Notices. 1990
+#ifndef FLT_EVAL_METHOD
+#error "FLT_EVAL_METHOD should be defined, please include cfloat."
+#endif
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  // We cannot be certain that x/y is rounded to nearest.
+  if (0 <= power && power <= 22 && i <= 9007199254740991) {
+#else
+  if (-22 <= power && power <= 22 && i <= 9007199254740991) {
+#endif
+    // convert the integer into a double. This is lossless since
+    // 0 <= i <= 2^53 - 1.
+    d = double(i);
+    //
+    // The general idea is as follows.
+    // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
+    // 1) Both s and p can be represented exactly as 64-bit floating-point
+    // values
+    // (binary64).
+    // 2) Because s and p can be represented exactly as floating-point values,
+    // then s * p
+    // and s / p will produce correctly rounded values.
+    //
+    if (power < 0) {
+      d = d / simdjson::internal::power_of_ten[-power];
+    } else {
+      d = d * simdjson::internal::power_of_ten[power];
+    }
+    if (negative) {
+      d = -d;
+    }
+    return true;
+  }
+  // When 22 < power && power < 22 + 16, we could
+  // hope for another, secondary fast path. It was
+  // described by David M. Gay in "Correctly rounded
+  // binary-decimal and decimal-binary conversions." (1990)
+  // If you need to compute i * 10^(22 + x) for x < 16,
+  // first compute i * 10^x, if you know that result is exact
+  // (e.g., when i * 10^x < 2^53),
+  // then you can still proceed and do (i * 10^x) * 10^22.
+  // Is this worth your time?
+  // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
+  // for this second fast path to work.
+  // If you have 22 < power *and* power < 22 + 16, and then you
+  // optimistically compute "i * 10^(x-22)", there is still a chance that you
+  // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
+  // this optimization maybe less common than we would like. Source:
+  // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
+  // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
+
+  // The fast path has now failed, so we are falling back on the slower path.
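+  // [Editor note, illustrative] A concrete fast-path case: "3.14159" arrives
+  // here as i = 314159 and power = -5. Both 314159 and 10^5 are exactly
+  // representable binary64 values, so 314159.0 / 100000.0 is correctly rounded
+  // and the branch above returns early. The slow path below only runs when
+  // power or i fall outside those bounds.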
+ + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). + // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). + // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. + // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". 
+    // We take the most significant 64-bits, and we put them in
+    // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
+    // to the desired approximation using one multiplication. Sometimes it does not suffice.
+    // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
+    // then we get a better approximation to i * 5^q. In very rare cases, even that
+    // will not suffice, though it is seemingly very hard to find such a scenario.
+    //
+    // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
+    // more complicated.
+    //
+    // There is an extra layer of complexity in that we need more than 55 bits of
+    // accuracy in the round-to-even scenario.
+    //
+    // The full_multiplication function computes the 128-bit product of two 64-bit words
+    // with a returned value of type value128 with a "low component" corresponding to the
+    // 64-bit least significant bits of the product and with a "high component" corresponding
+    // to the 64-bit most significant bits of the product.
+    simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
+    firstproduct.low += secondproduct.high;
+    if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
+    // At this point, we might need to add at most one to firstproduct, but this
+    // can only change the value of firstproduct.high if firstproduct.low is maximal.
+    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
+      // This is very unlikely, but if so, we need to do much more work!
+      return false;
+    }
+  }
+  uint64_t lower = firstproduct.low;
+  uint64_t upper = firstproduct.high;
+  // The final mantissa should be 53 bits with a leading 1.
+  // We shift it so that it occupies 54 bits with a leading 1.
+  ///////
+  uint64_t upperbit = upper >> 63;
+  uint64_t mantissa = upper >> (upperbit + 9);
+  lz += int(1 ^ upperbit);
+
+  // Here we have mantissa < (1<<54).
+  int64_t real_exponent = exponent - lz;
+  if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
+    // Here we have that real_exponent <= 0 so -real_exponent >= 0
+    if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
+      d = negative ? -0.0 : 0.0;
+      return true;
+    }
+    // next line is safe because 0 < -real_exponent + 1 < 64 (we just checked the upper bound)
+    mantissa >>= -real_exponent + 1;
+    // Thankfully, we can't have both "round-to-even" and subnormals because
+    // "round-to-even" only occurs for powers close to 0.
+    mantissa += (mantissa & 1); // round up
+    mantissa >>= 1;
+    // There is a weird scenario where we don't have a subnormal but sit just
+    // below the normal threshold. Suppose we start with 2.2250738585072013e-308, we end up
+    // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
+    // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
+    // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
+    // subnormal, but we can only know this after rounding.
+    // So we only declare a subnormal if we are smaller than the threshold.
+    real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
+    d = to_double(mantissa, real_exponent, negative);
+    return true;
+  }
+  // We have to round to even. The "to even" part
+  // is only a problem when we are right in between two floats
+  // which we guard against.
+  // If we have lots of trailing zeros, we may fall right between two
+  // floating-point values.
+  //
+  // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
+  // times a power of two. That is, it is right between a number with binary significand
+  // m and another number with binary significand m+1; and it must be the case
+  // that it cannot be represented by a float itself.
+  //
+  // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
+  // Recall that 10^q = 5^q * 2^q.
+  // When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
+  // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
+  // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
+  // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
+  // 2^{53} x 5^{-q} < 2^{64}.
+  // Hence we have 5^{-q} < 2^{11}, or q >= -4.
+  //
+  // We require lower <= 1 and not lower == 0 because we could not prove that
+  // lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
+  if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
+    if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
+      mantissa &= ~1; // flip it so that we do not round up
+    }
+  }
+
+  mantissa += mantissa & 1;
+  mantissa >>= 1;
+
+  // Here we have mantissa < (1<<53), unless there was an overflow
+  if (mantissa >= (1ULL << 53)) {
+    //////////
+    // This will happen when parsing values such as 7.2057594037927933e+16
+    ////////
+    mantissa = (1ULL << 52);
+    real_exponent++;
+  }
+  mantissa &= ~(1ULL << 52);
+  // we have to check that real_exponent is in range, otherwise we bail out
+  if (simdjson_unlikely(real_exponent > 2046)) {
+    // We have an infinite value! We could report an error here if we wanted to.
+    return false;
+  }
+  d = to_double(mantissa, real_exponent, negative);
+  return true;
+}
+
+// We call a fallback floating-point parser that might be slow. Note
+// that it will accept numbers that are not valid JSON (the JSON spec. is more
+// restrictive), so before you call parse_float_fallback, you need to have
+// validated the input string against the JSON grammar.
+// It will return an error (false) if the parsed number is infinite.
+// The string parsing itself always succeeds. We know that there is at least
+// one digit.
+static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
+  *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
+  // We do not accept infinite values.
+
+  // Detecting finite values in a portable manner is ridiculously hard, ideally
+  // we would want to do:
+  // return !std::isfinite(*outDouble);
+  // but that mysteriously fails under legacy/old libc++ libraries, see
+  // https://github.com/simdjson/simdjson/issues/1286
+  //
+  // Therefore, fall back to this solution (the extra parens are there
+  // to handle that max may be a macro on windows).
+  return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
+}
+static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
+  *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr), reinterpret_cast<const char *>(end_ptr));
+  // We do not accept infinite values.
+ + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { + double d; + if (parse_float_fallback(src, &d)) { + writer.append_double(d); + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + return true; +} + +simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. 
+ // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. + + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? -exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! + if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! 
+ // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens + // because slow_float_parsing is a non-inlined function. If we passed our writer reference to + // it, it would force it to be stored in memory, preventing the compiler from picking it apart + // and putting into registers. i.e. if we pass it as reference, it gets slow. + // This is what forces the skip_double, as well. + error_code error = slow_float_parsing(src, writer); + writer.skip_double(); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. + return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. 
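+    // [Editor note] compute_float_64 only reports failure when the truncated
+    // 128-bit product is too ambiguous to round, or when the value overflows
+    // to infinity; the decimal fallback below resolves those rare inputs.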
+ if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. +template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. 
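+  // [Editor note] INT64_MIN (-9,223,372,036,854,775,808) has 19 digits after
+  // the sign, while UINT64_MAX (18,446,744,073,709,551,615) has 20, hence the
+  // 19/20 split below.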
+ size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. +// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. 
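+// [Editor note, illustrative] Example lookups into the table below:
+// integer_string_finisher[','] and integer_string_finisher[' '] are SUCCESS,
+// ['.'], ['e'] and ['E'] are INCORRECT_TYPE, and any other letter such as
+// ['a'] is NUMBER_ERROR.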
+// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. 
+ // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. 
+ size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + // We have an integer. + // If the number is negative and valid, it must be a signed integer. + if(negative) { return ondemand::number_type::signed_integer; } + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + int digit_count = int(p - src); + if(digit_count >= 19) { + const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + if((digit_count >= 20) || (memcmp(src, smaller_big_integer, 19) >= 0)) { + return ondemand::number_type::unsigned_integer; + } + } + return ondemand::number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. 
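+  // [Editor note, illustrative] For example, "9223372036854775808" (2^63) has 19
+  // digits and compares >= the threshold string above, so it is classified as an
+  // unsigned_integer, while "123" stays a signed_integer and "1e2" reaches the
+  // line below as a floating_point_number.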
+ return ondemand::number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
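+    // [Editor note] p - src - 1 is the digit count so far excluding the '.' just
+    // consumed; more than 19 digits may not fit in the uint64_t accumulator, so
+    // such inputs are routed through the slow path.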
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} +} //namespace {} +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing +} // unnamed namespace +} // namespace icelake +} // namespace simdjson +/* end file include/simdjson/generic/numberparsing.h */ + +#endif // SIMDJSON_ICELAKE_NUMBERPARSING_H +/* end file include/simdjson/icelake/numberparsing.h */ +/* begin file include/simdjson/icelake/end.h */ +SIMDJSON_UNTARGET_ICELAKE +/* end file include/simdjson/icelake/end.h */ + +#endif // SIMDJSON_IMPLEMENTATION_ICELAKE +#endif // SIMDJSON_ICELAKE_H +/* end file include/simdjson/icelake.h */ +/* begin file include/simdjson/haswell.h */ +#ifndef SIMDJSON_HASWELL_H +#define SIMDJSON_HASWELL_H + + +#if SIMDJSON_IMPLEMENTATION_HASWELL + +#if SIMDJSON_CAN_ALWAYS_RUN_HASWELL +#define SIMDJSON_TARGET_HASWELL +#define SIMDJSON_UNTARGET_HASWELL +#else +#define SIMDJSON_TARGET_HASWELL SIMDJSON_TARGET_REGION("avx2,bmi,pclmul,lzcnt") +#define SIMDJSON_UNTARGET_HASWELL SIMDJSON_UNTARGET_REGION +#endif + +namespace simdjson { +/** + * Implementation for Haswell (Intel AVX2). 
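+ * Selected when the CPU supports AVX2 together with BMI, PCLMULQDQ and LZCNT
+ * (see the target region above and the instruction_set flags below).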
+ */ +namespace haswell { +} // namespace haswell +} // namespace simdjson + +// +// These two need to be included outside SIMDJSON_TARGET_HASWELL +// +/* begin file include/simdjson/haswell/implementation.h */ +#ifndef SIMDJSON_HASWELL_IMPLEMENTATION_H +#define SIMDJSON_HASWELL_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL +namespace simdjson { +namespace haswell { + +using namespace simdjson; + +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation( + "haswell", + "Intel/AMD AVX2", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 + ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_IMPLEMENTATION_H +/* end file include/simdjson/haswell/implementation.h */ +/* begin file include/simdjson/haswell/intrinsics.h */ +#ifndef SIMDJSON_HASWELL_INTRINSICS_H +#define SIMDJSON_HASWELL_INTRINSICS_H + + +#ifdef SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO + +#ifdef SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdjson, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. + */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. 
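+// [Editor note, illustrative] _blsr_u64 clears the lowest set bit of its
+// argument, e.g. _blsr_u64(0b10110) == 0b10100; the fallback below computes
+// the same thing with (n - 1) & n.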
+#ifndef _blsr_u64 +// we roll our own +#define _blsr_u64(n) ((n - 1) & n) +#endif // _blsr_u64 +#endif // SIMDJSON_CLANG_VISUAL_STUDIO + +static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for haswell kernel."); + +#endif // SIMDJSON_HASWELL_INTRINSICS_H +/* end file include/simdjson/haswell/intrinsics.h */ + +// +// The rest need to be inside the region +// +/* begin file include/simdjson/haswell/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "haswell" +// #define SIMDJSON_IMPLEMENTATION haswell +SIMDJSON_TARGET_HASWELL +/* end file include/simdjson/haswell/begin.h */ + +// Declarations +/* begin file include/simdjson/generic/dom_parser_implementation.h */ + +namespace simdjson { +namespace haswell { + +// expectation: sizeof(open_container) = 64/8. +struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace haswell +} // namespace simdjson + +namespace simdjson { +namespace haswell { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + 
n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace haswell +} // namespace simdjson +/* end file include/simdjson/generic/dom_parser_implementation.h */ +/* begin file include/simdjson/haswell/bitmanipulation.h */ +#ifndef SIMDJSON_HASWELL_BITMANIPULATION_H +#define SIMDJSON_HASWELL_BITMANIPULATION_H + +namespace simdjson { +namespace haswell { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return _blsr_u64(input_num); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { + return int(_lzcnt_u64(input_num)); +} + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdjson_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_BITMANIPULATION_H +/* end file include/simdjson/haswell/bitmanipulation.h */ +/* begin file include/simdjson/haswell/bitmask.h */ +#ifndef SIMDJSON_HASWELL_BITMASK_H +#define SIMDJSON_HASWELL_BITMASK_H + +namespace simdjson { +namespace haswell { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processor supporting avx2 + // but not clmul. 
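+  // Carry-less multiplication by an all-ones operand XORs together every
+  // left-shifted copy of the input, so bit i of the low 64 bits of the product
+  // is the XOR of input bits 0 through i -- exactly the running parity we want.
+  // E.g. for 00100100 the running parity, read from bit 0 upward, produces
+  // 00011100, matching the example above.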
+ __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_BITMASK_H +/* end file include/simdjson/haswell/bitmask.h */ +/* begin file include/simdjson/haswell/simd.h */ +#ifndef SIMDJSON_HASWELL_SIMD_H +#define SIMDJSON_HASWELL_SIMD_H + + +namespace simdjson { +namespace haswell { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. + template + struct base { + __m256i value; + + // Zero constructor + simdjson_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m256i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m256i&() const { return this->value; } + simdjson_inline operator __m256i&() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + simdjson_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + simdjson_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
+ template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m256i _value) : base>(_value) {} + + friend simdjson_really_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return _mm256_cmpeq_epi8(lhs, rhs); } + + static const int SIZE = sizeof(base::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); } + simdjson_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } + static simdjson_inline simd8 zero() { return _mm256_setzero_si256(); } + static simdjson_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm256_shuffle_epi8(lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. 
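+    // For instance, with input bytes {A,B,C,D,E,...} and a mask whose low bits
+    // are 0b01101 (bit 0 set, bit 1 clear, bits 2 and 3 set, bit 4 clear, ...),
+    // bytes A, C and D are dropped and the output starts {B,E,...}; only the
+    // first 32 - count_ones(mask) output bytes are meaningful.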
+ template + simdjson_inline void compress(uint32_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by work done by @animetosho + // we do it in four steps, first 8 bytes and then second 8 bytes... + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits + uint8_t mask3 = uint8_t(mask >> 16); // ... + uint8_t mask4 = uint8_t(mask >> 24); // ... + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3], + thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask and so forth + shufmask = + _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818, + 0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m256i pruned = _mm256_shuffle_epi8(*this, shufmask); + // we still need to put the pieces back together. + // we compute the popcount of the first words: + int pop1 = BitsSetTable256mul2[mask1]; + int pop3 = BitsSetTable256mul2[mask3]; + + // then load the corresponding mask + // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic. + __m256i v256 = _mm256_castsi128_si256( + _mm_loadu_si128(reinterpret_cast(pshufb_combine_table + pop1 * 8))); + __m256i compactmask = _mm256_insertf128_si256(v256, + _mm_loadu_si128(reinterpret_cast(pshufb_combine_table + pop3 * 8)), 1); + __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask); + // We just need to write out the result. + // This is the tricky bit that is hard to do + // if we want to return a SIMD register, since there + // is no single-instruction approach to recombine + // the two 128-bit lanes with an offset. 
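+      // Instead we do two unaligned 128-bit stores: the low lane goes to output,
+      // and the high lane goes to output + 16 - count_ones(mask & 0xFFFF), i.e.
+      // right after the bytes the low lane actually kept, so the surviving
+      // bytes end up contiguous.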
+ __m128i v128; + v128 = _mm256_castsi256_si128(almostthere); + _mm_storeu_si128( reinterpret_cast<__m128i *>(output), v128); + v128 = _mm256_extractf128_si256(almostthere, 1); + _mm_storeu_si128( reinterpret_cast<__m128i *>(output + 16 - count_ones(mask & 0xFFFF)), v128); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return _mm256_max_epi8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm256_min_epi8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return _mm256_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(_mm256_setr_epi8( + 
v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return _mm256_max_epu8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm256_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdjson_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + simdjson_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdjson_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template + simdjson_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1) : chunks{chunk0, chunk1} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+32)} {} + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + uint32_t mask1 = uint32_t(mask); + uint32_t mask2 = uint32_t(mask >> 32); + this->chunks[0].compress(mask1, output); + this->chunks[1].compress(mask2, output + 32 - count_ones(mask1)); + return 64 - count_ones(mask); + } + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + } + + simdjson_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdjson_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdjson_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_SIMD_H +/* end file include/simdjson/haswell/simd.h */ +/* begin file include/simdjson/generic/jsoncharutils.h */ + +namespace simdjson { +namespace haswell { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code 
point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. +// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r +} + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +using internal::value128; + +simdjson_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file include/simdjson/generic/jsoncharutils.h */ +/* begin file include/simdjson/generic/atomparsing.h */ +namespace simdjson { +namespace haswell { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. +// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. 
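+// For instance, on a little-endian target the four bytes 'f','a','l','s'
+// (0x66, 0x61, 0x6C, 0x73) pack into the 32-bit value 0x736C6166, which is
+// the constant 1936482662 mentioned above.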
+simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. +simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file include/simdjson/generic/atomparsing.h */ +/* begin file include/simdjson/haswell/stringparsing.h */ +#ifndef SIMDJSON_HASWELL_STRINGPARSING_H +#define SIMDJSON_HASWELL_STRINGPARSING_H + + +namespace simdjson { +namespace haswell { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
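+// Bit i of each mask below is set when byte i of the 32-byte block is a
+// backslash (bs_bits) or a quote (quote_bits). A byte cannot be both, so
+// (bs_bits - 1) & quote_bits is nonzero exactly when a quote appears before
+// the first backslash (or when there is a quote but no backslash at all);
+// that is what has_quote_first() tests, and has_backslash() is the mirrored test.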
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 15 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v(src); + // store to dest unconditionally - we can overwrite the bits we don't like later + v.store(dst); + return { + static_cast((v == '\\').to_bitmask()), // bs_bits + static_cast((v == '"').to_bitmask()), // quote_bits + }; +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#endif // SIMDJSON_HASWELL_STRINGPARSING_H +/* end file include/simdjson/haswell/stringparsing.h */ +/* begin file include/simdjson/haswell/numberparsing.h */ +#ifndef SIMDJSON_HASWELL_NUMBERPARSING_H +#define SIMDJSON_HASWELL_NUMBERPARSING_H + +namespace simdjson { +namespace haswell { +namespace { + +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + // this actually computes *16* values so we are being wasteful. + const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +/* begin file include/simdjson/generic/numberparsing.h */ +#include + +namespace simdjson { +namespace haswell { + +namespace ondemand { +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer /// a positive integer larger or equal to 1<<63 +}; +} + +namespace { +/// @private +namespace numberparsing { + + + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) 
(WRITER).append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) +#endif + +namespace { +// Convert a mantissa, an exponent and a sign bit into an ieee64 double. +// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). +// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. +simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { + double d; + mantissa &= ~(1ULL << 52); + mantissa |= real_exponent << 52; + mantissa |= ((static_cast(negative)) << 63); + std::memcpy(&d, &mantissa, sizeof(d)); + return d; +} +} +// Attempts to compute i * 10^(power) exactly; and if "negative" is +// true, negate the result. +// This function will only work in some cases, when it does not work, success is +// set to false. This should work *most of the time* (like 99% of the time). +// We assume that power is in the [smallest_power, +// largest_power] interval: the caller is responsible for this check. +simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) { + d = d / simdjson::internal::power_of_ten[-power]; + } else { + d = d * simdjson::internal::power_of_ten[power]; + } + if (negative) { + d = -d; + } + return true; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It was + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + + // The fast path has now failed, so we are failing back on the slower path. 
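+  // (The fast path covers inputs such as 3.1416, parsed as i = 31416 with
+  // power = -4: both 31416 and 10^4 are exactly representable as doubles, so
+  // 31416.0 / 10000.0 is correctly rounded. Something like 1e-300, or a
+  // 19-digit mantissa, falls through to the code below.)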
+ + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). + // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). + // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. + // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". 
+ // We take the most significant 64-bits, and we put them in + // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q + // to the desired approximation using one multiplication. Sometimes it does not suffice. + // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and + // then we get a better approximation to i * 5^q. In very rare cases, even that + // will not suffice, though it is seemingly very hard to find such a scenario. + // + // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat + // more complicated. + // + // There is an extra layer of complexity in that we need more than 55 bits of + // accuracy in the round-to-even scenario. + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { firstproduct.high++; } + // At this point, we might need to add at most one to firstproduct, but this + // can only change the value of firstproduct.high if firstproduct.low is maximal. + if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { + // This is very unlikely, but if so, we need to do much more work! + return false; + } + } + uint64_t lower = firstproduct.low; + uint64_t upper = firstproduct.high; + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + + // Here we have mantissa < (1<<54). + int64_t real_exponent = exponent - lz; + if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? + // Here have that real_exponent <= 0 so -real_exponent >= 0 + if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + d = negative ? -0.0 : 0.0; + return true; + } + // next line is safe because -real_exponent + 1 < 0 + mantissa >>= -real_exponent + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + mantissa += (mantissa & 1); // round up + mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; + d = to_double(mantissa, real_exponent, negative); + return true; + } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. 
+ // + // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] + // times a power of two. That is, it is right between a number with binary significand + // m and another number with binary significand m+1; and it must be the case + // that it cannot be represented by a float itself. + // + // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. + // Recall that 10^q = 5^q * 2^q. + // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that + // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. + // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so + // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have + // 2^{53} x 5^{-q} < 2^{64}. + // Hence we have 5^{-q} < 2^{11}$ or q>= -4. + // + // We require lower <= 1 and not lower == 0 because we could not prove that + // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. + if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { + if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { + mantissa &= ~1; // flip it so that we do not round up + } + } + + mantissa += mantissa & 1; + mantissa >>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + real_exponent++; + } + mantissa &= ~(1ULL << 52); + // we have to check that real_exponent is in range, otherwise we bail out + if (simdjson_unlikely(real_exponent > 2046)) { + // We have an infinite value!!! We could actually throw an error here if we could. + return false; + } + d = to_double(mantissa, real_exponent, negative); + return true; +} + +// We call a fallback floating-point parser that might be slow. Note +// it will accept JSON numbers, but the JSON spec. is more restrictive so +// before you call parse_float_fallback, you need to have validated the input +// string with the JSON grammar. +// It will return an error (false) if the parsed number is infinite. +// The string parsing itself always succeeds. We know that there is at least +// one digit. +static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. 
+ + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { + double d; + if (parse_float_fallback(src, &d)) { + writer.append_double(d); + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + return true; +} + +simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. 
+ // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. + + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? -exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! + if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! 
+ // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens + // because slow_float_parsing is a non-inlined function. If we passed our writer reference to + // it, it would force it to be stored in memory, preventing the compiler from picking it apart + // and putting into registers. i.e. if we pass it as reference, it gets slow. + // This is what forces the skip_double, as well. + error_code error = slow_float_parsing(src, writer); + writer.skip_double(); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. + return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. 
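+    // compute_float_64 refuses only in rare corner cases (for instance when a
+    // truncated product cannot settle a round-to-even tie), so we reparse with
+    // the slower exact fallback; that call fails only for values outside the
+    // finite double range.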
+ if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. +template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. 
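+  // (INT64_MIN is -9,223,372,036,854,775,808: 19 digits after the sign.
+  // UINT64_MAX is 18,446,744,073,709,551,615: 20 digits.)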
+ size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. +// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. 
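+// Example lookups (ASCII): integer_string_finisher[','] and integer_string_finisher['}']
+// are SUCCESS, integer_string_finisher['.'] and integer_string_finisher['e'] are
+// INCORRECT_TYPE, and integer_string_finisher['a'] is NUMBER_ERROR.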
+// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
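+ // For illustration: "0" is accepted here, while "00" and "0123" are rejected by the
+ // leading-zero check below.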
+ size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. 
+ // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. 
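+ // (Both bounds have exactly 19 digits, so 19-digit inputs are still range-checked
+ // against INT64_MAX + uint64_t(negative) further down.)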
+ size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + // We have an integer. + // If the number is negative and valid, it must be a signed integer. + if(negative) { return ondemand::number_type::signed_integer; } + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + int digit_count = int(p - src); + if(digit_count >= 19) { + const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + if((digit_count >= 20) || (memcmp(src, smaller_big_integer, 19) >= 0)) { + return ondemand::number_type::unsigned_integer; + } + } + return ondemand::number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. 
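+ // For example, "1e2", "1.5" and "-0.0" all reach this point, whereas
+ // "9223372036854775807" returns signed_integer and "9223372036854775808"
+ // returns unsigned_integer above.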
+ return ondemand::number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
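+ // For instance, "123.456" gives p - src == 7 and p - src - 1 == 6 digits here, well
+ // below the 19-digit threshold.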
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} +} //namespace {} +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing +} // unnamed namespace +} // namespace haswell +} // namespace simdjson +/* end file include/simdjson/generic/numberparsing.h */ + +#endif // SIMDJSON_HASWELL_NUMBERPARSING_H +/* end file include/simdjson/haswell/numberparsing.h */ +/* begin file include/simdjson/haswell/end.h */ +SIMDJSON_UNTARGET_HASWELL +/* end file include/simdjson/haswell/end.h */ + +#endif // SIMDJSON_IMPLEMENTATION_HASWELL +#endif // SIMDJSON_HASWELL_COMMON_H +/* end file include/simdjson/haswell.h */ +/* begin file include/simdjson/ppc64.h */ +#ifndef SIMDJSON_PPC64_H +#define SIMDJSON_PPC64_H + + +#if SIMDJSON_IMPLEMENTATION_PPC64 + +namespace simdjson { +/** + * Implementation for ALTIVEC (PPC64). + */ +namespace ppc64 { +} // namespace ppc64 +} // namespace simdjson + +/* begin file include/simdjson/ppc64/implementation.h */ +#ifndef SIMDJSON_PPC64_IMPLEMENTATION_H +#define SIMDJSON_PPC64_IMPLEMENTATION_H + + +namespace simdjson { +namespace ppc64 { + +namespace { +using namespace simdjson; +using namespace simdjson::dom; +} // namespace + +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() + : simdjson::implementation("ppc64", "PPC64 ALTIVEC", + internal::instruction_set::ALTIVEC) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, size_t max_length, + std::unique_ptr &dst) + const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, + uint8_t *dst, + size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; +}; + +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_IMPLEMENTATION_H +/* end file include/simdjson/ppc64/implementation.h */ + +/* begin file include/simdjson/ppc64/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "ppc64" +// #define SIMDJSON_IMPLEMENTATION ppc64 +/* end file include/simdjson/ppc64/begin.h */ + +// Declarations +/* begin file include/simdjson/generic/dom_parser_implementation.h */ + +namespace simdjson { +namespace ppc64 { + +// expectation: sizeof(open_container) = 64/8. 
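+// (That is 8 bytes: two 32-bit fields, which keeps the stage-2 stack compact; the
+// static_assert below enforces it.)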
+struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace ppc64 +} // namespace simdjson + +namespace simdjson { +namespace ppc64 { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace ppc64 +} // namespace simdjson +/* end file include/simdjson/generic/dom_parser_implementation.h */ +/* begin file include/simdjson/ppc64/intrinsics.h */ +#ifndef SIMDJSON_PPC64_INTRINSICS_H +#define SIMDJSON_PPC64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. 
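+// (The header in question is altivec.h, which provides the __vector types and the
+// vec_* intrinsics used below.)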
+#include + +// These are defined by altivec.h in GCC toolchain, it is safe to undef them. +#ifdef bool +#undef bool +#endif + +#ifdef vector +#undef vector +#endif + +static_assert(sizeof(__vector unsigned char) <= simdjson::SIMDJSON_PADDING, "insufficient padding for ppc64"); + +#endif // SIMDJSON_PPC64_INTRINSICS_H +/* end file include/simdjson/ppc64/intrinsics.h */ +/* begin file include/simdjson/ppc64/bitmanipulation.h */ +#ifndef SIMDJSON_PPC64_BITMANIPULATION_H +#define SIMDJSON_PPC64_BITMANIPULATION_H + +namespace simdjson { +namespace ppc64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num - 1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline int count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores +} +#else +simdjson_inline int count_ones(uint64_t input_num) { + return __builtin_popcountll(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_BITMANIPULATION_H +/* end file include/simdjson/ppc64/bitmanipulation.h */ +/* begin file include/simdjson/ppc64/bitmask.h */ +#ifndef SIMDJSON_PPC64_BITMASK_H +#define SIMDJSON_PPC64_BITMASK_H + +namespace simdjson { +namespace ppc64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is +// encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(uint64_t bitmask) { + // You can use the version below, however gcc sometimes miscompiles + // vec_pmsum_be, it happens somewhere around between 8 and 9th version. + // The performance boost was not noticeable, falling back to a usual + // implementation. + // __vector unsigned long long all_ones = {~0ull, ~0ull}; + // __vector unsigned long long mask = {bitmask, 0}; + // // Clang and GCC return different values for pmsum for ull so cast it to one. 
+ // // Generally it is not specified by ALTIVEC ISA what is returned by + // // vec_pmsum_be. + // #if defined(__LITTLE_ENDIAN__) + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]); + // #else + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]); + // #endif + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif +/* end file include/simdjson/ppc64/bitmask.h */ +/* begin file include/simdjson/ppc64/simd.h */ +#ifndef SIMDJSON_PPC64_SIMD_H +#define SIMDJSON_PPC64_SIMD_H + +#include + +namespace simdjson { +namespace ppc64 { +namespace { +namespace simd { + +using __m128i = __vector unsigned char; + +template struct base { + __m128i value; + + // Zero constructor + simdjson_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m128i &() const { + return this->value; + } + simdjson_inline operator __m128i &() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { + return vec_or(this->value, (__m128i)other); + } + simdjson_inline Child operator&(const Child other) const { + return vec_and(this->value, (__m128i)other); + } + simdjson_inline Child operator^(const Child other) const { + return vec_xor(this->value, (__m128i)other); + } + simdjson_inline Child bit_andnot(const Child other) const { + return vec_andc(this->value, (__m128i)other); + } + simdjson_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdjson_inline Child &operator&=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdjson_inline Child &operator^=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } +}; + +// Forward-declared so they can be used by splat and friends. 
+template struct simd8; + +template > +struct base8 : base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m128i _value) : base>(_value) {} + + friend simdjson_inline Mask operator==(const simd8 lhs, const simd8 rhs) { + return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs); + } + + static const int SIZE = sizeof(base>::value); + + template + simdjson_inline simd8 prev(simd8 prev_chunk) const { + __m128i chunk = this->value; +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve(this->value); + prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk); +#endif + chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve((__m128i)chunk); +#endif + return chunk; + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdjson_inline simd8 splat(bool _value) { + return (__m128i)vec_splats((unsigned char)(-(!!_value))); + } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m128i _value) + : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) + : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { + __vector unsigned long long result; + const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; + + result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value, + (__m128i)perm_mask)); +#ifdef __LITTLE_ENDIAN__ + return static_cast(result[1]); +#else + return static_cast(result[0]); +#endif + } + simdjson_inline bool any() const { + return !vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdjson_inline simd8 operator~() const { + return this->value ^ (__m128i)splat(true); + } +}; + +template struct base8_numeric : base8 { + static simdjson_inline simd8 splat(T value) { + (void)value; + return (__m128i)vec_splats(value); + } + static simdjson_inline simd8 zero() { return splat(0); } + static simdjson_inline simd8 load(const T values[16]) { + return (__m128i)(vec_vsx_ld(0, reinterpret_cast(values))); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m128i _value) + : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[16]) const { + vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); + } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { + return (__m128i)((__m128i)this->value + (__m128i)other); + } + simdjson_inline simd8 operator-(const simd8 other) const { + return (__m128i)((__m128i)this->value - (__m128i)other); + } + simdjson_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *static_cast *>(this); + } + simdjson_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + 
simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted + // as a bitset). Passing a 0 value for mask would be equivalent to writing out + // every byte to output. Only the first 16 - count_ones(mask) bytes of the + // result are significant but 16 bytes get written. Design consideration: it + // seems like a function with the signature simd8 compress(uint32_t mask) + // would be sensible, but the AVX ISA makes this kind of approach difficult. + template + simdjson_inline void compress(uint16_t mask, L *output) const { + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + using internal::thintable_epi8; + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. +#ifdef __LITTLE_ENDIAN__ + __m128i shufmask = (__m128i)(__vector unsigned long long){ + thintable_epi8[mask1], thintable_epi8[mask2]}; +#else + __m128i shufmask = (__m128i)(__vector unsigned long long){ + thintable_epi8[mask2], thintable_epi8[mask1]}; + shufmask = (__m128i)vec_reve((__m128i)shufmask); +#endif + // we increment by 0x08 the second half of the mask + shufmask = ((__m128i)shufmask) + + ((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808}); + + // this is the version "nearly pruned" + __m128i pruned = vec_perm(this->value, this->value, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. 
+ __m128i compactmask = + vec_vsx_ld(0, reinterpret_cast(pshufb_combine_table + pop1 * 8)); + __m128i answer = vec_perm(pruned, (__m128i)vec_splats(0), compactmask); + vec_vsx_st(answer, 0, reinterpret_cast<__m128i *>(output)); + } + + template + simdjson_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; + +// Signed bytes +template <> struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, + v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Order-sensitive comparisons + simdjson_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdjson_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdjson_inline simd8 + operator>(const simd8 other) const { + return (__m128i)vec_cmpgt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdjson_inline simd8 + operator<(const simd8 other) const { + return (__m128i)vec_cmplt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } +}; + +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, 
uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Saturated math + simdjson_inline simd8 + saturating_add(const simd8 other) const { + return (__m128i)vec_adds(this->value, (__m128i)other); + } + simdjson_inline simd8 + saturating_sub(const simd8 other) const { + return (__m128i)vec_subs(this->value, (__m128i)other); + } + + // Order-specific operations + simdjson_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max(this->value, (__m128i)other); + } + simdjson_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min(this->value, (__m128i)other); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); + } + simdjson_inline simd8 + operator<=(const simd8 other) const { + return other.max_val(*this) == other; + } + simdjson_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } + simdjson_inline simd8 + operator>(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + simdjson_inline simd8 + operator<(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { + return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0))); + } + simdjson_inline simd8 bits_not_set(simd8 bits) const { + return (*this & bits).bits_not_set(); + } + simdjson_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); + } + simdjson_inline simd8 any_bits_set(simd8 bits) const { + return ~this->bits_not_set(bits); + } + simdjson_inline bool bits_not_set_anywhere() const { + return vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdjson_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { + return vec_all_eq(vec_and(this->value, (__m128i)bits), + (__m128i)vec_splats(0)); + } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { + return !bits_not_set_anywhere(bits); + } + template simdjson_inline simd8 shr() const { + return simd8( + (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); + } + template simdjson_inline simd8 shl() const { + return simd8( + (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N))); + } +}; + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "PPC64 kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdjson_inline simd8x64(const T ptr[64]) + : chunks{simd8::load(ptr), simd8::load(ptr + 16), + simd8::load(ptr + 32), simd8::load(ptr + 48)} {} + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0); + this->chunks[1].store(ptr + sizeof(simd8) * 1); + 
this->chunks[2].store(ptr + sizeof(simd8) * 2); + this->chunks[3].store(ptr + sizeof(simd8) * 3); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } + + simdjson_inline uint64_t compress(uint64_t mask, T *output) const { + this->chunks[0].compress(uint16_t(mask), output); + this->chunks[1].compress(uint16_t(mask >> 16), + output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(uint16_t(mask >> 32), + output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(uint16_t(mask >> 48), + output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + return 64 - count_ones(mask); + } + + simdjson_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r1 = this->chunks[1].to_bitmask(); + uint64_t r2 = this->chunks[2].to_bitmask(); + uint64_t r3 = this->chunks[3].to_bitmask(); + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64(this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3]) + .to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); + } +}; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_SIMD_INPUT_H +/* end file include/simdjson/ppc64/simd.h */ +/* begin file include/simdjson/generic/jsoncharutils.h */ + +namespace simdjson { +namespace ppc64 { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. 
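+// For illustration: codepoint_to_utf8(0x00E9, c) writes 0xC3 0xA9 and returns 2,
+// while codepoint_to_utf8(0x1F600, c) writes four bytes and returns 4.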
+// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r +} + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +using internal::value128; + +simdjson_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file include/simdjson/generic/jsoncharutils.h */ +/* begin file include/simdjson/generic/atomparsing.h */ +namespace simdjson { +namespace ppc64 { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. +// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. +simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. 
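+// For example, str4ncmp(src, "true") returns 0 exactly when the first four bytes of
+// src are 't','r','u','e', since equal 32-bit words XOR to zero.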
+simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file include/simdjson/generic/atomparsing.h */ +/* begin file include/simdjson/ppc64/stringparsing.h */ +#ifndef SIMDJSON_PPC64_STRINGPARSING_H +#define SIMDJSON_PPC64_STRINGPARSING_H + + +namespace simdjson { +namespace ppc64 { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. +struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote + copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { + return ((bs_bits - 1) & quote_bits) != 0; + } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { + return trailing_zeroes(quote_bits); + } + simdjson_inline int backslash_index() { + return trailing_zeroes(bs_bits); + } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote +backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), + "backslash and quote finder must process fewer than " + "SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + sizeof(v0)); + v0.store(dst); + v1.store(dst + sizeof(v0)); + + // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on + // PPC; therefore, we smash them together into a 64-byte mask and get the + // bitmask from there. 
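+  // Note: chunks 0-1 of the simd8x64 below hold the backslash comparisons
+  // and chunks 2-3 hold the quote comparisons, so after to_bitmask() the
+  // low 32 bits give the backslash positions and the high 32 bits give the
+  // quote positions for the 32 bytes just copied.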
+ uint64_t bs_and_quote = + simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#endif // SIMDJSON_PPC64_STRINGPARSING_H +/* end file include/simdjson/ppc64/stringparsing.h */ +/* begin file include/simdjson/ppc64/numberparsing.h */ +#ifndef SIMDJSON_PPC64_NUMBERPARSING_H +#define SIMDJSON_PPC64_NUMBERPARSING_H + +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif + +namespace simdjson { +namespace ppc64 { +namespace { + +// we don't have appropriate instructions, so let us use a scalar function +// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/ +static simdjson_inline uint32_t +parse_eight_digits_unrolled(const uint8_t *chars) { + uint64_t val; + std::memcpy(&val, chars, sizeof(uint64_t)); +#ifdef __BIG_ENDIAN__ +#if defined(__linux__) + val = bswap_64(val); +#elif defined(__FreeBSD__) + val = bswap64(val); +#endif +#endif + val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32); +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +/* begin file include/simdjson/generic/numberparsing.h */ +#include + +namespace simdjson { +namespace ppc64 { + +namespace ondemand { +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer /// a positive integer larger or equal to 1<<63 +}; +} + +namespace { +/// @private +namespace numberparsing { + + + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) +#endif + +namespace { +// Convert a mantissa, an exponent and a sign bit into an ieee64 double. +// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). +// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. +simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { + double d; + mantissa &= ~(1ULL << 52); + mantissa |= real_exponent << 52; + mantissa |= ((static_cast(negative)) << 63); + std::memcpy(&d, &mantissa, sizeof(d)); + return d; +} +} +// Attempts to compute i * 10^(power) exactly; and if "negative" is +// true, negate the result. +// This function will only work in some cases, when it does not work, success is +// set to false. This should work *most of the time* (like 99% of the time). +// We assume that power is in the [smallest_power, +// largest_power] interval: the caller is responsible for this check. 
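+// For example, parsing "3.14159" reaches the function below with
+// i = 314159 and power = -5; both 314159 and 10^5 are exactly representable
+// as binary64, so (when FLT_EVAL_METHOD is 0 or 1) the fast path computes
+// double(314159) / 1e5 and the division is correctly rounded.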
+simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) { + d = d / simdjson::internal::power_of_ten[-power]; + } else { + d = d * simdjson::internal::power_of_ten[power]; + } + if (negative) { + d = -d; + } + return true; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It was + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + + // The fast path has now failed, so we are failing back on the slower path. + + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). + // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). + // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. 
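+// For example, with power = 10 the expression below gives
+// ((152170 + 65536) * 10) >> 16 = 2177060 >> 16 = 33, and indeed
+// floor(log(5**10)/log(2)) + 10 = 23 + 10 = 33.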
+ // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". + // We take the most significant 64-bits, and we put them in + // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q + // to the desired approximation using one multiplication. Sometimes it does not suffice. + // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and + // then we get a better approximation to i * 5^q. In very rare cases, even that + // will not suffice, though it is seemingly very hard to find such a scenario. + // + // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat + // more complicated. + // + // There is an extra layer of complexity in that we need more than 55 bits of + // accuracy in the round-to-even scenario. + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. 
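+    // In the truncated multiplication below, the test
+    // (secondproduct.high > firstproduct.low) detects unsigned wrap-around
+    // of the 64-bit addition, i.e. a carry that must be propagated into
+    // firstproduct.high.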
+ simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { firstproduct.high++; } + // At this point, we might need to add at most one to firstproduct, but this + // can only change the value of firstproduct.high if firstproduct.low is maximal. + if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { + // This is very unlikely, but if so, we need to do much more work! + return false; + } + } + uint64_t lower = firstproduct.low; + uint64_t upper = firstproduct.high; + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + + // Here we have mantissa < (1<<54). + int64_t real_exponent = exponent - lz; + if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? + // Here have that real_exponent <= 0 so -real_exponent >= 0 + if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + d = negative ? -0.0 : 0.0; + return true; + } + // next line is safe because -real_exponent + 1 < 0 + mantissa >>= -real_exponent + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + mantissa += (mantissa & 1); // round up + mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; + d = to_double(mantissa, real_exponent, negative); + return true; + } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. + // + // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] + // times a power of two. That is, it is right between a number with binary significand + // m and another number with binary significand m+1; and it must be the case + // that it cannot be represented by a float itself. + // + // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. + // Recall that 10^q = 5^q * 2^q. + // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that + // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. + // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so + // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have + // 2^{53} x 5^{-q} < 2^{64}. + // Hence we have 5^{-q} < 2^{11}$ or q>= -4. + // + // We require lower <= 1 and not lower == 0 because we could not prove that + // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. 
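+  // The test below recognizes the exact halfway case: the bits about to be
+  // discarded are exactly one half ULP and the retained mantissa is even,
+  // so clearing the low bit cancels the round-up a few lines further down,
+  // which implements round-half-to-even.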
+ if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { + if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { + mantissa &= ~1; // flip it so that we do not round up + } + } + + mantissa += mantissa & 1; + mantissa >>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + real_exponent++; + } + mantissa &= ~(1ULL << 52); + // we have to check that real_exponent is in range, otherwise we bail out + if (simdjson_unlikely(real_exponent > 2046)) { + // We have an infinite value!!! We could actually throw an error here if we could. + return false; + } + d = to_double(mantissa, real_exponent, negative); + return true; +} + +// We call a fallback floating-point parser that might be slow. Note +// it will accept JSON numbers, but the JSON spec. is more restrictive so +// before you call parse_float_fallback, you need to have validated the input +// string with the JSON grammar. +// It will return an error (false) if the parsed number is infinite. +// The string parsing itself always succeeds. We know that there is at least +// one digit. +static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). 
+ return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { + double d; + if (parse_float_fallback(src, &d)) { + writer.append_double(d); + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + return true; +} + +simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. + // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. + + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. 
It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? -exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! + if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! + // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens + // because slow_float_parsing is a non-inlined function. If we passed our writer reference to + // it, it would force it to be stored in memory, preventing the compiler from picking it apart + // and putting into registers. i.e. if we pass it as reference, it gets slow. 
+ // This is what forces the skip_double, as well. + error_code error = slow_float_parsing(src, writer); + writer.skip_double(); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. + return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. + if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. 
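+// The code below proceeds in stages: an optional '-' sign, the integer
+// digits, then an optional fraction (parse_decimal) and an optional
+// exponent (parse_exponent); floats are finished by write_float, while
+// integers are range-checked and written via append_s64/append_u64.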
+template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. 
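+// For example, a finishing ',' (0x2C) or '}' maps to SUCCESS, while '.'
+// (0x2E) maps to INCORRECT_TYPE because the value is really a float.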
+// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. +// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. 
+ // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". 
+ // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + // We have an integer. + // If the number is negative and valid, it must be a signed integer. + if(negative) { return ondemand::number_type::signed_integer; } + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + int digit_count = int(p - src); + if(digit_count >= 19) { + const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + if((digit_count >= 20) || (memcmp(src, smaller_big_integer, 19) >= 0)) { + return ondemand::number_type::unsigned_integer; + } + } + return ondemand::number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. 
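+  // For example, "-1" and "9223372036854775807" are classified as
+  // signed_integer, "9223372036854775808" (2^63) and larger integer
+  // literals as unsigned_integer, and inputs reaching this point
+  // ("1.5", "1e3") as floating_point_number.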
+ return ondemand::number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
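+  // Note: p - src includes the '.' separator, hence the "- 1" below.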
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} +} //namespace {} +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing +} // unnamed namespace +} // namespace ppc64 +} // namespace simdjson +/* end file include/simdjson/generic/numberparsing.h */ + +#endif // SIMDJSON_PPC64_NUMBERPARSING_H +/* end file include/simdjson/ppc64/numberparsing.h */ +/* begin file include/simdjson/ppc64/end.h */ +/* end file include/simdjson/ppc64/end.h */ + +#endif // SIMDJSON_IMPLEMENTATION_PPC64 + +#endif // SIMDJSON_PPC64_H +/* end file include/simdjson/ppc64.h */ +/* begin file include/simdjson/westmere.h */ +#ifndef SIMDJSON_WESTMERE_H +#define SIMDJSON_WESTMERE_H + + +#if SIMDJSON_IMPLEMENTATION_WESTMERE + +#if SIMDJSON_CAN_ALWAYS_RUN_WESTMERE +#define SIMDJSON_TARGET_WESTMERE +#define SIMDJSON_UNTARGET_WESTMERE +#else +#define SIMDJSON_TARGET_WESTMERE SIMDJSON_TARGET_REGION("sse4.2,pclmul") +#define SIMDJSON_UNTARGET_WESTMERE SIMDJSON_UNTARGET_REGION +#endif + +namespace simdjson { +/** + * Implementation for Westmere (Intel SSE4.2). 
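+ * The "westmere" kernel only assumes SSE4.2 plus PCLMULQDQ, the feature level
+ * introduced with Intel's 2010 Westmere parts, so any x86-64 CPU exposing both
+ * features can run it.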
+ */ +namespace westmere { +} // namespace westmere +} // namespace simdjson + +// +// These two need to be included outside SIMDJSON_TARGET_WESTMERE +// +/* begin file include/simdjson/westmere/implementation.h */ +#ifndef SIMDJSON_WESTMERE_IMPLEMENTATION_H +#define SIMDJSON_WESTMERE_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE +namespace simdjson { +namespace westmere { + +namespace { +using namespace simdjson; +using namespace simdjson::dom; +} + +class implementation final : public simdjson::implementation { +public: + simdjson_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {} + simdjson_warn_unused error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; + simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; +}; + +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_IMPLEMENTATION_H +/* end file include/simdjson/westmere/implementation.h */ +/* begin file include/simdjson/westmere/intrinsics.h */ +#ifndef SIMDJSON_WESTMERE_INTRINSICS_H +#define SIMDJSON_WESTMERE_INTRINSICS_H + +#ifdef SIMDJSON_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDJSON_VISUAL_STUDIO + + +#ifdef SIMDJSON_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ +#include // for _mm_alignr_epi8 +#include // for _mm_clmulepi64_si128 +#endif + +static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for westmere"); + +#endif // SIMDJSON_WESTMERE_INTRINSICS_H +/* end file include/simdjson/westmere/intrinsics.h */ + +// +// The rest need to be inside the region +// +/* begin file include/simdjson/westmere/begin.h */ +// redefining SIMDJSON_IMPLEMENTATION to "westmere" +// #define SIMDJSON_IMPLEMENTATION westmere +SIMDJSON_TARGET_WESTMERE +/* end file include/simdjson/westmere/begin.h */ + +// Declarations +/* begin file include/simdjson/generic/dom_parser_implementation.h */ + +namespace simdjson { +namespace westmere { + +// expectation: sizeof(open_container) = 64/8. 
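+// (two 32-bit fields pack into exactly 8 bytes; the static_assert below checks this)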
+struct open_container { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct open_container + +static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits"); + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr open_containers{}; + /** Whether each open container is a [ or { */ + std::unique_ptr is_array{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + + inline dom_parser_implementation() noexcept; + inline dom_parser_implementation(dom_parser_implementation &&other) noexcept; + inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept; + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation &operator=(const dom_parser_implementation &) = delete; + + simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final; + simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final; + simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final; + simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst) const noexcept final; + inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final; + inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final; +private: + simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity); + +}; + +} // namespace westmere +} // namespace simdjson + +namespace simdjson { +namespace westmere { + +inline dom_parser_implementation::dom_parser_implementation() noexcept = default; +inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; +inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; + +// Leaving these here so they can be inlined if so desired +inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; } + // Stage 1 index output + size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!structural_indexes) { _capacity = 0; return MEMALLOC; } + structural_indexes[0] = 0; + n_structural_indexes = 0; + + _capacity = capacity; + return SUCCESS; +} + +inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + // Stage 2 stacks + open_containers.reset(new (std::nothrow) open_container[max_depth]); + is_array.reset(new (std::nothrow) bool[max_depth]); + if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; } + + _max_depth = max_depth; + return SUCCESS; +} + +} // namespace westmere +} // namespace simdjson +/* end file include/simdjson/generic/dom_parser_implementation.h */ +/* begin file include/simdjson/westmere/bitmanipulation.h */ +#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H +#define SIMDJSON_WESTMERE_BITMANIPULATION_H + +namespace simdjson { +namespace westmere { +namespace { + +// We sometimes call 
trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +SIMDJSON_NO_SANITIZE_UNDEFINED +simdjson_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDJSON_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDJSON_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdjson_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// SIMDJSON_REGULAR_VISUAL_STUDIO +} + +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO +simdjson_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdjson_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H +/* end file include/simdjson/westmere/bitmanipulation.h */ +/* begin file include/simdjson/westmere/bitmask.h */ +#ifndef SIMDJSON_WESTMERE_BITMASK_H +#define SIMDJSON_WESTMERE_BITMASK_H + +namespace simdjson { +namespace westmere { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processing supporting avx2 + // but not clmul. 
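+  // A carry-less multiplication by an all-ones operand XORs together every prefix
+  // of the input, which is exactly the cumulative XOR described above
+  // (e.g. prefix_xor(00100100) == 00011100).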
+ __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_BITMASK_H +/* end file include/simdjson/westmere/bitmask.h */ +/* begin file include/simdjson/westmere/simd.h */ +#ifndef SIMDJSON_WESTMERE_SIMD_H +#define SIMDJSON_WESTMERE_SIMD_H + + +namespace simdjson { +namespace westmere { +namespace { +namespace simd { + + template + struct base { + __m128i value; + + // Zero constructor + simdjson_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdjson_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdjson_inline operator const __m128i&() const { return this->value; } + simdjson_inline operator __m128i&() { return this->value; } + + // Bit operations + simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); } + simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); } + simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); } + simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); } + simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
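+  // (base<Child> above is a thin CRTP-style wrapper over a single __m128i; the
+  //  simd8<T> specializations below layer the element-typed operations on top.)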
+ template + struct simd8; + + template> + struct base8: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdjson_inline base8() : base>() {} + simdjson_inline base8(const __m128i _value) : base>(_value) {} + + friend simdjson_inline Mask operator==(const simd8 lhs, const simd8 rhs) { return _mm_cmpeq_epi8(lhs, rhs); } + + static const int SIZE = sizeof(base>::value); + + template + simdjson_inline simd8 prev(const simd8 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdjson_inline simd8 splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } + + simdjson_inline simd8() : base8() {} + simdjson_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + simdjson_inline simd8(bool _value) : base8(splat(_value)) {} + + simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdjson_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdjson_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } + static simdjson_inline simd8 zero() { return _mm_setzero_si128(); } + static simdjson_inline simd8 load(const T values[16]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdjson_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdjson_inline base8_numeric() : base8() {} + simdjson_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + simdjson_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdjson_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdjson_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } + simdjson_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } + simdjson_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdjson_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdjson_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm_shuffle_epi8(lookup_table, *this); + } + + // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). + // Passing a 0 value for mask would be equivalent to writing out every byte to output. + // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes + // get written. + // Design consideration: it seems like a function with the + // signature simd8 compress(uint32_t mask) would be + // sensible, but the AVX ISA makes this kind of approach difficult. 
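+    // Worked example: with mask = 0b0000000000000101, the bytes at positions 0 and 2
+    // are dropped, so the input "abcdefghijklmnop" produces "bdefghijklmnop" followed
+    // by two don't-care bytes (16 bytes are always written).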
+ template + simdjson_inline void compress(uint16_t mask, L * output) const { + using internal::thintable_epi8; + using internal::BitsSetTable256mul2; + using internal::pshufb_combine_table; + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. + __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(*this, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + __m128i compactmask = + _mm_loadu_si128(reinterpret_cast(pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); + } + + template + simdjson_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Order-sensitive comparisons + simdjson_inline simd8 max_val(const simd8 other) const { return _mm_max_epi8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm_min_epi8(*this, other); } + simdjson_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } + simdjson_inline simd8 operator<(const simd8 other) const { return _mm_cmpgt_epi8(other, 
*this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdjson_inline simd8() : base8_numeric() {} + simdjson_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdjson_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdjson_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Saturated math + simdjson_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu8(*this, other); } + simdjson_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu8(*this, other); } + + // Order-specific operations + simdjson_inline simd8 max_val(const simd8 other) const { return _mm_max_epu8(*this, other); } + simdjson_inline simd8 min_val(const simd8 other) const { return _mm_min_epu8(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdjson_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdjson_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdjson_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdjson_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdjson_inline simd8 operator<(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdjson_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdjson_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdjson_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdjson_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdjson_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + simdjson_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdjson_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdjson_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdjson_inline simd8 shr() const { return simd8(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdjson_inline simd8 shl() const { return simd8(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit + template + simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdjson_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8::load(ptr), simd8::load(ptr+16), simd8::load(ptr+32), simd8::load(ptr+48)} {} + + simdjson_inline void store(T ptr[64]) const { + this->chunks[0].store(ptr+sizeof(simd8)*0); + this->chunks[1].store(ptr+sizeof(simd8)*1); + this->chunks[2].store(ptr+sizeof(simd8)*2); + this->chunks[3].store(ptr+sizeof(simd8)*3); + } + + simdjson_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdjson_inline uint64_t compress(uint64_t mask, T * output) const { + this->chunks[0].compress(uint16_t(mask), output); + this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF)); + this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF)); + this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF)); + return 64 - count_ones(mask); + } + + simdjson_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdjson_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdjson_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdjson_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + }; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H +/* end file include/simdjson/westmere/simd.h */ +/* begin file include/simdjson/generic/jsoncharutils.h */ + +namespace simdjson { +namespace westmere { +namespace { +namespace jsoncharutils { + +// return non-zero if not a structural or whitespace char +// zero otherwise +simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace_negated[c]; +} + +simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) { + return internal::structural_or_whitespace[c]; +} + +// returns a value with the high 16 bits set if not valid +// otherwise returns the conversion of the 4 hex digits at src into the bottom +// 16 bits of 
the 32-bit return register +// +// see +// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/ +static inline uint32_t hex_to_u32_nocheck( + const uint8_t *src) { // strictly speaking, static inline is a C-ism + uint32_t v1 = internal::digit_to_val32[630 + src[0]]; + uint32_t v2 = internal::digit_to_val32[420 + src[1]]; + uint32_t v3 = internal::digit_to_val32[210 + src[2]]; + uint32_t v4 = internal::digit_to_val32[0 + src[3]]; + return v1 | v2 | v3 | v4; +} + +// given a code point cp, writes to c +// the utf-8 code, outputting the length in +// bytes, if the length is zero, the code point +// is invalid +// +// This can possibly be made faster using pdep +// and clz and table lookups, but JSON documents +// have few escaped code points, and the following +// function looks cheap. +// +// Note: we assume that surrogates are treated separately +// +simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) { + if (cp <= 0x7F) { + c[0] = uint8_t(cp); + return 1; // ascii + } + if (cp <= 0x7FF) { + c[0] = uint8_t((cp >> 6) + 192); + c[1] = uint8_t((cp & 63) + 128); + return 2; // universal plane + // Surrogates are treated elsewhere... + //} //else if (0xd800 <= cp && cp <= 0xdfff) { + // return 0; // surrogates // could put assert here + } else if (cp <= 0xFFFF) { + c[0] = uint8_t((cp >> 12) + 224); + c[1] = uint8_t(((cp >> 6) & 63) + 128); + c[2] = uint8_t((cp & 63) + 128); + return 3; + } else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this + // is not needed + c[0] = uint8_t((cp >> 18) + 240); + c[1] = uint8_t(((cp >> 12) & 63) + 128); + c[2] = uint8_t(((cp >> 6) & 63) + 128); + c[3] = uint8_t((cp & 63) + 128); + return 4; + } + // will return 0 when the code point was too large. + return 0; // bad r +} + +#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm +// this is a slow emulation routine for 32-bit +// +static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} +static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = !!(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + !!(lo < bd); + return lo; +} +#endif + +using internal::value128; + +simdjson_inline value128 full_multiplication(uint64_t value1, uint64_t value2) { + value128 answer; +#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) +#ifdef _M_ARM64 + // ARM64 has native support for 64-bit multiplications, no need to emultate + answer.high = __umulh(value1, value2); + answer.low = value1 * value2; +#else + answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64 +#endif // _M_ARM64 +#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS) + __uint128_t r = (static_cast<__uint128_t>(value1)) * value2; + answer.low = uint64_t(r); + answer.high = uint64_t(r >> 64); +#endif + return answer; +} + +} // namespace jsoncharutils +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file include/simdjson/generic/jsoncharutils.h */ +/* begin file include/simdjson/generic/atomparsing.h */ +namespace simdjson { +namespace westmere { +namespace { +/// @private +namespace atomparsing { + +// The string_to_uint32 is 
exclusively used to map literal strings to 32-bit values. +// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot +// be certain that the character pointer will be properly aligned. +// You might think that using memcpy makes this function expensive, but you'd be wrong. +// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false"); +// to the compile-time constant 1936482662. +simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; } + + +// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive. +// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about. +simdjson_warn_unused +simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) { + uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++) + static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes"); + std::memcpy(&srcval, src, sizeof(uint32_t)); + return srcval ^ string_to_uint32(atom); +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src) { + return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_true_atom(src); } + else if (len == 4) { return !str4ncmp(src, "true"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src) { + return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) { + if (len > 5) { return is_valid_false_atom(src); } + else if (len == 5) { return !str4ncmp(src+1, "alse"); } + else { return false; } +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src) { + return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0; +} + +simdjson_warn_unused +simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) { + if (len > 4) { return is_valid_null_atom(src); } + else if (len == 4) { return !str4ncmp(src, "null"); } + else { return false; } +} + +} // namespace atomparsing +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file include/simdjson/generic/atomparsing.h */ +/* begin file include/simdjson/westmere/stringparsing.h */ +#ifndef SIMDJSON_WESTMERE_STRINGPARSING_H +#define SIMDJSON_WESTMERE_STRINGPARSING_H + +namespace simdjson { +namespace westmere { +namespace { + +using namespace simd; + +// Holds backslashes and quotes locations. 
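+// (bs_bits and quote_bits are 32-bit masks over the 32 copied bytes: bit i is set
+//  when byte i is a backslash or a quote respectively, so has_quote_first() asks
+//  whether a quote shows up before the first backslash.)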
+struct backslash_and_quote { +public: + static constexpr uint32_t BYTES_PROCESSED = 32; + simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst); + + simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; } + simdjson_inline bool has_backslash() { return bs_bits != 0; } + simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); } + simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); } + + uint32_t bs_bits; + uint32_t quote_bits; +}; // struct backslash_and_quote + +simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes"); + simd8 v0(src); + simd8 v1(src + 16); + v0.store(dst); + v1.store(dst + 16); + uint64_t bs_and_quote = simd8x64(v0 == '\\', v1 == '\\', v0 == '"', v1 == '"').to_bitmask(); + return { + uint32_t(bs_and_quote), // bs_bits + uint32_t(bs_and_quote >> 32) // quote_bits + }; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_STRINGPARSING_H +/* end file include/simdjson/westmere/stringparsing.h */ +/* begin file include/simdjson/westmere/numberparsing.h */ +#ifndef SIMDJSON_WESTMERE_NUMBERPARSING_H +#define SIMDJSON_WESTMERE_NUMBERPARSING_H + +namespace simdjson { +namespace westmere { +namespace { + +static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) { + // this actually computes *16* values so we are being wasteful. + const __m128i ascii0 = _mm_set1_epi8('0'); + const __m128i mul_1_10 = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + const __m128i mul_1_10000 = + _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + const __m128i input = _mm_sub_epi8( + _mm_loadu_si128(reinterpret_cast(chars)), ascii0); + const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10); + const __m128i t2 = _mm_madd_epi16(t1, mul_1_100); + const __m128i t3 = _mm_packus_epi32(t2, t2); + const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000); + return _mm_cvtsi128_si32( + t4); // only captures the sum of the first 8 digits, drop the rest +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdjson + +#define SIMDJSON_SWAR_NUMBER_PARSING 1 + +/* begin file include/simdjson/generic/numberparsing.h */ +#include + +namespace simdjson { +namespace westmere { + +namespace ondemand { +/** + * The type of a JSON number + */ +enum class number_type { + floating_point_number=1, /// a binary64 number + signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + unsigned_integer /// a positive integer larger or equal to 1<<63 +}; +} + +namespace { +/// @private +namespace numberparsing { + + + +#ifdef JSON_TEST_NUMBERS +#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE))) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE))) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE))) +#else +#define INVALID_NUMBER(SRC) (NUMBER_ERROR) +#define WRITE_INTEGER(VALUE, SRC, 
WRITER) (WRITER).append_s64((VALUE)) +#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE)) +#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE)) +#endif + +namespace { +// Convert a mantissa, an exponent and a sign bit into an ieee64 double. +// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable). +// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed. +simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) { + double d; + mantissa &= ~(1ULL << 52); + mantissa |= real_exponent << 52; + mantissa |= ((static_cast(negative)) << 63); + std::memcpy(&d, &mantissa, sizeof(d)); + return d; +} +} +// Attempts to compute i * 10^(power) exactly; and if "negative" is +// true, negate the result. +// This function will only work in some cases, when it does not work, success is +// set to false. This should work *most of the time* (like 99% of the time). +// We assume that power is in the [smallest_power, +// largest_power] interval: the caller is responsible for this check. +simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) { + // we start with a fast path + // It was described in + // Clinger WD. How to read floating point numbers accurately. + // ACM SIGPLAN Notices. 1990 +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." +#endif +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + // We cannot be certain that x/y is rounded to nearest. + if (0 <= power && power <= 22 && i <= 9007199254740991) { +#else + if (-22 <= power && power <= 22 && i <= 9007199254740991) { +#endif + // convert the integer into a double. This is lossless since + // 0 <= i <= 2^53 - 1. + d = double(i); + // + // The general idea is as follows. + // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then + // 1) Both s and p can be represented exactly as 64-bit floating-point + // values + // (binary64). + // 2) Because s and p can be represented exactly as floating-point values, + // then s * p + // and s / p will produce correctly rounded values. + // + if (power < 0) { + d = d / simdjson::internal::power_of_ten[-power]; + } else { + d = d * simdjson::internal::power_of_ten[power]; + } + if (negative) { + d = -d; + } + return true; + } + // When 22 < power && power < 22 + 16, we could + // hope for another, secondary fast path. It was + // described by David M. Gay in "Correctly rounded + // binary-decimal and decimal-binary conversions." (1990) + // If you need to compute i * 10^(22 + x) for x < 16, + // first compute i * 10^x, if you know that result is exact + // (e.g., when i * 10^x < 2^53), + // then you can still proceed and do (i * 10^x) * 10^22. + // Is this worth your time? + // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53) + // for this second fast path to work. + // If you you have 22 < power *and* power < 22 + 16, and then you + // optimistically compute "i * 10^(x-22)", there is still a chance that you + // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of + // this optimization maybe less common than we would like. Source: + // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html + + // The fast path has now failed, so we are failing back on the slower path. 
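+  // (Slow-path outline: normalize i so its top bit is set, multiply by a precomputed
+  // 128-bit truncation of 5^power, then shift and round the top bits into an ieee64
+  // mantissa; essentially the Eisel-Lemire approach.)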
+ + // In the slow path, we need to adjust i so that it is > 1<<63 which is always + // possible, except if i == 0, so we handle i == 0 separately. + if(i == 0) { + d = negative ? -0.0 : 0.0; + return true; + } + + + // The exponent is 1024 + 63 + power + // + floor(log(5**power)/log(2)). + // The 1024 comes from the ieee64 standard. + // The 63 comes from the fact that we use a 64-bit word. + // + // Computing floor(log(5**power)/log(2)) could be + // slow. Instead we use a fast function. + // + // For power in (-400,350), we have that + // (((152170 + 65536) * power ) >> 16); + // is equal to + // floor(log(5**power)/log(2)) + power when power >= 0 + // and it is equal to + // ceil(log(5**-power)/log(2)) + power when power < 0 + // + // The 65536 is (1<<16) and corresponds to + // (65536 * power) >> 16 ---> power + // + // ((152170 * power ) >> 16) is equal to + // floor(log(5**power)/log(2)) + // + // Note that this is not magic: 152170/(1<<16) is + // approximatively equal to log(5)/log(2). + // The 1<<16 value is a power of two; we could use a + // larger power of 2 if we wanted to. + // + int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63; + + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(i); + i <<= lz; + + + // We are going to need to do some 64-bit arithmetic to get a precise product. + // We use a table lookup approach. + // It is safe because + // power >= smallest_power + // and power <= largest_power + // We recover the mantissa of the power, it has a leading 1. It is always + // rounded down. + // + // We want the most significant 64 bits of the product. We know + // this will be non-zero because the most significant bit of i is + // 1. + const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power); + // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.) + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 firstproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index]); + // Both i and power_of_five_128[index] have their most significant bit set to 1 which + // implies that the either the most or the second most significant bit of the product + // is 1. We pack values in this manner for efficiency reasons: it maximizes the use + // we make of the product. It also makes it easy to reason about the product: there + // is 0 or 1 leading zero in the product. + + // Unless the least significant 9 bits of the high (64-bit) part of the full + // product are all 1s, then we know that the most significant 55 bits are + // exact and no further work is needed. Having 55 bits is necessary because + // we need 53 bits for the mantissa but we have to have one rounding bit and + // we can waste a bit if the most significant bit of the product is zero. + if((firstproduct.high & 0x1FF) == 0x1FF) { + // We want to compute i * 5^q, but only care about the top 55 bits at most. + // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing + // the full computation is wasteful. So we do what is called a "truncated + // multiplication". 
+ // We take the most significant 64-bits, and we put them in + // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q + // to the desired approximation using one multiplication. Sometimes it does not suffice. + // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and + // then we get a better approximation to i * 5^q. In very rare cases, even that + // will not suffice, though it is seemingly very hard to find such a scenario. + // + // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat + // more complicated. + // + // There is an extra layer of complexity in that we need more than 55 bits of + // accuracy in the round-to-even scenario. + // + // The full_multiplication function computes the 128-bit product of two 64-bit words + // with a returned value of type value128 with a "low component" corresponding to the + // 64-bit least significant bits of the product and with a "high component" corresponding + // to the 64-bit most significant bits of the product. + simdjson::internal::value128 secondproduct = jsoncharutils::full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if(secondproduct.high > firstproduct.low) { firstproduct.high++; } + // At this point, we might need to add at most one to firstproduct, but this + // can only change the value of firstproduct.high if firstproduct.low is maximal. + if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { + // This is very unlikely, but if so, we need to do much more work! + return false; + } + } + uint64_t lower = firstproduct.low; + uint64_t upper = firstproduct.high; + // The final mantissa should be 53 bits with a leading 1. + // We shift it so that it occupies 54 bits with a leading 1. + /////// + uint64_t upperbit = upper >> 63; + uint64_t mantissa = upper >> (upperbit + 9); + lz += int(1 ^ upperbit); + + // Here we have mantissa < (1<<54). + int64_t real_exponent = exponent - lz; + if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal? + // Here have that real_exponent <= 0 so -real_exponent >= 0 + if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure. + d = negative ? -0.0 : 0.0; + return true; + } + // next line is safe because -real_exponent + 1 < 0 + mantissa >>= -real_exponent + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + mantissa += (mantissa & 1); // round up + mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1; + d = to_double(mantissa, real_exponent, negative); + return true; + } + // We have to round to even. The "to even" part + // is only a problem when we are right in between two floats + // which we guard against. + // If we have lots of trailing zeros, we may fall right between two + // floating-point values. 
+ // + // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54] + // times a power of two. That is, it is right between a number with binary significand + // m and another number with binary significand m+1; and it must be the case + // that it cannot be represented by a float itself. + // + // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p. + // Recall that 10^q = 5^q * 2^q. + // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that + // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23. + // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so + // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have + // 2^{53} x 5^{-q} < 2^{64}. + // Hence we have 5^{-q} < 2^{11}$ or q>= -4. + // + // We require lower <= 1 and not lower == 0 because we could not prove that + // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test. + if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) { + if((mantissa << (upperbit + 64 - 53 - 2)) == upper) { + mantissa &= ~1; // flip it so that we do not round up + } + } + + mantissa += mantissa & 1; + mantissa >>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1ULL << 53)) { + ////////// + // This will happen when parsing values such as 7.2057594037927933e+16 + //////// + mantissa = (1ULL << 52); + real_exponent++; + } + mantissa &= ~(1ULL << 52); + // we have to check that real_exponent is in range, otherwise we bail out + if (simdjson_unlikely(real_exponent > 2046)) { + // We have an infinite value!!! We could actually throw an error here if we could. + return false; + } + d = to_double(mantissa, real_exponent, negative); + return true; +} + +// We call a fallback floating-point parser that might be slow. Note +// it will accept JSON numbers, but the JSON spec. is more restrictive so +// before you call parse_float_fallback, you need to have validated the input +// string with the JSON grammar. +// It will return an error (false) if the parsed number is infinite. +// The string parsing itself always succeeds. We know that there is at least +// one digit. +static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr)); + // We do not accept infinite values. + + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} +static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) { + *outDouble = simdjson::internal::from_chars(reinterpret_cast(ptr), reinterpret_cast(end_ptr)); + // We do not accept infinite values. 
+ + // Detecting finite values in a portable manner is ridiculously hard, ideally + // we would want to do: + // return !std::isfinite(*outDouble); + // but that mysteriously fails under legacy/old libc++ libraries, see + // https://github.com/simdjson/simdjson/issues/1286 + // + // Therefore, fall back to this solution (the extra parens are there + // to handle that max may be a macro on windows). + return !(*outDouble > (std::numeric_limits::max)() || *outDouble < std::numeric_limits::lowest()); +} + +// check quickly whether the next 8 chars are made of digits +// at a glance, it looks better than Mula's +// http://0x80.pl/articles/swar-digits-validate.html +simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) { + uint64_t val; + // this can read up to 7 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7"); + std::memcpy(&val, chars, 8); + // a branchy method might be faster: + // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030) + // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) == + // 0x3030303030303030); + return (((val & 0xF0F0F0F0F0F0F0F0) | + (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) == + 0x3333333333333333); +} + +template +error_code slow_float_parsing(simdjson_unused const uint8_t * src, W writer) { + double d; + if (parse_float_fallback(src, &d)) { + writer.append_double(d); + return SUCCESS; + } + return INVALID_NUMBER(src); +} + +template +SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later +simdjson_inline bool parse_digit(const uint8_t c, I &i) { + const uint8_t digit = static_cast(c - '0'); + if (digit > 9) { + return false; + } + // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + return true; +} + +simdjson_inline error_code parse_decimal(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) { + // we continue with the fiction that we have an integer. If the + // floating point number is representable as x * 10^z for some integer + // z that fits in 53 bits, then we will be able to convert back the + // the integer into a float in a lossless manner. + const uint8_t *const first_after_period = p; + +#ifdef SIMDJSON_SWAR_NUMBER_PARSING +#if SIMDJSON_SWAR_NUMBER_PARSING + // this helps if we have lots of decimals! + // this turns out to be frequent enough. + if (is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + } +#endif // SIMDJSON_SWAR_NUMBER_PARSING +#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING + // Unrolling the first digit makes a small difference on some implementations (e.g. westmere) + if (parse_digit(*p, i)) { ++p; } + while (parse_digit(*p, i)) { p++; } + exponent = first_after_period - p; + // Decimal without digits (123.) is illegal + if (exponent == 0) { + return INVALID_NUMBER(src); + } + return SUCCESS; +} + +simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) { + // Exp Sign: -123.456e[-]78 + bool neg_exp = ('-' == *p); + if (neg_exp || '+' == *p) { p++; } // Skip + as well + + // Exponent: -123.456e-[78] + auto start_exp = p; + int64_t exp_number = 0; + while (parse_digit(*p, exp_number)) { ++p; } + // It is possible for parse_digit to overflow. 
+ // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN. + // Thus we *must* check for possible overflow before we negate exp_number. + + // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into + // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may + // not oblige and may, in fact, generate two distinct paths in any case. It might be + // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off + // instructions for a simdjson_likely branch, an unconclusive gain. + + // If there were no digits, it's an error. + if (simdjson_unlikely(p == start_exp)) { + return INVALID_NUMBER(src); + } + // We have a valid positive exponent in exp_number at this point, except that + // it may have overflowed. + + // If there were more than 18 digits, we may have overflowed the integer. We have to do + // something!!!! + if (simdjson_unlikely(p > start_exp+18)) { + // Skip leading zeroes: 1e000000000000000000001 is technically valid and doesn't overflow + while (*start_exp == '0') { start_exp++; } + // 19 digits could overflow int64_t and is kind of absurd anyway. We don't + // support exponents smaller than -999,999,999,999,999,999 and bigger + // than 999,999,999,999,999,999. + // We can truncate. + // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before + // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could + // truncate at 324. + // Note that there is no reason to fail per se at this point in time. + // E.g., 0e999999999999999999999 is a fine number. + if (p > start_exp+18) { exp_number = 999999999999999999; } + } + // At this point, we know that exp_number is a sane, positive, signed integer. + // It is <= 999,999,999,999,999,999. As long as 'exponent' is in + // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent' + // is bounded in magnitude by the size of the JSON input, we are fine in this universe. + // To sum it up: the next line should never overflow. + exponent += (neg_exp ? -exp_number : exp_number); + return SUCCESS; +} + +simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) { + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + const uint8_t *start = start_digits; + while ((*start == '0') || (*start == '.')) { ++start; } + // we over-decrement by one when there is a '.' + return digit_count - size_t(start - start_digits); +} + +template +simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) { + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon in practice. + // + // 9999999999999999999 < 2**64 so we can accommodate 19 digits. + // If we have a decimal separator, then digit_count - 1 is the number of digits, but we + // may not have a decimal separator! + if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) { + // Ok, chances are good that we had an overflow! + // this is almost never going to get called!!! + // we start anew, going slowly!!! 
+ // This will happen in the following examples: + // 10000000000000000000000000000000000000000000e+308 + // 3.1415926535897932384626433832795028841971693993751 + // + // NOTE: This makes a *copy* of the writer and passes it to slow_float_parsing. This happens + // because slow_float_parsing is a non-inlined function. If we passed our writer reference to + // it, it would force it to be stored in memory, preventing the compiler from picking it apart + // and putting into registers. i.e. if we pass it as reference, it gets slow. + // This is what forces the skip_double, as well. + error_code error = slow_float_parsing(src, writer); + writer.skip_double(); + return error; + } + // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other + // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331 + // To future reader: we'd love if someone found a better way, or at least could explain this result! + if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) { + // + // Important: smallest_power is such that it leads to a zero value. + // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero + // so something x 10^-343 goes to zero, but not so with something x 10^-342. + static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough"); + // + if((exponent < simdjson::internal::smallest_power) || (i == 0)) { + // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero + WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer); + return SUCCESS; + } else { // (exponent > largest_power) and (i != 0) + // We have, for sure, an infinite value and simdjson refuses to parse infinite values. + return INVALID_NUMBER(src); + } + } + double d; + if (!compute_float_64(exponent, i, negative, d)) { + // we are almost never going to get here. 
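+    // (Illustrative note, an assumption rather than upstream documentation:
+    // compute_float_64 typically declines only on hard-to-round inputs, such as
+    // long digit strings that land near a rounding boundary, so the slower but
+    // fully general parse_float_fallback below re-parses the text from src.)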
+ if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); } + } + WRITE_DOUBLE(d, src, writer); + return SUCCESS; +} + +// for performance analysis, it is sometimes useful to skip parsing +#ifdef SIMDJSON_SKIPNUMBERPARSING + +template +simdjson_inline error_code parse_number(const uint8_t *const, W &writer) { + writer.append_s64(0); // always write zero + return SUCCESS; // always succeeds +} + +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * const src) noexcept { return 0; } +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { return false; } +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { return ondemand::number_type::signed_integer; } +#else + +// parse the number at src +// define JSON_TEST_NUMBERS for unit testing +// +// It is assumed that the number is followed by a structural ({,},],[) character +// or a white space character. If that is not the case (e.g., when the JSON +// document is made of a single number), then it is necessary to copy the +// content and append a space before calling this function. +// +// Our objective is accurate parsing (ULP of 0) at high speed. +template +simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) { + + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); } + + // + // Handle floats if there is a . or e (or both) + // + int64_t exponent = 0; + bool is_float = false; + if ('.' == *p) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_decimal(src, p, i, exponent) ); + digit_count = int(p - start_digits); // used later to guard against overflows + } + if (('e' == *p) || ('E' == *p)) { + is_float = true; + ++p; + SIMDJSON_TRY( parse_exponent(src, p, exponent) ); + } + if (is_float) { + const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p); + SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) ); + if (dirty_end) { return INVALID_NUMBER(src); } + return SUCCESS; + } + + // The longest negative 64-bit number is 19 digits. + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. 
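+  // (Concretely: INT64_MIN is -9,223,372,036,854,775,808, i.e. 19 digits after
+  // the sign, while UINT64_MAX is 18,446,744,073,709,551,615, i.e. 20 digits.)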
+ size_t longest_digit_count = negative ? 19 : 20; + if (digit_count > longest_digit_count) { return INVALID_NUMBER(src); } + if (digit_count == longest_digit_count) { + if (negative) { + // Anything negative above INT64_MAX+1 is invalid + if (i > uint64_t(INT64_MAX)+1) { return INVALID_NUMBER(src); } + WRITE_INTEGER(~i+1, src, writer); + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); } + } + + // Write unsigned if it doesn't fit in a signed integer. + if (i > uint64_t(INT64_MAX)) { + WRITE_UNSIGNED(i, src, writer); + } else { + WRITE_INTEGER(negative ? (~i+1) : i, src, writer); + } + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); } + return SUCCESS; +} + +// Inlineable functions +namespace { + +// This table can be used to characterize the final character of an integer +// string. For JSON structural character and allowable white space characters, +// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise +// we return NUMBER_ERROR. +// Optimization note: we could easily reduce the size of the table by half (to 128) +// at the cost of an extra branch. 
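+// Usage sketch (illustrative): once the digits have been consumed, a single
+// lookup classifies the terminating byte,
+//   error_code finisher = error_code(integer_string_finisher[*p]);
+//   if (finisher != SUCCESS) { return finisher; }
+// Whitespace (tab, LF, CR, space) and the structural bytes , : [ ] { } map to
+// SUCCESS; '.', 'e' and 'E' map to INCORRECT_TYPE; every other byte maps to
+// NUMBER_ERROR.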
+// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits): +static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast"); +static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast"); + +const uint8_t integer_string_finisher[256] = { + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, + SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, 
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, + NUMBER_ERROR}; + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + + +// Parse any number from 0 to 18,446,744,073,709,551,615 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept { + const uint8_t *p = src; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. 
+ size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. + // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from 0 to 18,446,744,073,709,551,615 +simdjson_unused simdjson_inline simdjson_result parse_unsigned_in_string(const uint8_t * const src) noexcept { + const uint8_t *p = src + 1; + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // The longest positive 64-bit number is 20 digits. + // We do it this way so we don't trigger this branch unless we must. + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > 20)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if (*p != '"') { return NUMBER_ERROR; } + + if (digit_count == 20) { + // Positive overflow check: + // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the + // biggest uint64_t. + // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX. + // If we got here, it's a 20 digit number starting with the digit "1". + // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller + // than 1,553,255,926,290,448,384. 
+ // - That is smaller than the smallest possible 20-digit number the user could write: + // 10,000,000,000,000,000,000. + // - Therefore, if the number is positive and lower than that, it's overflow. + // - The value we are looking at is less than or equal to INT64_MAX. + // + // Note: we use src[1] and not src[0] because src[0] is the quote character in this + // instance. + if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; } + } + + return i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while (parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept { + // + // Check for minus sign + // + if(src == src_end) { return NUMBER_ERROR; } + bool negative = (*src == '-'); + const uint8_t *p = src + uint8_t(negative); + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = p; + uint64_t i = 0; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(p - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. 
+ size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*p)) { + // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 +simdjson_unused simdjson_inline simdjson_result parse_integer_in_string(const uint8_t *src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare + const uint8_t *const start_digits = src; + uint64_t i = 0; + while (parse_digit(*src, i)) { src++; } + + // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error. + // Optimization note: size_t is expected to be unsigned. + size_t digit_count = size_t(src - start_digits); + // We go from + // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + // so we can never represent numbers that have more than 19 digits. + size_t longest_digit_count = 19; + // Optimization note: the compiler can probably merge + // ((digit_count == 0) || (digit_count > longest_digit_count)) + // into a single branch since digit_count is unsigned. + if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; } + // Here digit_count > 0. + if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; } + // We can do the following... + // if (!jsoncharutils::is_structural_or_whitespace(*src)) { + // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR; + // } + // as a single table lookup: + if(*src != '"') { return NUMBER_ERROR; } + // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX. + // Performance note: This check is only needed when digit_count == longest_digit_count but it is + // so cheap that we might as well always make it. + if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; } + return negative ? (~i+1) : i; +} + +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. 
+ // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { + return (*src == '-'); +} + +simdjson_unused simdjson_inline simdjson_result is_integer(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; } + return false; +} + +simdjson_unused simdjson_inline simdjson_result get_number_type(const uint8_t * src) noexcept { + bool negative = (*src == '-'); + src += uint8_t(negative); + const uint8_t *p = src; + while(static_cast(*p - '0') <= 9) { p++; } + if ( p == src ) { return NUMBER_ERROR; } + if (jsoncharutils::is_structural_or_whitespace(*p)) { + // We have an integer. + // If the number is negative and valid, it must be a signed integer. + if(negative) { return ondemand::number_type::signed_integer; } + // We want values larger or equal to 9223372036854775808 to be unsigned + // integers, and the other values to be signed integers. + int digit_count = int(p - src); + if(digit_count >= 19) { + const uint8_t * smaller_big_integer = reinterpret_cast("9223372036854775808"); + if((digit_count >= 20) || (memcmp(src, smaller_big_integer, 19) >= 0)) { + return ondemand::number_type::unsigned_integer; + } + } + return ondemand::number_type::signed_integer; + } + // Hopefully, we have 'e' or 'E' or '.'. 
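+  // Classification sketch (illustrative examples, not exhaustive):
+  //   get_number_type((const uint8_t *)"-1 ")                  -> signed_integer
+  //   get_number_type((const uint8_t *)"9223372036854775807 ") -> signed_integer
+  //   get_number_type((const uint8_t *)"9223372036854775808 ") -> unsigned_integer
+  //   get_number_type((const uint8_t *)"1e2 ")                 -> floating_point_number
+  // (each literal carries a trailing space so the terminating byte is whitespace)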
+ return ondemand::number_type::floating_point_number; +} + +// Never read at src_end or beyond +simdjson_unused simdjson_inline simdjson_result parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept { + if(src == src_end) { return NUMBER_ERROR; } + // + // Check for minus sign + // + bool negative = (*src == '-'); + src += uint8_t(negative); + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + if(p == src_end) { return NUMBER_ERROR; } + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while ((p != src_end) && parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely((p != src_end) && (*p == '.'))) { + p++; + const uint8_t *start_decimal_digits = p; + if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while ((p != src_end) && parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. + overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if ((p != src_end) && (*p == 'e' || *p == 'E')) { + p++; + if(p == src_end) { return NUMBER_ERROR; } + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while ((p != src_end) && parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) { + return NUMBER_ERROR; + } + return d; +} + +simdjson_unused simdjson_inline simdjson_result parse_double_in_string(const uint8_t * src) noexcept { + // + // Check for minus sign + // + bool negative = (*(src + 1) == '-'); + src += uint8_t(negative) + 1; + + // + // Parse the integer part. + // + uint64_t i = 0; + const uint8_t *p = src; + p += parse_digit(*p, i); + bool leading_zero = (i == 0); + while (parse_digit(*p, i)) { p++; } + // no integer digits, or 0123 (zero must be solo) + if ( p == src ) { return INCORRECT_TYPE; } + if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; } + + // + // Parse the decimal part. + // + int64_t exponent = 0; + bool overflow; + if (simdjson_likely(*p == '.')) { + p++; + const uint8_t *start_decimal_digits = p; + if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits + p++; + while (parse_digit(*p, i)) { p++; } + exponent = -(p - start_decimal_digits); + + // Overflow check. More than 19 digits (minus the decimal) may be overflow. 
+ overflow = p-src-1 > 19; + if (simdjson_unlikely(overflow && leading_zero)) { + // Skip leading 0.00000 and see if it still overflows + const uint8_t *start_digits = src + 2; + while (*start_digits == '0') { start_digits++; } + overflow = start_digits-src > 19; + } + } else { + overflow = p-src > 19; + } + + // + // Parse the exponent + // + if (*p == 'e' || *p == 'E') { + p++; + bool exp_neg = *p == '-'; + p += exp_neg || *p == '+'; + + uint64_t exp = 0; + const uint8_t *start_exp_digits = p; + while (parse_digit(*p, exp)) { p++; } + // no exp digits, or 20+ exp digits + if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; } + + exponent += exp_neg ? 0-exp : exp; + } + + if (*p != '"') { return NUMBER_ERROR; } + + overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power; + + // + // Assemble (or slow-parse) the float + // + double d; + if (simdjson_likely(!overflow)) { + if (compute_float_64(exponent, i, negative, d)) { return d; } + } + if (!parse_float_fallback(src - uint8_t(negative), &d)) { + return NUMBER_ERROR; + } + return d; +} +} //namespace {} +#endif // SIMDJSON_SKIPNUMBERPARSING + +} // namespace numberparsing +} // unnamed namespace +} // namespace westmere +} // namespace simdjson +/* end file include/simdjson/generic/numberparsing.h */ + +#endif // SIMDJSON_WESTMERE_NUMBERPARSING_H +/* end file include/simdjson/westmere/numberparsing.h */ +/* begin file include/simdjson/westmere/end.h */ +SIMDJSON_UNTARGET_WESTMERE +/* end file include/simdjson/westmere/end.h */ + +#endif // SIMDJSON_IMPLEMENTATION_WESTMERE +#endif // SIMDJSON_WESTMERE_COMMON_H +/* end file include/simdjson/westmere.h */ + +// Builtin implementation + +SIMDJSON_POP_DISABLE_WARNINGS + +#endif // SIMDJSON_IMPLEMENTATIONS_H +/* end file include/simdjson/implementations.h */ + +// Determine the best builtin implementation +#ifndef SIMDJSON_BUILTIN_IMPLEMENTATION +#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE +#define SIMDJSON_BUILTIN_IMPLEMENTATION icelake +#elif SIMDJSON_CAN_ALWAYS_RUN_HASWELL +#define SIMDJSON_BUILTIN_IMPLEMENTATION haswell +#elif SIMDJSON_CAN_ALWAYS_RUN_WESTMERE +#define SIMDJSON_BUILTIN_IMPLEMENTATION westmere +#elif SIMDJSON_CAN_ALWAYS_RUN_ARM64 +#define SIMDJSON_BUILTIN_IMPLEMENTATION arm64 +#elif SIMDJSON_CAN_ALWAYS_RUN_PPC64 +#define SIMDJSON_BUILTIN_IMPLEMENTATION ppc64 +#elif SIMDJSON_CAN_ALWAYS_RUN_FALLBACK +#define SIMDJSON_BUILTIN_IMPLEMENTATION fallback +#else +#error "All possible implementations (including fallback) have been disabled! simdjson will not run." +#endif +#endif // SIMDJSON_BUILTIN_IMPLEMENTATION + +// redefining SIMDJSON_IMPLEMENTATION to "SIMDJSON_BUILTIN_IMPLEMENTATION" +// #define SIMDJSON_IMPLEMENTATION SIMDJSON_BUILTIN_IMPLEMENTATION + +// ondemand is only compiled as part of the builtin implementation at present + +// Interface declarations +/* begin file include/simdjson/generic/implementation_simdjson_result_base.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { + +// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair +// so we can avoid inlining errors +// TODO reconcile these! +/** + * The result of a simdjson operation that could fail. + * + * Gives the option of reading error codes, or throwing an exception by casting to the desired result. + * + * This is a base class for implementations that want to add functions to the result type for + * chaining. 
+ * + * Override like: + * + * struct simdjson_result : public internal::implementation_simdjson_result_base { + * simdjson_result() noexcept : internal::implementation_simdjson_result_base() {} + * simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base(error) {} + * simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base(std::forward(value)) {} + * simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base(value, error) {} + * // Your extra methods here + * } + * + * Then any method returning simdjson_result will be chainable with your methods. + */ +template +struct implementation_simdjson_result_base { + + /** + * Create a new empty result with error = UNINITIALIZED. + */ + simdjson_inline implementation_simdjson_result_base() noexcept = default; + + /** + * Create a new error result. + */ + simdjson_inline implementation_simdjson_result_base(error_code error) noexcept; + + /** + * Create a new successful result. + */ + simdjson_inline implementation_simdjson_result_base(T &&value) noexcept; + + /** + * Create a new result with both things (use if you don't want to branch when creating the result). + */ + simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept; + + /** + * Move the value and the error to the provided variables. + * + * @param value The variable to assign the value to. May not be set if there is an error. + * @param error The variable to assign the error to. Set to SUCCESS if there is no error. + */ + simdjson_inline void tie(T &value, error_code &error) && noexcept; + + /** + * Move the value to the provided variable. + * + * @param value The variable to assign the value to. May not be set if there is an error. + */ + simdjson_inline error_code get(T &value) && noexcept; + + /** + * The error. + */ + simdjson_inline error_code error() const noexcept; + +#if SIMDJSON_EXCEPTIONS + + /** + * Get the result value. + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T& value() & noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& value() && noexcept(false); + + /** + * Take the result value (move it). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline T&& take_value() && noexcept(false); + + /** + * Cast to the value (will throw on error). + * + * @throw simdjson_error if there was an error. + */ + simdjson_inline operator T&&() && noexcept(false); + + +#endif // SIMDJSON_EXCEPTIONS + + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline const T& value_unsafe() const& noexcept; + /** + * Get the result value. This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline T& value_unsafe() & noexcept; + /** + * Take the result value (move it). This function is safe if and only + * the error() method returns a value that evaluates to false. + */ + simdjson_inline T&& value_unsafe() && noexcept; +protected: + /** users should never directly access first and second. **/ + T first{}; /** Users should never directly access 'first'. **/ + error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. 
**/ +}; // struct implementation_simdjson_result_base + +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson +/* end file include/simdjson/generic/implementation_simdjson_result_base.h */ +/* begin file include/simdjson/generic/ondemand.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +/** + * A fast, simple, DOM-like interface that parses JSON as you use it. + * + * Designed for maximum speed and a lower memory profile. + */ +namespace ondemand { + +/** Represents the depth of a JSON value (number of nested arrays/objects). */ +using depth_t = int32_t; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +/* begin file include/simdjson/generic/ondemand/json_type.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { +/** + * The type of a JSON value. + */ +enum class json_type { + // Start at 1 to catch uninitialized / default values more easily + array=1, ///< A JSON array ( [ 1, 2, 3 ... ] ) + object, ///< A JSON object ( { "a": 1, "b" 2, ... } ) + number, ///< A JSON number ( 1 or -2.3 or 4.5e6 ...) + string, ///< A JSON string ( "a" or "hello world\n" ...) + boolean, ///< A JSON boolean (true or false) + null ///< A JSON null (null) +}; + +class value_iterator; + +/** + * A type representing a JSON number. + * The design of the struct is deliberately straight-forward. All + * functions return standard values with no error check. + */ +struct number { + + /** + * return the automatically determined type of + * the number: number_type::floating_point_number, + * number_type::signed_integer or number_type::unsigned_integer. + * + * enum class number_type { + * floating_point_number=1, /// a binary64 number + * signed_integer, /// a signed integer that fits in a 64-bit word using two's complement + * unsigned_integer /// a positive integer larger or equal to 1<<63 + * }; + */ + simdjson_inline number_type get_number_type() const noexcept; + /** + * return true if the automatically determined type of + * the number is number_type::unsigned_integer. + */ + simdjson_inline bool is_uint64() const noexcept; + /** + * return the value as a uint64_t, only valid if is_uint64() is true. + */ + simdjson_inline uint64_t get_uint64() const noexcept; + simdjson_inline operator uint64_t() const noexcept; + + /** + * return true if the automatically determined type of + * the number is number_type::signed_integer. + */ + simdjson_inline bool is_int64() const noexcept; + /** + * return the value as a int64_t, only valid if is_int64() is true. + */ + simdjson_inline int64_t get_int64() const noexcept; + simdjson_inline operator int64_t() const noexcept; + + + /** + * return true if the automatically determined type of + * the number is number_type::floating_point_number. + */ + simdjson_inline bool is_double() const noexcept; + /** + * return the value as a double, only valid if is_double() is true. + */ + simdjson_inline double get_double() const noexcept; + simdjson_inline operator double() const noexcept; + + /** + * Convert the number to a double. Though it always succeed, the conversion + * may be lossy if the number cannot be represented exactly. + */ + simdjson_inline double as_double() const noexcept; + + +protected: + /** + * The next block of declaration is designed so that we can call the number parsing + * functions on a number type. They are protected and should never be used outside + * of the core simdjson library. 
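+   * (As a rough guide: they mirror the writer interface that parse_number and
+   * write_float drive, i.e. append_s64, append_u64, append_double and
+   * skip_double, declared just below.)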
+   */
+  friend class value_iterator;
+  template<typename W>
+  friend error_code numberparsing::write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer);
+  template<typename W>
+  friend error_code numberparsing::parse_number(const uint8_t *const src, W &writer);
+  template<typename W>
+  friend error_code numberparsing::slow_float_parsing(simdjson_unused const uint8_t * src, W writer);
+  /** Store a signed 64-bit value to the number. */
+  simdjson_inline void append_s64(int64_t value) noexcept;
+  /** Store an unsigned 64-bit value to the number. */
+  simdjson_inline void append_u64(uint64_t value) noexcept;
+  /** Store a double value to the number. */
+  simdjson_inline void append_double(double value) noexcept;
+  /** Specifies that the value is a double, but leave it undefined. */
+  simdjson_inline void skip_double() noexcept;
+  /**
+   * End of friend declarations.
+   */
+
+  /**
+   * Our attributes are a union type (size = 64 bits)
+   * followed by a type indicator.
+   */
+  union {
+    double floating_point_number;
+    int64_t signed_integer;
+    uint64_t unsigned_integer;
+  } payload{0};
+  number_type type{number_type::signed_integer};
+};
+
+/**
+ * Write the JSON type to the output stream
+ *
+ * @param out The output stream.
+ * @param type The json_type.
+ */
+inline std::ostream& operator<<(std::ostream& out, json_type type) noexcept;
+inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept;
+
+#if SIMDJSON_EXCEPTIONS
+/**
+ * Send JSON type to an output stream.
+ *
+ * @param out The output stream.
+ * @param type The json_type.
+ * @throw simdjson_error if the result being printed has an error. If there is an error with the
+ *        underlying output stream, that error will be propagated (simdjson_error will not be
+ *        thrown).
+ */ +inline std::ostream& operator<<(std::ostream& out, simdjson_result &type) noexcept(false); +#endif + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_type &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline ~simdjson_result() noexcept = default; ///< @private +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/json_type.h */ +/* begin file include/simdjson/generic/ondemand/token_position.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +/** @private Position in the JSON buffer indexes */ +using token_position = const uint32_t *; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/token_position.h */ +/* begin file include/simdjson/generic/ondemand/logger.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class json_iterator; +class value_iterator; + +namespace logger { + +#if SIMDJSON_VERBOSE_LOGGING + static constexpr const bool LOG_ENABLED = true; +#else + static constexpr const bool LOG_ENABLED = false; +#endif + +// We do not want these functions to be 'really inlined' since real inlining is +// for performance purposes and if you are using the loggers, you do not care about +// performance (or should not). +static inline void log_headers() noexcept; +static inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept; +static inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept; +static inline void log_event(const json_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; +static inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; +static inline void log_value(const json_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int depth_delta=0) noexcept; +static inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail="") noexcept; +static inline void log_start_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_end_value(const json_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail="") noexcept; +static inline void log_error(const json_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; + +static inline void log_event(const value_iterator &iter, const char *type, std::string_view detail="", int delta=0, int depth_delta=0) noexcept; +static inline void log_value(const value_iterator &iter, const char *type, std::string_view detail="", int delta=-1, int 
depth_delta=0) noexcept; +static inline void log_start_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_end_value(const value_iterator &iter, const char *type, int delta=-1, int depth_delta=0) noexcept; +static inline void log_error(const value_iterator &iter, const char *error, const char *detail="", int delta=-1, int depth_delta=0) noexcept; + +} // namespace logger +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/logger.h */ +/* begin file include/simdjson/generic/ondemand/raw_json_string.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class object; +class parser; +class json_iterator; + +/** + * A string escaped per JSON rules, terminated with quote ("). They are used to represent + * unescaped keys inside JSON documents. + * + * (In other words, a pointer to the beginning of a string, just after the start quote, inside a + * JSON file.) + * + * This class is deliberately simplistic and has little functionality. You can + * compare a raw_json_string instance with an unescaped C string, but + * that is nearly all you can do. + * + * The raw_json_string is unescaped. If you wish to write an unescaped version of it to your own + * buffer, you may do so using the parser.unescape(string, buff) method, using an ondemand::parser + * instance. Doing so requires you to have a sufficiently large buffer. + * + * The raw_json_string instances originate typically from field instance which in turn represent + * key-value pairs from object instances. From a field instance, you get the raw_json_string + * instance by calling key(). You can, if you want a more usable string_view instance, call + * the unescaped_key() method on the field instance. You may also create a raw_json_string from + * any other string value, with the value.get_raw_json_string() method. Again, you can get + * a more usable string_view instance by calling get_string(). + * + */ +class raw_json_string { +public: + /** + * Create a new invalid raw_json_string. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline raw_json_string() noexcept = default; + + /** + * Create a new invalid raw_json_string pointed at the given location in the JSON. + * + * The given location must be just *after* the beginning quote (") in the JSON file. + * + * It *must* be terminated by a ", and be a valid JSON string. + */ + simdjson_inline raw_json_string(const uint8_t * _buf) noexcept; + /** + * Get the raw pointer to the beginning of the string in the JSON (just after the "). + * + * It is possible for this function to return a null pointer if the instance + * has outlived its existence. + */ + simdjson_inline const char * raw() const noexcept; + + /** + * This compares the current instance to the std::string_view target: returns true if + * they are byte-by-byte equal (no escaping is done) on target.size() characters, + * and if the raw_json_string instance has a quote character at byte index target.size(). + * We never read more than length + 1 bytes in the raw_json_string instance. + * If length is smaller than target.size(), this will return false. + * + * The std::string_view instance may contain any characters. However, the caller + * is responsible for setting length so that length bytes may be read in the + * raw_json_string. 
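+   *
+   * For example (an illustrative sketch): given the JSON text "name":1, a
+   * raw_json_string positioned just after the opening quote satisfies
+   * unsafe_is_equal(4, "name"), because the four bytes match and the byte at
+   * index 4 is the closing quote.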
+ * + * Performance: the comparison may be done using memcmp which may be efficient + * for long strings. + */ + simdjson_inline bool unsafe_is_equal(size_t length, std::string_view target) const noexcept; + + /** + * This compares the current instance to the std::string_view target: returns true if + * they are byte-by-byte equal (no escaping is done). + * The std::string_view instance should not contain unescaped quote characters: + * the caller is responsible for this check. See is_free_from_unescaped_quote. + * + * Performance: the comparison is done byte-by-byte which might be inefficient for + * long strings. + * + * If target is a compile-time constant, and your compiler likes you, + * you should be able to do the following without performance penalty... + * + * static_assert(raw_json_string::is_free_from_unescaped_quote(target), ""); + * s.unsafe_is_equal(target); + */ + simdjson_inline bool unsafe_is_equal(std::string_view target) const noexcept; + + /** + * This compares the current instance to the C string target: returns true if + * they are byte-by-byte equal (no escaping is done). + * The provided C string should not contain an unescaped quote character: + * the caller is responsible for this check. See is_free_from_unescaped_quote. + * + * If target is a compile-time constant, and your compiler likes you, + * you should be able to do the following without performance penalty... + * + * static_assert(raw_json_string::is_free_from_unescaped_quote(target), ""); + * s.unsafe_is_equal(target); + */ + simdjson_inline bool unsafe_is_equal(const char* target) const noexcept; + + /** + * This compares the current instance to the std::string_view target: returns true if + * they are byte-by-byte equal (no escaping is done). + */ + simdjson_inline bool is_equal(std::string_view target) const noexcept; + + /** + * This compares the current instance to the C string target: returns true if + * they are byte-by-byte equal (no escaping is done). + */ + simdjson_inline bool is_equal(const char* target) const noexcept; + + /** + * Returns true if target is free from unescaped quote. If target is known at + * compile-time, we might expect the computation to happen at compile time with + * many compilers (not all!). + */ + static simdjson_inline bool is_free_from_unescaped_quote(std::string_view target) noexcept; + static simdjson_inline bool is_free_from_unescaped_quote(const char* target) noexcept; + +private: + + + /** + * This will set the inner pointer to zero, effectively making + * this instance unusable. + */ + simdjson_inline void consume() noexcept { buf = nullptr; } + + /** + * Checks whether the inner pointer is non-null and thus usable. + */ + simdjson_inline simdjson_warn_unused bool alive() const noexcept { return buf != nullptr; } + + /** + * Unescape this JSON string, replacing \\ with \, \n with newline, etc. + * + * ## IMPORTANT: string_view lifetime + * + * The string_view is only valid until the next parse() call on the parser. + * + * @param iter A json_iterator, which contains a buffer where the string will be written. 
+ */ + simdjson_inline simdjson_warn_unused simdjson_result unescape(json_iterator &iter) const noexcept; + + const uint8_t * buf{}; + friend class object; + friend class field; + friend class parser; + friend struct simdjson_result; +}; + +simdjson_unused simdjson_inline std::ostream &operator<<(std::ostream &, const raw_json_string &) noexcept; + +/** + * Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user is responsible + * for providing a string with no unescaped quote. Note that unescaped quotes cannot be present in valid JSON strings. + */ +simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept; +simdjson_unused simdjson_inline bool operator==(std::string_view c, const raw_json_string &a) noexcept; +simdjson_unused simdjson_inline bool operator!=(const raw_json_string &a, std::string_view c) noexcept; +simdjson_unused simdjson_inline bool operator!=(std::string_view c, const raw_json_string &a) noexcept; + + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline ~simdjson_result() noexcept = default; ///< @private + + simdjson_inline simdjson_result raw() const noexcept; + simdjson_inline simdjson_warn_unused simdjson_result unescape(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_iterator &iter) const noexcept; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/raw_json_string.h */ +/* begin file include/simdjson/generic/ondemand/token_iterator.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +/** + * Iterates through JSON tokens (`{` `}` `[` `]` `,` `:` `""` `123` `true` `false` `null`) + * detected by stage 1. + * + * @private This is not intended for external use. + */ +class token_iterator { +public: + /** + * Create a new invalid token_iterator. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline token_iterator() noexcept = default; + simdjson_inline token_iterator(token_iterator &&other) noexcept = default; + simdjson_inline token_iterator &operator=(token_iterator &&other) noexcept = default; + simdjson_inline token_iterator(const token_iterator &other) noexcept = default; + simdjson_inline token_iterator &operator=(const token_iterator &other) noexcept = default; + + /** + * Advance to the next token (returning the current one). + */ + simdjson_inline const uint8_t *return_current_and_advance() noexcept; + /** + * Reports the current offset in bytes from the start of the underlying buffer. + */ + simdjson_inline uint32_t current_offset() const noexcept; + /** + * Get the JSON text for a given token (relative). + * + * This is not null-terminated; it is a view into the JSON. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = current token, + * 1 = next token, -1 = prev token. + * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it isn't used ... 
+ */ + simdjson_inline const uint8_t *peek(int32_t delta=0) const noexcept; + /** + * Get the maximum length of the JSON text for a given token. + * + * The length will include any whitespace at the end of the token. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = current token, + * 1 = next token, -1 = prev token. + */ + simdjson_inline uint32_t peek_length(int32_t delta=0) const noexcept; + + /** + * Get the JSON text for a given token. + * + * This is not null-terminated; it is a view into the JSON. + * + * @param position The position of the token. + * + */ + simdjson_inline const uint8_t *peek(token_position position) const noexcept; + /** + * Get the maximum length of the JSON text for a given token. + * + * The length will include any whitespace at the end of the token. + * + * @param position The position of the token. + */ + simdjson_inline uint32_t peek_length(token_position position) const noexcept; + + /** + * Return the current index. + */ + simdjson_inline token_position position() const noexcept; + /** + * Reset to a previously saved index. + */ + simdjson_inline void set_position(token_position target_position) noexcept; + + // NOTE: we don't support a full C++ iterator interface, because we expect people to make + // different calls to advance the iterator based on *their own* state. + + simdjson_inline bool operator==(const token_iterator &other) const noexcept; + simdjson_inline bool operator!=(const token_iterator &other) const noexcept; + simdjson_inline bool operator>(const token_iterator &other) const noexcept; + simdjson_inline bool operator>=(const token_iterator &other) const noexcept; + simdjson_inline bool operator<(const token_iterator &other) const noexcept; + simdjson_inline bool operator<=(const token_iterator &other) const noexcept; + +protected: + simdjson_inline token_iterator(const uint8_t *buf, token_position position) noexcept; + + /** + * Get the index of the JSON text for a given token (relative). + * + * This is not null-terminated; it is a view into the JSON. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = current token, + * 1 = next token, -1 = prev token. + */ + simdjson_inline uint32_t peek_index(int32_t delta=0) const noexcept; + /** + * Get the index of the JSON text for a given token. + * + * This is not null-terminated; it is a view into the JSON. + * + * @param position The position of the token. 
+ * + */ + simdjson_inline uint32_t peek_index(token_position position) const noexcept; + + const uint8_t *buf{}; + token_position _position{}; + + friend class json_iterator; + friend class value_iterator; + friend class object; + friend simdjson_inline void logger::log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept; + friend simdjson_inline void logger::log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::token_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline ~simdjson_result() noexcept = default; ///< @private +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/token_iterator.h */ +/* begin file include/simdjson/generic/ondemand/json_iterator.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class document; +class document_stream; +class object; +class array; +class value; +class raw_json_string; +class parser; + +/** + * Iterates through JSON tokens, keeping track of depth and string buffer. + * + * @private This is not intended for external use. + */ +class json_iterator { +protected: + token_iterator token{}; + ondemand::parser *parser{}; + /** + * Next free location in the string buffer. + * + * Used by raw_json_string::unescape() to have a place to unescape strings to. + */ + uint8_t *_string_buf_loc{}; + /** + * JSON error, if there is one. + * + * INCORRECT_TYPE and NO_SUCH_FIELD are *not* stored here, ever. + * + * PERF NOTE: we *hope* this will be elided into control flow, as it is only used (a) in the first + * iteration of the loop, or (b) for the final iteration after a missing comma is found in ++. If + * this is not elided, we should make sure it's at least not using up a register. Failing that, + * we should store it in document so there's only one of them. + */ + error_code error{SUCCESS}; + /** + * Depth of the current token in the JSON. + * + * - 0 = finished with document + * - 1 = document root value (could be [ or {, not yet known) + * - 2 = , or } inside root array/object + * - 3 = key or value inside root array/object. + */ + depth_t _depth{}; + /** + * Beginning of the document indexes. + * Normally we have root == parser->implementation->structural_indexes.get() + * but this may differ, especially in streaming mode (where we have several + * documents); + */ + token_position _root{}; + /** + * Normally, a json_iterator operates over a single document, but in + * some cases, we may have a stream of documents. This attribute is meant + * as meta-data: the json_iterator works the same irrespective of the + * value of this attribute. 
+ */ + bool _streaming{false}; + +public: + simdjson_inline json_iterator() noexcept = default; + simdjson_inline json_iterator(json_iterator &&other) noexcept; + simdjson_inline json_iterator &operator=(json_iterator &&other) noexcept; + simdjson_inline explicit json_iterator(const json_iterator &other) noexcept = default; + simdjson_inline json_iterator &operator=(const json_iterator &other) noexcept = default; + /** + * Skips a JSON value, whether it is a scalar, array or object. + */ + simdjson_warn_unused simdjson_inline error_code skip_child(depth_t parent_depth) noexcept; + + /** + * Tell whether the iterator is still at the start + */ + simdjson_inline bool at_root() const noexcept; + + /** + * Tell whether we should be expected to run in streaming + * mode (iterating over many documents). It is pure metadata + * that does not affect how the iterator works. It is used by + * start_root_array() and start_root_object(). + */ + simdjson_inline bool streaming() const noexcept; + + /** + * Get the root value iterator + */ + simdjson_inline token_position root_position() const noexcept; + /** + * Assert that we are at the document depth (== 1) + */ + simdjson_inline void assert_at_document_depth() const noexcept; + /** + * Assert that we are at the root of the document + */ + simdjson_inline void assert_at_root() const noexcept; + + /** + * Tell whether the iterator is at the EOF mark + */ + simdjson_inline bool at_end() const noexcept; + + /** + * Tell whether the iterator is live (has not been moved). + */ + simdjson_inline bool is_alive() const noexcept; + + /** + * Abandon this iterator, setting depth to 0 (as if the document is finished). + */ + simdjson_inline void abandon() noexcept; + + /** + * Advance the current token without modifying depth. + */ + simdjson_inline const uint8_t *return_current_and_advance() noexcept; + + /** + * Returns true if there is a single token in the index (i.e., it is + * a JSON with a scalar value such as a single number). + * + * @return whether there is a single token + */ + simdjson_inline bool is_single_token() const noexcept; + + /** + * Assert that there are at least the given number of tokens left. + * + * Has no effect in release builds. + */ + simdjson_inline void assert_more_tokens(uint32_t required_tokens=1) const noexcept; + /** + * Assert that the given position addresses an actual token (is within bounds). + * + * Has no effect in release builds. + */ + simdjson_inline void assert_valid_position(token_position position) const noexcept; + /** + * Get the JSON text for a given token (relative). + * + * This is not null-terminated; it is a view into the JSON. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = next token, -1 = prev token. + * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it isn't used ... + */ + simdjson_inline const uint8_t *peek(int32_t delta=0) const noexcept; + /** + * Get the maximum length of the JSON text for the current token (or relative). + * + * The length will include any whitespace at the end of the token. + * + * @param delta The relative position of the token to retrieve. e.g. 0 = next token, -1 = prev token. + */ + simdjson_inline uint32_t peek_length(int32_t delta=0) const noexcept; + /** + * Get a pointer to the current location in the input buffer. + * + * This is not null-terminated; it is a view into the JSON. + * + * You may be pointing outside of the input buffer: it is not generally + * safe to dereference this pointer. 
+ */ + simdjson_inline const uint8_t *unsafe_pointer() const noexcept; + /** + * Get the JSON text for a given token. + * + * This is not null-terminated; it is a view into the JSON. + * + * @param position The position of the token to retrieve. + * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it isn't used ... + */ + simdjson_inline const uint8_t *peek(token_position position) const noexcept; + /** + * Get the maximum length of the JSON text for the current token (or relative). + * + * The length will include any whitespace at the end of the token. + * + * @param position The position of the token to retrieve. + */ + simdjson_inline uint32_t peek_length(token_position position) const noexcept; + /** + * Get the JSON text for the last token in the document. + * + * This is not null-terminated; it is a view into the JSON. + * + * TODO consider a string_view, assuming the length will get stripped out by the optimizer when + * it isn't used ... + */ + simdjson_inline const uint8_t *peek_last() const noexcept; + + /** + * Ascend one level. + * + * Validates that the depth - 1 == parent_depth. + * + * @param parent_depth the expected parent depth. + */ + simdjson_inline void ascend_to(depth_t parent_depth) noexcept; + + /** + * Descend one level. + * + * Validates that the new depth == child_depth. + * + * @param child_depth the expected child depth. + */ + simdjson_inline void descend_to(depth_t child_depth) noexcept; + simdjson_inline void descend_to(depth_t child_depth, int32_t delta) noexcept; + + /** + * Get current depth. + */ + simdjson_inline depth_t depth() const noexcept; + + /** + * Get current (writeable) location in the string buffer. + */ + simdjson_inline uint8_t *&string_buf_loc() noexcept; + + /** + * Report an unrecoverable error, preventing further iteration. + * + * @param error The error to report. Must not be SUCCESS, UNINITIALIZED, INCORRECT_TYPE, or NO_SUCH_FIELD. + * @param message An error message to report with the error. + */ + simdjson_inline error_code report_error(error_code error, const char *message) noexcept; + + /** + * Log error, but don't stop iteration. + * @param error The error to report. Must be INCORRECT_TYPE, or NO_SUCH_FIELD. + * @param message An error message to report with the error. + */ + simdjson_inline error_code optional_error(error_code error, const char *message) noexcept; + + template simdjson_warn_unused simdjson_inline bool copy_to_buffer(const uint8_t *json, uint32_t max_len, uint8_t (&tmpbuf)[N]) noexcept; + + simdjson_inline token_position position() const noexcept; + /** + * Write the raw_json_string to the string buffer and return a string_view. + * Each raw_json_string should be unescaped once, or else the string buffer might + * overflow. + */ + simdjson_inline simdjson_result unescape(raw_json_string in) noexcept; + simdjson_inline void reenter_child(token_position position, depth_t child_depth) noexcept; + +#if SIMDJSON_DEVELOPMENT_CHECKS + simdjson_inline token_position start_position(depth_t depth) const noexcept; + simdjson_inline void set_start_position(depth_t depth, token_position position) noexcept; +#endif + + /* Useful for debugging and logging purposes. */ + inline std::string to_string() const noexcept; + + /** + * Returns the current location in the document if in bounds. + */ + inline simdjson_result current_location() noexcept; + + /** + * Updates this json iterator so that it is back at the beginning of the document, + * as if it had just been created. 
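+ *
+ * The public document type exposes the same capability; a hedged sketch of the
+ * user-facing pattern (editor's illustration, assuming SIMDJSON_EXCEPTIONS, an
+ * ondemand::parser `parser` and a padded buffer `json` holding an array of numbers):
+ *
+ *   ondemand::document doc = parser.iterate(json);
+ *   size_t count = 0;
+ *   for (double x : doc.get_array()) { (void)x; ++count; }  // first pass consumes the array
+ *   doc.rewind();                                           // as if the document had just been created
+ *   double sum = 0;
+ *   for (double x : doc.get_array()) { sum += x; }          // second pass over the same values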
+ */ + inline void rewind() noexcept; + /** + * This checks whether the {,},[,] are balanced so that the document + * ends with proper zero depth. This requires scanning the whole document + * and it may be expensive. It is expected that it will be rarely called. + * It does not attempt to match { with } and [ with ]. + */ + inline bool balanced() const noexcept; +protected: + simdjson_inline json_iterator(const uint8_t *buf, ondemand::parser *parser) noexcept; + /// The last token before the end + simdjson_inline token_position last_position() const noexcept; + /// The token *at* the end. This points at gibberish and should only be used for comparison. + simdjson_inline token_position end_position() const noexcept; + /// The end of the buffer. + simdjson_inline token_position end() const noexcept; + + friend class document; + friend class document_stream; + friend class object; + friend class array; + friend class value; + friend class raw_json_string; + friend class parser; + friend class value_iterator; + friend simdjson_inline void logger::log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept; + friend simdjson_inline void logger::log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept; +}; // json_iterator + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/json_iterator.h */ +/* begin file include/simdjson/generic/ondemand/value_iterator.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class document; +class object; +class array; +class value; +class raw_json_string; +class parser; + +/** + * Iterates through a single JSON value at a particular depth. + * + * Does not keep track of the type of value: provides methods for objects, arrays and scalars and expects + * the caller to call the right ones. + * + * @private This is not intended for external use. + */ +class value_iterator { +protected: + /** The underlying JSON iterator */ + json_iterator *_json_iter{}; + /** The depth of this value */ + depth_t _depth{}; + /** + * The starting token index for this value + */ + token_position _start_position{}; + +public: + simdjson_inline value_iterator() noexcept = default; + + /** + * Denote that we're starting a document. + */ + simdjson_inline void start_document() noexcept; + + /** + * Skips a non-iterated or partially-iterated JSON value, whether it is a scalar, array or object. + * + * Optimized for scalars. + */ + simdjson_warn_unused simdjson_inline error_code skip_child() noexcept; + + /** + * Tell whether the iterator is at the EOF mark + */ + simdjson_inline bool at_end() const noexcept; + + /** + * Tell whether the iterator is at the start of the value + */ + simdjson_inline bool at_start() const noexcept; + + /** + * Tell whether the value is open--if the value has not been used, or the array/object is still open. 
+ */ + simdjson_inline bool is_open() const noexcept; + + /** + * Tell whether the value is at an object's first field (just after the {). + */ + simdjson_inline bool at_first_field() const noexcept; + + /** + * Abandon all iteration. + */ + simdjson_inline void abandon() noexcept; + + /** + * Get the child value as a value_iterator. + */ + simdjson_inline value_iterator child_value() const noexcept; + + /** + * Get the depth of this value. + */ + simdjson_inline int32_t depth() const noexcept; + + /** + * Get the JSON type of this value. + * + * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse". + */ + simdjson_inline simdjson_result type() const noexcept; + + /** + * @addtogroup object Object iteration + * + * Methods to iterate and find object fields. These methods generally *assume* the value is + * actually an object; the caller is responsible for keeping track of that fact. + * + * @{ + */ + + /** + * Start an object iteration. + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCORRECT_TYPE if there is no opening { + */ + simdjson_warn_unused simdjson_inline simdjson_result start_object() noexcept; + /** + * Start an object iteration from the root. + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCORRECT_TYPE if there is no opening { + * @error TAPE_ERROR if there is no matching } at end of document + */ + simdjson_warn_unused simdjson_inline simdjson_result start_root_object() noexcept; + + /** + * Start an object iteration after the user has already checked and moved past the {. + * + * Does not move the iterator unless the object is empty ({}). + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_object() noexcept; + /** + * Start an object iteration from the root, after the user has already checked and moved past the {. + * + * Does not move the iterator unless the object is empty ({}). + * + * @returns Whether the object had any fields (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_root_object() noexcept; + + /** + * Moves to the next field in an object. + * + * Looks for , and }. If } is found, the object is finished and the iterator advances past it. + * Otherwise, it advances to the next value. + * + * @return whether there is another field in the object. + * @error TAPE_ERROR If there is a comma missing between fields. + * @error TAPE_ERROR If there is a comma, but not enough tokens remaining to have a key, :, and value. + */ + simdjson_warn_unused simdjson_inline simdjson_result has_next_field() noexcept; + + /** + * Get the current field's key. + */ + simdjson_warn_unused simdjson_inline simdjson_result field_key() noexcept; + + /** + * Pass the : in the field and move to its value. + */ + simdjson_warn_unused simdjson_inline error_code field_value() noexcept; + + /** + * Find the next field with the given key. + * + * Assumes you have called next_field() or otherwise matched the previous value. 
+ * + * This means the iterator must be sitting at the next key: + * + * ``` + * { "a": 1, "b": 2 } + * ^ + * ``` + * + * Key is *raw JSON,* meaning it will be matched against the verbatim JSON without attempting to + * unescape it. This works well for typical ASCII and UTF-8 keys (almost all of them), but may + * fail to match some keys with escapes (\u, \n, etc.). + */ + simdjson_warn_unused simdjson_inline error_code find_field(const std::string_view key) noexcept; + + /** + * Find the next field with the given key, *without* unescaping. This assumes object order: it + * will not find the field if it was already passed when looking for some *other* field. + * + * Assumes you have called next_field() or otherwise matched the previous value. + * + * This means the iterator must be sitting at the next key: + * + * ``` + * { "a": 1, "b": 2 } + * ^ + * ``` + * + * Key is *raw JSON,* meaning it will be matched against the verbatim JSON without attempting to + * unescape it. This works well for typical ASCII and UTF-8 keys (almost all of them), but may + * fail to match some keys with escapes (\u, \n, etc.). + */ + simdjson_warn_unused simdjson_inline simdjson_result find_field_raw(const std::string_view key) noexcept; + + /** + * Find the field with the given key without regard to order, and *without* unescaping. + * + * This is an unordered object lookup: if the field is not found initially, it will cycle around and scan from the beginning. + * + * Assumes you have called next_field() or otherwise matched the previous value. + * + * This means the iterator must be sitting at the next key: + * + * ``` + * { "a": 1, "b": 2 } + * ^ + * ``` + * + * Key is *raw JSON,* meaning it will be matched against the verbatim JSON without attempting to + * unescape it. This works well for typical ASCII and UTF-8 keys (almost all of them), but may + * fail to match some keys with escapes (\u, \n, etc.). + */ + simdjson_warn_unused simdjson_inline simdjson_result find_field_unordered_raw(const std::string_view key) noexcept; + + /** @} */ + + /** + * @addtogroup array Array iteration + * Methods to iterate over array elements. These methods generally *assume* the value is actually + * an object; the caller is responsible for keeping track of that fact. + * @{ + */ + + /** + * Check for an opening [ and start an array iteration. + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCORRECT_TYPE If there is no [. + */ + simdjson_warn_unused simdjson_inline simdjson_result start_array() noexcept; + /** + * Check for an opening [ and start an array iteration while at the root. + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCORRECT_TYPE If there is no [. + * @error TAPE_ERROR if there is no matching ] at end of document + */ + simdjson_warn_unused simdjson_inline simdjson_result start_root_array() noexcept; + + /** + * Start an array iteration, after the user has already checked and moved past the [. + * + * Does not move the iterator unless the array is empty ([]). + * + * @returns Whether the array had any elements (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_array() noexcept; + /** + * Start an array iteration from the root, after the user has already checked and moved past the [. + * + * Does not move the iterator unless the array is empty ([]). 
+ * + * @returns Whether the array had any elements (returns false for empty). + * @error INCOMPLETE_ARRAY_OR_OBJECT If there are no more tokens (implying the *parent* + * array or object is incomplete). + */ + simdjson_warn_unused simdjson_inline simdjson_result started_root_array() noexcept; + + /** + * Moves to the next element in an array. + * + * Looks for , and ]. If ] is found, the array is finished and the iterator advances past it. + * Otherwise, it advances to the next value. + * + * @return Whether there is another element in the array. + * @error TAPE_ERROR If there is a comma missing between elements. + */ + simdjson_warn_unused simdjson_inline simdjson_result has_next_element() noexcept; + + /** + * Get a child value iterator. + */ + simdjson_warn_unused simdjson_inline value_iterator child() const noexcept; + + /** @} */ + + /** + * @defgroup scalar Scalar values + * @addtogroup scalar + * @{ + */ + + simdjson_warn_unused simdjson_inline simdjson_result get_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_int64() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_double() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_bool() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_null() noexcept; + simdjson_warn_unused simdjson_inline bool is_negative() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_integer() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_number() noexcept; + + simdjson_warn_unused simdjson_inline simdjson_result get_root_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_raw_json_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_uint64() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_uint64_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_int64() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_int64_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_double() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_double_in_string() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_bool() noexcept; + simdjson_warn_unused simdjson_inline bool is_root_negative() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result is_root_integer() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_number_type() noexcept; + simdjson_warn_unused simdjson_inline simdjson_result get_root_number() noexcept; + simdjson_inline bool is_root_null() noexcept; + + simdjson_inline error_code error() const noexcept; + simdjson_inline uint8_t *&string_buf_loc() noexcept; + simdjson_inline const json_iterator &json_iter() const noexcept; + simdjson_inline json_iterator &json_iter() noexcept; + + simdjson_inline void assert_is_valid() const noexcept; + simdjson_inline bool is_valid() const noexcept; + + /** 
@} */ +protected: + /** + * Restarts an array iteration. + * @returns Whether the array has any elements (returns false for empty). + */ + simdjson_inline simdjson_result reset_array() noexcept; + /** + * Restarts an object iteration. + * @returns Whether the object has any fields (returns false for empty). + */ + simdjson_inline simdjson_result reset_object() noexcept; + /** + * move_at_start(): moves us so that we are pointing at the beginning of + * the container. It updates the index so that at_start() is true and it + * syncs the depth. The user can then create a new container instance. + * + * Usage: used with value::count_elements(). + **/ + simdjson_inline void move_at_start() noexcept; + + /** + * move_at_container_start(): moves us so that we are pointing at the beginning of + * the container so that assert_at_container_start() passes. + * + * Usage: used with reset_array() and reset_object(). + **/ + simdjson_inline void move_at_container_start() noexcept; + /* Useful for debugging and logging purposes. */ + inline std::string to_string() const noexcept; + simdjson_inline value_iterator(json_iterator *json_iter, depth_t depth, token_position start_index) noexcept; + + simdjson_inline simdjson_result parse_null(const uint8_t *json) const noexcept; + simdjson_inline simdjson_result parse_bool(const uint8_t *json) const noexcept; + simdjson_inline const uint8_t *peek_start() const noexcept; + simdjson_inline uint32_t peek_start_length() const noexcept; + + /** + * The general idea of the advance_... methods and the peek_* methods + * is that you first peek and check that you have desired type. If you do, + * and only if you do, then you advance. + * + * We used to unconditionally advance. But this made reasoning about our + * current state difficult. + * Suppose you always advance. Look at the 'value' matching the key + * "shadowable" in the following example... + * + * ({"globals":{"a":{"shadowable":[}}}}) + * + * If the user thinks it is a Boolean and asks for it, then we check the '[', + * decide it is not a Boolean, but still move into the next character ('}'). Now + * we are left pointing at '}' right after a '['. And we have not yet reported + * an error, only that we do not have a Boolean. + * + * If, instead, you just stand your ground until it is content that you know, then + * you will only even move beyond the '[' if the user tells you that you have an + * array. So you will be at the '}' character inside the array and, hopefully, you + * will then catch the error because an array cannot start with '}', but the code + * processing Boolean values does not know this. + * + * So the contract is: first call 'peek_...' and then call 'advance_...' only + * if you have determined that it is a type you can handle. + * + * Unfortunately, it makes the code more verbose, longer and maybe more error prone. 
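+ *
+ * In sketch form (editor's illustration of the contract described above, loosely
+ * following how a Boolean getter is expected to be written):
+ *
+ *   const uint8_t *json = peek_non_root_scalar("bool");  // look at the token, do not move
+ *   bool value;
+ *   auto error = parse_bool(json).get(value);            // decide whether we can handle it
+ *   if (!error) { advance_non_root_scalar("bool"); }     // consume the token only on success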
+ */ + + simdjson_inline void advance_scalar(const char *type) noexcept; + simdjson_inline void advance_root_scalar(const char *type) noexcept; + simdjson_inline void advance_non_root_scalar(const char *type) noexcept; + + simdjson_inline const uint8_t *peek_scalar(const char *type) noexcept; + simdjson_inline const uint8_t *peek_root_scalar(const char *type) noexcept; + simdjson_inline const uint8_t *peek_non_root_scalar(const char *type) noexcept; + + + simdjson_inline error_code start_container(uint8_t start_char, const char *incorrect_type_message, const char *type) noexcept; + simdjson_inline error_code end_container() noexcept; + + /** + * Advance to a place expecting a value (increasing depth). + * + * @return The current token (the one left behind). + * @error TAPE_ERROR If the document ended early. + */ + simdjson_inline simdjson_result advance_to_value() noexcept; + + simdjson_inline error_code incorrect_type_error(const char *message) const noexcept; + simdjson_inline error_code error_unless_more_tokens(uint32_t tokens=1) const noexcept; + + simdjson_inline bool is_at_start() const noexcept; + /** + * is_at_iterator_start() returns true on an array or object after it has just been + * created, whether the instance is empty or not. + * + * Usage: used by array::begin() in debug mode (SIMDJSON_DEVELOPMENT_CHECKS) + */ + simdjson_inline bool is_at_iterator_start() const noexcept; + + /** + * Assuming that we are within an object, this returns true if we + * are pointing at a key. + * + * Usage: the skip_child() method should never be used while we are pointing + * at a key inside an object. + */ + simdjson_inline bool is_at_key() const noexcept; + + inline void assert_at_start() const noexcept; + inline void assert_at_container_start() const noexcept; + inline void assert_at_root() const noexcept; + inline void assert_at_child() const noexcept; + inline void assert_at_next() const noexcept; + inline void assert_at_non_root_start() const noexcept; + + /** Get the starting position of this value */ + simdjson_inline token_position start_position() const noexcept; + + /** @copydoc error_code json_iterator::position() const noexcept; */ + simdjson_inline token_position position() const noexcept; + /** @copydoc error_code json_iterator::end_position() const noexcept; */ + simdjson_inline token_position last_position() const noexcept; + /** @copydoc error_code json_iterator::end_position() const noexcept; */ + simdjson_inline token_position end_position() const noexcept; + /** @copydoc error_code json_iterator::report_error(error_code error, const char *message) noexcept; */ + simdjson_inline error_code report_error(error_code error, const char *message) noexcept; + + friend class document; + friend class object; + friend class array; + friend class value; +}; // value_iterator + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/value_iterator.h */ +/* begin file include/simdjson/generic/ondemand/array_iterator.h */ + +namespace simdjson { +namespace 
SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class array; +class value; +class document; + +/** + * A forward-only JSON array. + * + * This is an input_iterator, meaning: + * - It is forward-only + * - * must be called exactly once per element. + * - ++ must be called exactly once in between each * (*, ++, *, ++, * ...) + */ +class array_iterator { +public: + /** Create a new, invalid array iterator. */ + simdjson_inline array_iterator() noexcept = default; + + // + // Iterator interface + // + + /** + * Get the current element. + * + * Part of the std::iterator interface. + */ + simdjson_inline simdjson_result operator*() noexcept; // MUST ONLY BE CALLED ONCE PER ITERATION. + /** + * Check if we are at the end of the JSON. + * + * Part of the std::iterator interface. + * + * @return true if there are no more elements in the JSON array. + */ + simdjson_inline bool operator==(const array_iterator &) const noexcept; + /** + * Check if there are more elements in the JSON array. + * + * Part of the std::iterator interface. + * + * @return true if there are more elements in the JSON array. + */ + simdjson_inline bool operator!=(const array_iterator &) const noexcept; + /** + * Move to the next element. + * + * Part of the std::iterator interface. + */ + simdjson_inline array_iterator &operator++() noexcept; + +private: + value_iterator iter{}; + + simdjson_inline array_iterator(const value_iterator &iter) noexcept; + + friend class array; + friend class value; + friend struct simdjson_result; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + // + // Iterator interface + // + + simdjson_inline simdjson_result operator*() noexcept; // MUST ONLY BE CALLED ONCE PER ITERATION. + simdjson_inline bool operator==(const simdjson_result &) const noexcept; + simdjson_inline bool operator!=(const simdjson_result &) const noexcept; + simdjson_inline simdjson_result &operator++() noexcept; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/array_iterator.h */ +/* begin file include/simdjson/generic/ondemand/object_iterator.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class field; + +class object_iterator { +public: + /** + * Create a new invalid object_iterator. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline object_iterator() noexcept = default; + + // + // Iterator interface + // + + // Reads key and value, yielding them to the user. + // MUST ONLY BE CALLED ONCE PER ITERATION. + simdjson_inline simdjson_result operator*() noexcept; + // Assumes it's being compared with the end. true if depth < iter->depth. + simdjson_inline bool operator==(const object_iterator &) const noexcept; + // Assumes it's being compared with the end. true if depth >= iter->depth. + simdjson_inline bool operator!=(const object_iterator &) const noexcept; + // Checks for ']' and ',' + simdjson_inline object_iterator &operator++() noexcept; + +private: + /** + * The underlying JSON iterator. 
+ * + * PERF NOTE: expected to be elided in favor of the parent document: this is set when the object + * is first used, and never changes afterwards. + */ + value_iterator iter{}; + + simdjson_inline object_iterator(const value_iterator &iter) noexcept; + friend struct simdjson_result; + friend class object; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object_iterator &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + // + // Iterator interface + // + + // Reads key and value, yielding them to the user. + simdjson_inline simdjson_result operator*() noexcept; // MUST ONLY BE CALLED ONCE PER ITERATION. + // Assumes it's being compared with the end. true if depth < iter->depth. + simdjson_inline bool operator==(const simdjson_result &) const noexcept; + // Assumes it's being compared with the end. true if depth >= iter->depth. + simdjson_inline bool operator!=(const simdjson_result &) const noexcept; + // Checks for ']' and ',' + simdjson_inline simdjson_result &operator++() noexcept; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/object_iterator.h */ +/* begin file include/simdjson/generic/ondemand/array.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class value; +class document; + +/** + * A forward-only JSON array. + */ +class array { +public: + /** + * Create a new invalid array. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline array() noexcept = default; + + /** + * Begin array iteration. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result begin() noexcept; + /** + * Sentinel representing the end of the array. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result end() noexcept; + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an array is empty, it is more performant to use + * the is_empty() method. + */ + simdjson_inline simdjson_result count_elements() & noexcept; + /** + * This method scans the beginning of the array and checks whether the + * array is empty. + * The runtime complexity is constant time. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + simdjson_inline simdjson_result is_empty() & noexcept; + /** + * Reset the iterator so that we are pointing back at the + * beginning of the array. 
You should still consume values only once even if you + * can iterate through the array more than once. If you unescape a string + * within the array more than once, you have unsafe code. Note that rewinding + * an array means that you may need to reparse it anew: it is not a free + * operation. + * + * @returns true if the array contains some elements (not empty) + */ + inline simdjson_result reset() & noexcept; + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * ondemand::parser parser; + * auto json = R"([ { "foo": { "a": [ 10, 20, 30 ] }} ])"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/0/foo/a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. Yet it is not the case when calling at_pointer on an array + * instance: there is no rewind and no invalidation. + * + * You may only call at_pointer on an array after it has been created, but before it has + * been first accessed. When calling at_pointer on an array, the pointer is advanced to + * the location indicated by the JSON pointer (in case of success). It is no longer possible + * to call at_pointer on the same array. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + /** + * Consumes the array and returns a string_view instance corresponding to the + * array as represented in JSON. It points inside the original document. + */ + simdjson_inline simdjson_result raw_json() noexcept; + + /** + * Get the value at the given index. This function has linear-time complexity. + * This function should only be called once on an array instance since the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_inline simdjson_result at(size_t index) noexcept; +protected: + /** + * Go to the end of the array, no matter where you are right now. + */ + simdjson_inline error_code consume() noexcept; + + /** + * Begin array iteration. + * + * @param iter The iterator. Must be where the initial [ is expected. Will be *moved* into the + * resulting array. + * @error INCORRECT_TYPE if the iterator is not at [. + */ + static simdjson_inline simdjson_result start(value_iterator &iter) noexcept; + /** + * Begin array iteration from the root. + * + * @param iter The iterator. Must be where the initial [ is expected. Will be *moved* into the + * resulting array. + * @error INCORRECT_TYPE if the iterator is not at [. + * @error TAPE_ERROR if there is no closing ] at the end of the document. + */ + static simdjson_inline simdjson_result start_root(value_iterator &iter) noexcept; + /** + * Begin array iteration. 
+ * + * This version of the method should be called after the initial [ has been verified, and is + * intended for use by switch statements that check the type of a value. + * + * @param iter The iterator. Must be after the initial [. Will be *moved* into the resulting array. + */ + static simdjson_inline simdjson_result started(value_iterator &iter) noexcept; + + /** + * Create an array at the given Internal array creation. Call array::start() or array::started() instead of this. + * + * @param iter The iterator. Must either be at the start of the first element with iter.is_alive() + * == true, or past the [] with is_alive() == false if the array is empty. Will be *moved* + * into the resulting array. + */ + simdjson_inline array(const value_iterator &iter) noexcept; + + /** + * Iterator marking current position. + * + * iter.is_alive() == false indicates iteration is complete. + */ + value_iterator iter{}; + + friend class value; + friend class document; + friend struct simdjson_result; + friend struct simdjson_result; + friend class array_iterator; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result begin() noexcept; + simdjson_inline simdjson_result end() noexcept; + inline simdjson_result count_elements() & noexcept; + inline simdjson_result is_empty() & noexcept; + inline simdjson_result reset() & noexcept; + simdjson_inline simdjson_result at(size_t index) noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/array.h */ +/* begin file include/simdjson/generic/ondemand/document.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class parser; +class array; +class object; +class value; +class raw_json_string; +class array_iterator; +class document_stream; + +/** + * A JSON document. It holds a json_iterator instance. + * + * Used by tokens to get text, and string buffer location. + * + * You must keep the document around during iteration. + */ +class document { +public: + /** + * Create a new invalid document. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline document() noexcept = default; + simdjson_inline document(const document &other) noexcept = delete; // pass your documents by reference, not by copy + simdjson_inline document(document &&other) noexcept = default; + simdjson_inline document &operator=(const document &other) noexcept = delete; + simdjson_inline document &operator=(document &&other) noexcept = default; + + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @returns INCORRECT_TYPE If the JSON value is not an array. + */ + simdjson_inline simdjson_result get_array() & noexcept; + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @returns INCORRECT_TYPE If the JSON value is not an object. 
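+ *
+ * A minimal error-code sketch (editor's illustration; an ondemand::parser
+ * `parser` and a padded buffer `json` are assumed):
+ *
+ *   auto doc = parser.iterate(json);
+ *   ondemand::object obj;
+ *   auto error = doc.get_object().get(obj);
+ *   if (!error) {
+ *     // look up fields with obj["key"] or iterate them in order
+ *   }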
+ */ + simdjson_inline simdjson_result get_object() & noexcept; + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64() noexcept; + /** + * Cast this JSON value (inside string) to an unsigned integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64() noexcept; + /** + * Cast this JSON value (inside string) to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64_in_string() noexcept; + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double() noexcept; + + /** + * Cast this JSON value (inside string) to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double_in_string() noexcept; + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * Important: Calling get_string() twice on the same document is an error. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_string() noexcept; + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_raw_json_string() noexcept; + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @returns INCORRECT_TYPE if the JSON value is not true or false. + */ + simdjson_inline simdjson_result get_bool() noexcept; + /** + * Cast this JSON value to a value when the document is an object or an array. + * + * @returns A value if a JSON array or object cannot be found. + * @returns SCALAR_DOCUMENT_AS_VALUE error is the document is a scalar (see is_scalar() function). + */ + simdjson_inline simdjson_result get_value() noexcept; + + /** + * Checks if this JSON value is null. If and only if the value is + * null, then it is consumed (we advance). If we find a token that + * begins with 'n' but is not 'null', then an error is returned. + * + * @returns Whether the value is null. + * @returns INCORRECT_TYPE If the JSON value begins with 'n' and is not 'null'. + */ + simdjson_inline simdjson_result is_null() noexcept; + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool + * + * You may use get_double(), get_bool(), get_uint64(), get_int64(), + * get_object(), get_array(), get_raw_json_string(), or get_string() instead. 
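+ *
+ * For instance (editor's sketch; equivalent to calling get_double() directly
+ * on a document `doc` obtained from parser.iterate()):
+ *
+ *   double d;
+ *   auto error = doc.get<double>().get(d);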
+ * + * @returns A value of the given type, parsed from the JSON. + * @returns INCORRECT_TYPE If the JSON value is not the given type. + */ + template simdjson_inline simdjson_result get() & noexcept { + // Unless the simdjson library provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library."); + } + /** @overload template simdjson_result get() & noexcept */ + template simdjson_inline simdjson_result get() && noexcept { + // Unless the simdjson library provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library."); + } + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool, value + * + * Be mindful that the document instance must remain in scope while you are accessing object, array and value instances. + * + * @param out This is set to a value of the given type, parsed from the JSON. If there is an error, this may not be initialized. + * @returns INCORRECT_TYPE If the JSON value is not an object. + * @returns SUCCESS If the parse succeeded and the out parameter was set to the value. + */ + template simdjson_inline error_code get(T &out) & noexcept; + /** @overload template error_code get(T &out) & noexcept */ + template simdjson_inline error_code get(T &out) && noexcept; + +#if SIMDJSON_EXCEPTIONS + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an array. + */ + simdjson_inline operator array() & noexcept(false); + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an object. + */ + simdjson_inline operator object() & noexcept(false); + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline operator uint64_t() noexcept(false); + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit integer. + */ + simdjson_inline operator int64_t() noexcept(false); + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a valid floating-point number. + */ + simdjson_inline operator double() noexcept(false); + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. + */ + simdjson_inline operator std::string_view() noexcept(false); + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. 
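+ *
+ * As with the other conversion operators in this block, a hedged sketch of
+ * exception-based use (editor's illustration; assumes SIMDJSON_EXCEPTIONS and a
+ * document whose root is a single JSON string):
+ *
+ *   ondemand::document doc = parser.iterate(json);
+ *   std::string_view s = doc;   // throws simdjson_error(INCORRECT_TYPE) otherwise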
+ */ + simdjson_inline operator raw_json_string() noexcept(false); + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not true or false. + */ + simdjson_inline operator bool() noexcept(false); + /** + * Cast this JSON value to a value. + * + * @returns A value value. + * @exception if a JSON value cannot be found + */ + simdjson_inline operator value() noexcept(false); +#endif + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + simdjson_inline simdjson_result count_elements() & noexcept; + /** + * This method scans the object and counts the number of key-value pairs. + * The count_fields method should always be called before you have begun + * iterating through the object: it is expected that you are pointing at + * the beginning of the object. + * The runtime complexity is linear in the size of the object. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an object is empty, it is more performant to use + * the is_empty() method. + */ + simdjson_inline simdjson_result count_fields() & noexcept; + /** + * Get the value at the given index in the array. This function has linear-time complexity. + * This function should only be called once on an array instance since the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_inline simdjson_result at(size_t index) & noexcept; + /** + * Begin array iteration. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result begin() & noexcept; + /** + * Sentinel representing the end of the array. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result end() & noexcept; + + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. E.g., the array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). 
You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to + * a key a single time. Doing object["mykey"].to_string()and then again object["mykey"].to_string() + * is an error. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. + * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field wasn't there when they aren't). + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. E.g., the array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to a key + * a single time. Doing object["mykey"].to_string() and then again object["mykey"].to_string() + * is an error. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + + /** + * Get the type of this JSON value. It does not validate or consume the value. + * E.g., you must still call "is_null()" to check that a value is null even if + * "type()" returns json_type::null. 
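+ *
+ * A minimal sketch of dispatching on the reported type (hypothetical document;
+ * error-code style, so it does not require exceptions):
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto doc = parser.iterate(R"( 3.14 )"_padded);
+ * simdjson::ondemand::json_type t;
+ * auto error = doc.type().get(t);
+ * if (!error && t == simdjson::ondemand::json_type::number) {
+ *   double d;
+ *   error = doc.get_double().get(d);  // still check for INCORRECT_TYPE/other errors
+ * }
+ * ```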
+ *
+ * NOTE: If you're only expecting a value to be one type (a typical case), it's generally
+ * better to just call .get_double, .get_string, etc. and check for INCORRECT_TYPE (or just
+ * let it throw an exception).
+ *
+ * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse".
+ */
+ simdjson_inline simdjson_result type() noexcept;
+
+ /**
+ * Checks whether the document is a scalar (string, number, null, Boolean).
+ * Returns false when it is an array or object.
+ *
+ * @returns true if the type is string, number, null, Boolean
+ * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse".
+ */
+ simdjson_inline simdjson_result is_scalar() noexcept;
+
+ /**
+ * Checks whether the document is a negative number.
+ *
+ * @returns true if the number is negative.
+ */
+ simdjson_inline bool is_negative() noexcept;
+ /**
+ * Checks whether the document is an integer number. Note that
+ * this requires partially parsing the number string. If
+ * the value is determined to be an integer, it may still
+ * not parse properly as an integer in subsequent steps
+ * (e.g., it might overflow).
+ *
+ * @returns true if the number is an integer.
+ */
+ simdjson_inline simdjson_result is_integer() noexcept;
+ /**
+ * Determine the number type (integer or floating-point number) as quickly
+ * as possible. This function does not fully validate the input. It is
+ * useful when you only need to classify the numbers, without parsing them.
+ *
+ * If you are planning to retrieve the value or you need full validation,
+ * consider using the get_number() method instead: it will fully parse
+ * and validate the input, and give you access to the type:
+ * get_number().get_number_type().
+ *
+ * get_number_type() is number_type::unsigned_integer if we have
+ * an integer greater than or equal to 9223372036854775808
+ * get_number_type() is number_type::signed_integer if we have an
+ * integer that is less than 9223372036854775808
+ * Otherwise, get_number_type() has value number_type::floating_point_number
+ *
+ * This function requires processing the number string, but it is expected
+ * to be faster than get_number().get_number_type() because it does not
+ * parse the number value.
+ *
+ * @returns the type of the number
+ */
+ simdjson_inline simdjson_result get_number_type() noexcept;
+
+ /**
+ * Attempt to parse an ondemand::number. An ondemand::number may
+ * contain an integer value or a floating-point value; the simdjson
+ * library will autodetect the type. Thus it is a dynamically typed
+ * number. Before accessing the value, you must determine the detected
+ * type.
+ *
+ * number.get_number_type() is number_type::signed_integer if we have
+ * an integer in [-9223372036854775808,9223372036854775808)
+ * You can recover the value by calling number.get_int64() and you
+ * have that number.is_int64() is true.
+ *
+ * number.get_number_type() is number_type::unsigned_integer if we have
+ * an integer in [9223372036854775808,18446744073709551616)
+ * You can recover the value by calling number.get_uint64() and you
+ * have that number.is_uint64() is true.
+ *
+ * Otherwise, number.get_number_type() has value number_type::floating_point_number
+ * and we have a binary64 number.
+ * You can recover the value by calling number.get_double() and you
+ * have that number.is_double() is true.
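+ *
+ * A minimal sketch of that dispatch (a hypothetical single-number document;
+ * error-code style):
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto doc = parser.iterate(R"( 18446744073709551615 )"_padded);
+ * simdjson::ondemand::number num;
+ * if (!doc.get_number().get(num)) {
+ *   switch (num.get_number_type()) {
+ *     case simdjson::ondemand::number_type::signed_integer:        { int64_t  i = num.get_int64();  break; }
+ *     case simdjson::ondemand::number_type::unsigned_integer:      { uint64_t u = num.get_uint64(); break; }
+ *     case simdjson::ondemand::number_type::floating_point_number: { double   d = num.get_double(); break; }
+ *   }
+ * }
+ * ```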
+ * + * You must check the type before accessing the value: it is an error + * to call "get_int64()" when number.get_number_type() is not + * number_type::signed_integer and when number.is_int64() is false. + */ + simdjson_warn_unused simdjson_inline simdjson_result get_number() noexcept; + + /** + * Get the raw JSON for this token. + * + * The string_view will always point into the input buffer. + * + * The string_view will start at the beginning of the token, and include the entire token + * *as well as all spaces until the next token (or EOF).* This means, for example, that a + * string token always begins with a " and is always terminated by the final ", possibly + * followed by a number of spaces. + * + * The string_view is *not* null-terminated. If this is a scalar (string, number, + * boolean, or null), the character after the end of the string_view may be the padded buffer. + * + * Tokens include: + * - { + * - [ + * - "a string (possibly with UTF-8 or backslashed characters like \\\")". + * - -1.2e-100 + * - true + * - false + * - null + */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + /** + * Reset the iterator inside the document instance so we are pointing back at the + * beginning of the document, as if it had just been created. It invalidates all + * values, objects and arrays that you have created so far (including unescaped strings). + */ + inline void rewind() noexcept; + /** + * Returns debugging information. + */ + inline std::string to_debug_string() noexcept; + /** + * Some unrecoverable error conditions may render the document instance unusable. + * The is_alive() method returns true when the document is still suitable. + */ + inline bool is_alive() noexcept; + + /** + * Returns the current location in the document if in bounds. + */ + inline simdjson_result current_location() noexcept; + + /** + * Returns the current depth in the document if in bounds. + * + * E.g., + * 0 = finished with document + * 1 = document root value (could be [ or {, not yet known) + * 2 = , or } inside root array/object + * 3 = key or value inside root array/object. + */ + simdjson_inline int32_t current_depth() const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() automatically calls rewind between each call. Thus + * all values, objects and arrays that you have created so far (including unescaped strings) + * are invalidated. After calling at_pointer, you need to consume the result: string values + * should be stored in your own variables, arrays should be decoded and stored in your own array-like + * structures and so forth. 
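+ * A short sketch of consuming an at_pointer() result immediately (the keys
+ * come from the hypothetical JSON above; error-code style):
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto doc = parser.iterate(R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded);
+ * double second = 0;
+ * auto error = doc.at_pointer("/foo/a/1").get_double().get(second);  // store it now
+ * // a later doc.at_pointer(...) call rewinds the document and invalidates prior results
+ * ```
+ *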
+ * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + * - SCALAR_DOCUMENT_AS_VALUE if the json_pointer is empty and the document is not a scalar (see is_scalar() function). + */ + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + /** + * Consumes the document and returns a string_view instance corresponding to the + * document as represented in JSON. It points inside the original byte array containing + * the JSON document. + */ + simdjson_inline simdjson_result raw_json() noexcept; +protected: + /** + * Consumes the document. + */ + simdjson_inline error_code consume() noexcept; + + simdjson_inline document(ondemand::json_iterator &&iter) noexcept; + simdjson_inline const uint8_t *text(uint32_t idx) const noexcept; + + simdjson_inline value_iterator resume_value_iterator() noexcept; + simdjson_inline value_iterator get_root_value_iterator() noexcept; + simdjson_inline simdjson_result start_or_resume_object() noexcept; + static simdjson_inline document start(ondemand::json_iterator &&iter) noexcept; + + // + // Fields + // + json_iterator iter{}; ///< Current position in the document + static constexpr depth_t DOCUMENT_DEPTH = 0; ///< document depth is always 0 + + friend class array_iterator; + friend class value; + friend class ondemand::parser; + friend class object; + friend class array; + friend class field; + friend class token; + friend class document_stream; +}; + + +/** + * A document_reference is a thin wrapper around a document reference instance. 
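+ *
+ * document_reference instances are typically obtained while iterating a
+ * document_stream (e.g., from parser.iterate_many()). A hedged sketch, assuming
+ * this build's document_stream yields document_reference values when iterated:
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto json = R"( {"k":1} {"k":2} )"_padded;
+ * simdjson::ondemand::document_stream stream;
+ * if (!parser.iterate_many(json).get(stream)) {
+ *   for (auto doc : stream) {        // each doc behaves much like a document
+ *     int64_t k;
+ *     if (!doc["k"].get(k)) { }      // k holds the value; consume it before the next document
+ *   }
+ * }
+ * ```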
+ */ +class document_reference { +public: + simdjson_inline document_reference() noexcept; + simdjson_inline document_reference(document &d) noexcept; + simdjson_inline document_reference(const document_reference &other) noexcept = default; + simdjson_inline document_reference& operator=(const document_reference &other) noexcept = default; + simdjson_inline void rewind() noexcept; + simdjson_inline simdjson_result get_array() & noexcept; + simdjson_inline simdjson_result get_object() & noexcept; + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result get_value() noexcept; + + simdjson_inline simdjson_result is_null() noexcept; + simdjson_inline simdjson_result raw_json() noexcept; + simdjson_inline operator document&() const noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_inline operator array() & noexcept(false); + simdjson_inline operator object() & noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); + simdjson_inline operator value() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) & noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + + simdjson_inline simdjson_result current_location() noexcept; + simdjson_inline int32_t current_depth() const noexcept; + simdjson_inline bool is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + simdjson_inline simdjson_result raw_json_token() noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +private: + document *doc{nullptr}; +}; +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline error_code 
rewind() noexcept; + + simdjson_inline simdjson_result get_array() & noexcept; + simdjson_inline simdjson_result get_object() & noexcept; + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_double_from_string() noexcept; + simdjson_inline simdjson_result get_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result get_value() noexcept; + simdjson_inline simdjson_result is_null() noexcept; + + template simdjson_inline simdjson_result get() & noexcept; + template simdjson_inline simdjson_result get() && noexcept; + + template simdjson_inline error_code get(T &out) & noexcept; + template simdjson_inline error_code get(T &out) && noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() & noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() & noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) & noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + simdjson_inline simdjson_result current_location() noexcept; + simdjson_inline int32_t current_depth() const noexcept; + simdjson_inline bool is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + /** @copydoc simdjson_inline std::string_view document::raw_json_token() const noexcept */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +}; + + +} // namespace simdjson + + + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference value, error_code error) noexcept; + simdjson_inline simdjson_result() noexcept = default; + simdjson_inline error_code rewind() noexcept; + + simdjson_inline simdjson_result 
get_array() & noexcept; + simdjson_inline simdjson_result get_object() & noexcept; + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result get_value() noexcept; + simdjson_inline simdjson_result is_null() noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() & noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() & noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) & noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(const char *key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](const char *key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(const char *key) & noexcept; + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + simdjson_inline simdjson_result current_location() noexcept; + simdjson_inline simdjson_result current_depth() const noexcept; + simdjson_inline simdjson_result is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + /** @copydoc simdjson_inline std::string_view document_reference::raw_json_token() const noexcept */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +}; + + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/document.h */ +/* begin file include/simdjson/generic/ondemand/value.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class array; +class document; +class field; +class object; +class raw_json_string; + +/** + * An ephemeral JSON value returned during iteration. It is only valid for as long as you do + * not access more data in the JSON document. + */ +class value { +public: + /** + * Create a new invalid value. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline value() noexcept = default; + + /** + * Get this value as the given type. 
+ * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool + * + * You may use get_double(), get_bool(), get_uint64(), get_int64(), + * get_object(), get_array(), get_raw_json_string(), or get_string() instead. + * + * @returns A value of the given type, parsed from the JSON. + * @returns INCORRECT_TYPE If the JSON value is not the given type. + */ + template simdjson_inline simdjson_result get() noexcept { + // Unless the simdjson library provides an inline implementation, calling this method should + // immediately fail. + static_assert(!sizeof(T), "The get method with given type is not implemented by the simdjson library."); + } + + /** + * Get this value as the given type. + * + * Supported types: object, array, raw_json_string, string_view, uint64_t, int64_t, double, bool + * + * @param out This is set to a value of the given type, parsed from the JSON. If there is an error, this may not be initialized. + * @returns INCORRECT_TYPE If the JSON value is not an object. + * @returns SUCCESS If the parse succeeded and the out parameter was set to the value. + */ + template simdjson_inline error_code get(T &out) noexcept; + + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @returns INCORRECT_TYPE If the JSON value is not an array. + */ + simdjson_inline simdjson_result get_array() noexcept; + + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @returns INCORRECT_TYPE If the JSON value is not an object. + */ + simdjson_inline simdjson_result get_object() noexcept; + + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A unsigned 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64() noexcept; + + /** + * Cast this JSON value (inside string) to a unsigned integer. + * + * @returns A unsigned 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64() noexcept; + + /** + * Cast this JSON value (inside string) to a signed integer. + * + * @returns A signed 64-bit integer. + * @returns INCORRECT_TYPE If the JSON value is not a 64-bit integer. + */ + simdjson_inline simdjson_result get_int64_in_string() noexcept; + + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double() noexcept; + + /** + * Cast this JSON value (inside string) to a double + * + * @returns A double. + * @returns INCORRECT_TYPE If the JSON value is not a valid floating-point number. + */ + simdjson_inline simdjson_result get_double_in_string() noexcept; + + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * Equivalent to get(). + * + * Important: a value should be consumed once. Calling get_string() twice on the same value + * is an error. + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. 
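+ *
+ * A minimal sketch of copying the string out before it can be invalidated
+ * (hypothetical key on a previously iterated document doc; error-code style):
+ *
+ * ```c++
+ * std::string_view name_view;
+ * auto error = doc["name"].get_string().get(name_view);
+ * if (!error) {
+ *   std::string name(name_view);  // own a copy before the parser reuses its buffer
+ * }
+ * ```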
+ * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_string() noexcept; + + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @returns INCORRECT_TYPE if the JSON value is not a string. + */ + simdjson_inline simdjson_result get_raw_json_string() noexcept; + + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @returns INCORRECT_TYPE if the JSON value is not true or false. + */ + simdjson_inline simdjson_result get_bool() noexcept; + + /** + * Checks if this JSON value is null. If and only if the value is + * null, then it is consumed (we advance). If we find a token that + * begins with 'n' but is not 'null', then an error is returned. + * + * @returns Whether the value is null. + * @returns INCORRECT_TYPE If the JSON value begins with 'n' and is not 'null'. + */ + simdjson_inline simdjson_result is_null() noexcept; + +#if SIMDJSON_EXCEPTIONS + /** + * Cast this JSON value to an array. + * + * @returns An object that can be used to iterate the array. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an array. + */ + simdjson_inline operator array() noexcept(false); + /** + * Cast this JSON value to an object. + * + * @returns An object that can be used to look up or iterate fields. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not an object. + */ + simdjson_inline operator object() noexcept(false); + /** + * Cast this JSON value to an unsigned integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit unsigned integer. + */ + simdjson_inline operator uint64_t() noexcept(false); + /** + * Cast this JSON value to a signed integer. + * + * @returns A signed 64-bit integer. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a 64-bit integer. + */ + simdjson_inline operator int64_t() noexcept(false); + /** + * Cast this JSON value to a double. + * + * @returns A double. + * @exception simdjson_error(INCORRECT_TYPE) If the JSON value is not a valid floating-point number. + */ + simdjson_inline operator double() noexcept(false); + /** + * Cast this JSON value to a string. + * + * The string is guaranteed to be valid UTF-8. + * + * Equivalent to get(). + * + * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next + * time it parses a document or when it is destroyed. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. + */ + simdjson_inline operator std::string_view() noexcept(false); + /** + * Cast this JSON value to a raw_json_string. + * + * The string is guaranteed to be valid UTF-8, and may have escapes in it (e.g. \\ or \n). + * + * @returns A pointer to the raw JSON for the given string. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not a string. + */ + simdjson_inline operator raw_json_string() noexcept(false); + /** + * Cast this JSON value to a bool. + * + * @returns A bool value. + * @exception simdjson_error(INCORRECT_TYPE) if the JSON value is not true or false. + */ + simdjson_inline operator bool() noexcept(false); +#endif + + /** + * Begin array iteration. + * + * Part of the std::iterable interface. + * + * @returns INCORRECT_TYPE If the JSON value is not an array. 
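+ *
+ * A small sketch of range-for over an array value (hypothetical JSON; assumes
+ * SIMDJSON_EXCEPTIONS so the element conversions can throw):
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto doc = parser.iterate(R"({ "pts": [1.5, 2.5, 4.0] })"_padded);
+ * double sum = 0;
+ * for (double p : doc["pts"]) { sum += p; }  // begin()/end() drive the loop
+ * ```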
+ */ + simdjson_inline simdjson_result begin() & noexcept; + /** + * Sentinel representing the end of the array. + * + * Part of the std::iterable interface. + */ + simdjson_inline simdjson_result end() & noexcept; + /** + * This method scans the array and counts the number of elements. + * The count_elements method should always be called before you have begun + * iterating through the array: it is expected that you are pointing at + * the beginning of the array. + * The runtime complexity is linear in the size of the array. After + * calling this function, if successful, the array is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * Performance hint: You should only call count_elements() as a last + * resort as it may require scanning the document twice or more. + */ + simdjson_inline simdjson_result count_elements() & noexcept; + /** + * This method scans the object and counts the number of key-value pairs. + * The count_fields method should always be called before you have begun + * iterating through the object: it is expected that you are pointing at + * the beginning of the object. + * The runtime complexity is linear in the size of the object. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an object is empty, it is more performant to use + * the is_empty() method on the object instance. + * + * Performance hint: You should only call count_fields() as a last + * resort as it may require scanning the document twice or more. + */ + simdjson_inline simdjson_result count_fields() & noexcept; + /** + * Get the value at the given index in the array. This function has linear-time complexity. + * This function should only be called once on an array instance since the array iterator is not reset between each call. + * + * @return The value at the given index, or: + * - INDEX_OUT_OF_BOUNDS if the array index is larger than an array length + */ + simdjson_inline simdjson_result at(size_t index) noexcept; + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field(const char *key) noexcept; + + /** + * Look up a field by name on an object, without regard to key order. 
+ *
+ * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies
+ * and often appears negligible. It starts out normally, starting out at the last field; but if
+ * the field is not found, it scans from the beginning of the object to see if it missed it. That
+ * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object
+ * in question is large. The fact that the extra code is there also bumps the executable size.
+ *
+ * It is the default, however, because it would be highly surprising (and hard to debug) if the
+ * default behavior failed to look up a field just because it was in the wrong order--and many
+ * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order.
+ *
+ * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful
+ * that only one field is returned.
+ *
+ * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the
+ * field wasn't there when they aren't).
+ *
+ * @param key The key to look up.
+ * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object.
+ */
+ simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept;
+ /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */
+ simdjson_inline simdjson_result find_field_unordered(const char *key) noexcept;
+ /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */
+ simdjson_inline simdjson_result operator[](std::string_view key) noexcept;
+ /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */
+ simdjson_inline simdjson_result operator[](const char *key) noexcept;
+
+ /**
+ * Get the type of this JSON value. It does not validate or consume the value.
+ * E.g., you must still call "is_null()" to check that a value is null even if
+ * "type()" returns json_type::null.
+ *
+ * NOTE: If you're only expecting a value to be one type (a typical case), it's generally
+ * better to just call .get_double, .get_string, etc. and check for INCORRECT_TYPE (or just
+ * let it throw an exception).
+ *
+ * @return The type of JSON value (json_type::array, json_type::object, json_type::string,
+ * json_type::number, json_type::boolean, or json_type::null).
+ * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse".
+ */
+ simdjson_inline simdjson_result type() noexcept;
+
+ /**
+ * Checks whether the value is a scalar (string, number, null, Boolean).
+ * Returns false when it is an array or object.
+ *
+ * @returns true if the type is string, number, null, Boolean
+ * @error TAPE_ERROR when the JSON value is a bad token like "}" "," or "alse".
+ */
+ simdjson_inline simdjson_result is_scalar() noexcept;
+
+ /**
+ * Checks whether the value is a negative number.
+ *
+ * @returns true if the number is negative.
+ */
+ simdjson_inline bool is_negative() noexcept;
+ /**
+ * Checks whether the value is an integer number. Note that
+ * this requires partially parsing the number string. If
+ * the value is determined to be an integer, it may still
+ * not parse properly as an integer in subsequent steps
+ * (e.g., it might overflow).
+ *
+ * Performance note: if you call this function systematically
+ * before parsing a number, you may have fallen for a performance
+ * anti-pattern.
+ *
+ * @returns true if the number is an integer.
+ */ + simdjson_inline simdjson_result is_integer() noexcept; + /** + * Determine the number type (integer or floating-point number) as quickly + * as possible. This function does not fully validate the input. It is + * useful when you only need to classify the numbers, without parsing them. + * + * If you are planning to retrieve the value or you need full validation, + * consider using the get_number() method instead: it will fully parse + * and validate the input, and give you access to the type: + * get_number().get_number_type(). + * + * get_number_type() is number_type::unsigned_integer if we have + * an integer greater or equal to 9223372036854775808 + * get_number_type() is number_type::signed_integer if we have an + * integer that is less than 9223372036854775808 + * Otherwise, get_number_type() has value number_type::floating_point_number + * + * This function requires processing the number string, but it is expected + * to be faster than get_number().get_number_type() because it is does not + * parse the number value. + * + * @returns the type of the number + */ + simdjson_inline simdjson_result get_number_type() noexcept; + + /** + * Attempt to parse an ondemand::number. An ondemand::number may + * contain an integer value or a floating-point value, the simdjson + * library will autodetect the type. Thus it is a dynamically typed + * number. Before accessing the value, you must determine the detected + * type. + * + * number.get_number_type() is number_type::signed_integer if we have + * an integer in [-9223372036854775808,9223372036854775808) + * You can recover the value by calling number.get_int64() and you + * have that number.is_int64() is true. + * + * number.get_number_type() is number_type::unsigned_integer if we have + * an integer in [9223372036854775808,18446744073709551616) + * You can recover the value by calling number.get_uint64() and you + * have that number.is_uint64() is true. + * + * Otherwise, number.get_number_type() has value number_type::floating_point_number + * and we have a binary64 number. + * You can recover the value by calling number.get_double() and you + * have that number.is_double() is true. + * + * You must check the type before accessing the value: it is an error + * to call "get_int64()" when number.get_number_type() is not + * number_type::signed_integer and when number.is_int64() is false. + * + * Performance note: this is designed with performance in mind. When + * calling 'get_number()', you scan the number string only once, determining + * efficiently the type and storing it in an efficient manner. + */ + simdjson_warn_unused simdjson_inline simdjson_result get_number() noexcept; + + + /** + * Get the raw JSON for this token. + * + * The string_view will always point into the input buffer. + * + * The string_view will start at the beginning of the token, and include the entire token + * *as well as all spaces until the next token (or EOF).* This means, for example, that a + * string token always begins with a " and is always terminated by the final ", possibly + * followed by a number of spaces. + * + * The string_view is *not* null-terminated. However, if this is a scalar (string, number, + * boolean, or null), the character after the end of the string_view is guaranteed to be + * a non-space token. + * + * Tokens include: + * - { + * - [ + * - "a string (possibly with UTF-8 or backslashed characters like \\\")". 
+ * - -1.2e-100 + * - true + * - false + * - null + */ + simdjson_inline std::string_view raw_json_token() noexcept; + + /** + * Returns the current location in the document if in bounds. + */ + simdjson_inline simdjson_result current_location() noexcept; + + /** + * Returns the current depth in the document if in bounds. + * + * E.g., + * 0 = finished with document + * 1 = document root value (could be [ or {, not yet known) + * 2 = , or } inside root array/object + * 3 = key or value inside root array/object. + */ + simdjson_inline int32_t current_depth() const noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. + * + * Calling at_pointer() on non-document instances (e.g., arrays and objects) is not + * standardized (by RFC 6901). We provide some experimental support for JSON pointers + * on non-document instances. Yet it is not the case when calling at_pointer on an array + * or an object instance: there is no rewind and no invalidation. + * + * You may only call at_pointer on an array after it has been created, but before it has + * been first accessed. When calling at_pointer on an array, the pointer is advanced to + * the location indicated by the JSON pointer (in case of success). It is no longer possible + * to call at_pointer on the same array. + * + * You may call at_pointer more than once on an object, but each time the pointer is advanced + * to be within the value matched by the key indicated by the JSON pointer query. Thus any preceding + * key (as well as the current key) can no longer be used with following JSON pointer calls. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + +protected: + /** + * Create a value. + */ + simdjson_inline value(const value_iterator &iter) noexcept; + + /** + * Skip this value, allowing iteration to continue. + */ + simdjson_inline void skip() noexcept; + + /** + * Start a value at the current position. + * + * (It should already be started; this is just a self-documentation method.) + */ + static simdjson_inline value start(const value_iterator &iter) noexcept; + + /** + * Resume a value. 
+ */ + static simdjson_inline value resume(const value_iterator &iter) noexcept; + + /** + * Get the object, starting or resuming it as necessary + */ + simdjson_inline simdjson_result start_or_resume_object() noexcept; + + // simdjson_inline void log_value(const char *type) const noexcept; + // simdjson_inline void log_error(const char *message) const noexcept; + + value_iterator iter{}; + + friend class document; + friend class array_iterator; + friend class field; + friend class object; + friend struct simdjson_result; + friend struct simdjson_result; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result get_array() noexcept; + simdjson_inline simdjson_result get_object() noexcept; + + simdjson_inline simdjson_result get_uint64() noexcept; + simdjson_inline simdjson_result get_uint64_in_string() noexcept; + simdjson_inline simdjson_result get_int64() noexcept; + simdjson_inline simdjson_result get_int64_in_string() noexcept; + simdjson_inline simdjson_result get_double() noexcept; + simdjson_inline simdjson_result get_double_in_string() noexcept; + simdjson_inline simdjson_result get_string() noexcept; + simdjson_inline simdjson_result get_raw_json_string() noexcept; + simdjson_inline simdjson_result get_bool() noexcept; + simdjson_inline simdjson_result is_null() noexcept; + + template simdjson_inline simdjson_result get() noexcept; + + template simdjson_inline error_code get(T &out) noexcept; + +#if SIMDJSON_EXCEPTIONS + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() noexcept(false); + simdjson_inline operator uint64_t() noexcept(false); + simdjson_inline operator int64_t() noexcept(false); + simdjson_inline operator double() noexcept(false); + simdjson_inline operator std::string_view() noexcept(false); + simdjson_inline operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false); + simdjson_inline operator bool() noexcept(false); +#endif + simdjson_inline simdjson_result count_elements() & noexcept; + simdjson_inline simdjson_result count_fields() & noexcept; + simdjson_inline simdjson_result at(size_t index) noexcept; + simdjson_inline simdjson_result begin() & noexcept; + simdjson_inline simdjson_result end() & noexcept; + + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. + * + * @param key The key to look up. 
+ * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field(const char *key) noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. + * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field wasn't there when they aren't). + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result find_field_unordered(const char *key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) noexcept; */ + simdjson_inline simdjson_result operator[](const char *key) noexcept; + + /** + * Get the type of this JSON value. + * + * NOTE: If you're only expecting a value to be one type (a typical case), it's generally + * better to just call .get_double, .get_string, etc. and check for INCORRECT_TYPE (or just + * let it throw an exception). + */ + simdjson_inline simdjson_result type() noexcept; + simdjson_inline simdjson_result is_scalar() noexcept; + simdjson_inline simdjson_result is_negative() noexcept; + simdjson_inline simdjson_result is_integer() noexcept; + simdjson_inline simdjson_result get_number_type() noexcept; + simdjson_inline simdjson_result get_number() noexcept; + + /** @copydoc simdjson_inline std::string_view value::raw_json_token() const noexcept */ + simdjson_inline simdjson_result raw_json_token() noexcept; + + /** @copydoc simdjson_inline simdjson_result current_location() noexcept */ + simdjson_inline simdjson_result current_location() noexcept; + /** @copydoc simdjson_inline int32_t current_depth() const noexcept */ + simdjson_inline simdjson_result current_depth() const noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/value.h */ +/* begin file include/simdjson/generic/ondemand/field.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +/** + * A JSON field (key/value pair) in an object. 
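+ *
+ * A brief sketch of the usual way fields are obtained, by iterating an object
+ * (hypothetical document named doc; error-code style):
+ *
+ * ```c++
+ * simdjson::ondemand::object obj;
+ * if (!doc.get_object().get(obj)) {
+ *   for (auto field : obj) {
+ *     std::string_view key;
+ *     double v;
+ *     if (!field.unescaped_key().get(key) && !field.value().get_double().get(v)) {
+ *       // key is consumed first, then the value
+ *     }
+ *   }
+ * }
+ * ```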
+ * + * Returned from object iteration. + * + * Extends from std::pair so you can use C++ algorithms that rely on pairs. + */ +class field : public std::pair { +public: + /** + * Create a new invalid field. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline field() noexcept; + + /** + * Get the key as a string_view (for higher speed, consider raw_key). + * We deliberately use a more cumbersome name (unescaped_key) to force users + * to think twice about using it. + * + * This consumes the key: once you have called unescaped_key(), you cannot + * call it again nor can you call key(). + */ + simdjson_inline simdjson_warn_unused simdjson_result unescaped_key() noexcept; + /** + * Get the key as a raw_json_string. Can be used for direct comparison with + * an unescaped C string: e.g., key() == "test". + */ + simdjson_inline raw_json_string key() const noexcept; + /** + * Get the field value. + */ + simdjson_inline ondemand::value &value() & noexcept; + /** + * @overload ondemand::value &ondemand::value() & noexcept + */ + simdjson_inline ondemand::value value() && noexcept; + +protected: + simdjson_inline field(raw_json_string key, ondemand::value &&value) noexcept; + static simdjson_inline simdjson_result start(value_iterator &parent_iter) noexcept; + static simdjson_inline simdjson_result start(const value_iterator &parent_iter, raw_json_string key) noexcept; + friend struct simdjson_result; + friend class object_iterator; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::field &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result unescaped_key() noexcept; + simdjson_inline simdjson_result key() noexcept; + simdjson_inline simdjson_result value() noexcept; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/field.h */ +/* begin file include/simdjson/generic/ondemand/object.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +/** + * A forward-only JSON object field iterator. + */ +class object { +public: + /** + * Create a new invalid object. + * + * Exists so you can declare a variable and later assign to it before use. + */ + simdjson_inline object() noexcept = default; + + simdjson_inline simdjson_result begin() noexcept; + simdjson_inline simdjson_result end() noexcept; + /** + * Look up a field by name on an object (order-sensitive). + * + * The following code reads z, then y, then x, and thus will not retrieve x or y if fed the + * JSON `{ "x": 1, "y": 2, "z": 3 }`: + * + * ```c++ + * simdjson::ondemand::parser parser; + * auto obj = parser.parse(R"( { "x": 1, "y": 2, "z": 3 } )"_padded); + * double z = obj.find_field("z"); + * double y = obj.find_field("y"); + * double x = obj.find_field("x"); + * ``` + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + * + * **Raw Keys:** The lookup will be done against the *raw* key, and will not unescape keys. + * e.g. `object["a"]` will match `{ "a": 1 }`, but will *not* match `{ "\u0061": 1 }`. 
+ * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. The value instance you get + * from `content["bids"]` becomes invalid when you call `content["asks"]`. The array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to a + * key a single time. Doing object["mykey"].to_string() and then again object["mykey"].to_string() + * is an error. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. + */ + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field(std::string_view key) && noexcept; + + /** + * Look up a field by name on an object, without regard to key order. + * + * **Performance Notes:** This is a bit less performant than find_field(), though its effect varies + * and often appears negligible. It starts out normally, starting out at the last field; but if + * the field is not found, it scans from the beginning of the object to see if it missed it. That + * missing case has a non-cache-friendly bump and lots of extra scanning, especially if the object + * in question is large. The fact that the extra code is there also bumps the executable size. + * + * It is the default, however, because it would be highly surprising (and hard to debug) if the + * default behavior failed to look up a field just because it was in the wrong order--and many + * APIs assume this. Therefore, you must be explicit if you want to treat objects as out of order. + * + * Use find_field() if you are sure fields will be in order (or are willing to treat it as if the + * field wasn't there when they aren't). + * + * If you have multiple fields with a matching key ({"x": 1, "x": 1}) be mindful + * that only one field is returned. + * + * You must consume the fields on an object one at a time. A request for a new key + * invalidates previous field values: it makes them unsafe. The value instance you get + * from `content["bids"]` becomes invalid when you call `content["asks"]`. The array + * given by content["bids"].get_array() should not be accessed after you have called + * content["asks"].get_array(). You can detect such mistakes by first compiling and running + * the code in Debug mode (or with the macro `SIMDJSON_DEVELOPMENT_CHECKS` set to 1): an + * OUT_OF_ORDER_ITERATION error is generated. + * + * You are expected to access keys only once. You should access the value corresponding to a key + * a single time. Doing object["mykey"].to_string() and then again object["mykey"].to_string() is an error. + * + * @param key The key to look up. + * @returns The value of the field, or NO_SUCH_FIELD if the field is not in the object. 
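+ *
+ * A small sketch of out-of-order lookups via operator[] (hypothetical keys;
+ * error-code style):
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto doc = parser.iterate(R"({ "width": 256, "height": 128 })"_padded);
+ * simdjson::ondemand::object obj;
+ * if (!doc.get_object().get(obj)) {
+ *   uint64_t h = 0, w = 0;
+ *   obj["height"].get(h);  // unordered: asking for "height" before "width" is fine
+ *   obj["width"].get(w);
+ * }
+ * ```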
+ */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result find_field_unordered(std::string_view key) && noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + /** @overload simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; */ + simdjson_inline simdjson_result operator[](std::string_view key) && noexcept; + + /** + * Get the value associated with the given JSON pointer. We use the RFC 6901 + * https://tools.ietf.org/html/rfc6901 standard, interpreting the current node + * as the root of its own JSON document. + * + * ondemand::parser parser; + * auto json = R"({ "foo": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/foo/a/1") == 20 + * + * It is allowed for a key to be the empty string: + * + * ondemand::parser parser; + * auto json = R"({ "": { "a": [ 10, 20, 30 ] }})"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("//a/1") == 20 + * + * Note that at_pointer() called on the document automatically calls the document's rewind + * method between each call. It invalidates all previously accessed arrays, objects and values + * that have not been consumed. Yet it is not the case when calling at_pointer on an object + * instance: there is no rewind and no invalidation. + * + * You may call at_pointer more than once on an object, but each time the pointer is advanced + * to be within the value matched by the key indicated by the JSON pointer query. Thus any preceding + * key (as well as the current key) can no longer be used with following JSON pointer calls. + * + * Also note that at_pointer() relies on find_field() which implies that we do not unescape keys when matching. + * + * @return The value associated with the given JSON pointer, or: + * - NO_SUCH_FIELD if a field does not exist in an object + * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length + * - INCORRECT_TYPE if a non-integer is used to access an array + * - INVALID_JSON_POINTER if the JSON pointer is invalid and cannot be parsed + */ + inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + + /** + * Reset the iterator so that we are pointing back at the + * beginning of the object. You should still consume values only once even if you + * can iterate through the object more than once. If you unescape a string within + * the object more than once, you have unsafe code. Note that rewinding an object + * means that you may need to reparse it anew: it is not a free operation. + * + * @returns true if the object contains some elements (not empty) + */ + inline simdjson_result reset() & noexcept; + /** + * This method scans the beginning of the object and checks whether the + * object is empty. + * The runtime complexity is constant time. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + */ + inline simdjson_result is_empty() & noexcept; + /** + * This method scans the object and counts the number of key-value pairs. 
+ * The count_fields method should always be called before you have begun + * iterating through the object: it is expected that you are pointing at + * the beginning of the object. + * The runtime complexity is linear in the size of the object. After + * calling this function, if successful, the object is 'rewinded' at its + * beginning as if it had never been accessed. If the JSON is malformed (e.g., + * there is a missing comma), then an error is returned and it is no longer + * safe to continue. + * + * To check that an object is empty, it is more performant to use + * the is_empty() method. + * + * Performance hint: You should only call count_fields() as a last + * resort as it may require scanning the document twice or more. + */ + simdjson_inline simdjson_result count_fields() & noexcept; + /** + * Consumes the object and returns a string_view instance corresponding to the + * object as represented in JSON. It points inside the original byte array containing + * the JSON document. + */ + simdjson_inline simdjson_result raw_json() noexcept; + +protected: + /** + * Go to the end of the object, no matter where you are right now. + */ + simdjson_inline error_code consume() noexcept; + static simdjson_inline simdjson_result start(value_iterator &iter) noexcept; + static simdjson_inline simdjson_result start_root(value_iterator &iter) noexcept; + static simdjson_inline simdjson_result started(value_iterator &iter) noexcept; + static simdjson_inline object resume(const value_iterator &iter) noexcept; + simdjson_inline object(const value_iterator &iter) noexcept; + + simdjson_warn_unused simdjson_inline error_code find_field_raw(const std::string_view key) noexcept; + + value_iterator iter{}; + + friend class value; + friend class document; + friend struct simdjson_result; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; + + simdjson_inline simdjson_result begin() noexcept; + simdjson_inline simdjson_result end() noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field(std::string_view key) && noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) & noexcept; + simdjson_inline simdjson_result find_field_unordered(std::string_view key) && noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) & noexcept; + simdjson_inline simdjson_result operator[](std::string_view key) && noexcept; + simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; + inline simdjson_result reset() noexcept; + inline simdjson_result is_empty() noexcept; + inline simdjson_result count_fields() & noexcept; + +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/object.h */ +/* begin file include/simdjson/generic/ondemand/parser.h */ + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class array; +class object; +class value; +class raw_json_string; +class document_stream; + +/** + * The default batch size for document_stream instances for this On Demand 
kernel. + * Note that different On Demand kernel may use a different DEFAULT_BATCH_SIZE value + * in the future. + */ +static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; +/** + * Some adversary might try to set the batch size to 0 or 1, which might cause problems. + * We set a minimum of 32B since anything else is highly likely to be an error. In practice, + * most users will want a much larger batch size. + * + * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON + * document can ever span 0 or 1 byte and that very large values would create memory allocation issues. + */ +static constexpr size_t MINIMAL_BATCH_SIZE = 32; + +/** + * A JSON fragment iterator. + * + * This holds the actual iterator as well as the buffer for writing strings. + */ +class parser { +public: + /** + * Create a JSON parser. + * + * The new parser will have zero capacity. + */ + inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; + + inline parser(parser &&other) noexcept = default; + simdjson_inline parser(const parser &other) = delete; + simdjson_inline parser &operator=(const parser &other) = delete; + simdjson_inline parser &operator=(parser &&other) noexcept = default; + + /** Deallocate the JSON parser. */ + inline ~parser() noexcept = default; + + /** + * Start iterating an on-demand JSON document. + * + * ondemand::parser parser; + * document doc = parser.iterate(json); + * + * It is expected that the content is a valid UTF-8 file, containing a valid JSON document. + * Otherwise the iterate method may return an error. In particular, the whole input should be + * valid: we do not attempt to tolerate incorrect content either before or after a JSON + * document. + * + * ### IMPORTANT: Validate what you use + * + * Calling iterate on an invalid JSON document may not immediately trigger an error. The call to + * iterate does not parse and validate the whole document. + * + * ### IMPORTANT: Buffer Lifetime + * + * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as + * long as the document iteration. + * + * ### IMPORTANT: Document Lifetime + * + * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during + * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before + * you call parse() again or destroy the parser. + * + * ### REQUIRED: Buffer Padding + * + * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what + * those bytes are initialized to, as long as they are allocated. + * + * @param json The JSON to parse. + * @param len The length of the JSON. + * @param capacity The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING). + * + * @return The document, or an error: + * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes. + * - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory + * allocation fails. + * - EMPTY if the document is all whitespace. + * - UTF8_ERROR if the document is not valid UTF-8. + * - UNESCAPED_CHARS if a string contains control characters that must be escaped + * - UNCLOSED_STRING if there is an unclosed string in the document. 
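+ *
+ * (Editor's note: the sketch below is an illustrative addition, not upstream text. The file name
+ * "twitter.json" and the "count" field are placeholders; padded_string::load() allocates the
+ * required SIMDJSON_PADDING for you.)
+ *
+ * ```c++
+ * #include <simdjson.h>
+ * #include <iostream>
+ *
+ * int main() {
+ *   simdjson::padded_string json;
+ *   if (simdjson::padded_string::load("twitter.json").get(json)) { return 1; }
+ *   simdjson::ondemand::parser parser;
+ *   simdjson::ondemand::document doc;
+ *   auto error = parser.iterate(json).get(doc); // keep 'json' and 'parser' alive while using 'doc'
+ *   if (error) { std::cerr << error << std::endl; return 1; }
+ *   uint64_t count;
+ *   error = doc["count"].get(count); // hypothetical field in the placeholder file
+ *   if (!error) { std::cout << count << std::endl; }
+ *   return 0;
+ * }
+ * ```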
+ */
+ simdjson_warn_unused simdjson_result<document> iterate(const char *json, size_t len, size_t capacity) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(padded_string_view json) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(const uint8_t *json, size_t len, size_t capacity) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(std::string_view json, size_t capacity) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(const std::string &json) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string> &json) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string_view> &json) & noexcept;
+ /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
+ simdjson_warn_unused simdjson_result<document> iterate(padded_string &&json) & noexcept = delete;
+
+ /**
+ * @private
+ *
+ * Start iterating an on-demand JSON document.
+ *
+ *   ondemand::parser parser;
+ *   json_iterator doc = parser.iterate(json);
+ *
+ * ### IMPORTANT: Buffer Lifetime
+ *
+ * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as
+ * long as the document iteration.
+ *
+ * ### IMPORTANT: Document Lifetime
+ *
+ * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during
+ * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before
+ * you call parse() again or destroy the parser.
+ *
+ * The ondemand::document instance holds the iterator. The document must remain in scope
+ * while you are accessing instances of ondemand::value, ondemand::object, ondemand::array.
+ *
+ * ### REQUIRED: Buffer Padding
+ *
+ * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
+ * those bytes are initialized to, as long as they are allocated.
+ *
+ * @param json The JSON to parse.
+ *
+ * @return The iterator, or an error:
+ *         - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes.
+ *         - MEMALLOC if realloc_if_needed the parser does not have enough capacity, and memory
+ *           allocation fails.
+ *         - EMPTY if the document is all whitespace.
+ *         - UTF8_ERROR if the document is not valid UTF-8.
+ *         - UNESCAPED_CHARS if a string contains control characters that must be escaped
+ *         - UNCLOSED_STRING if there is an unclosed string in the document.
+ */
+ simdjson_warn_unused simdjson_result<json_iterator> iterate_raw(padded_string_view json) & noexcept;
+
+ /**
+ * Parse a buffer containing many JSON documents.
+ *
+ *   auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded;
+ *   ondemand::parser parser;
+ *   ondemand::document_stream docs = parser.iterate_many(json);
+ *   for (auto & doc : docs) {
+ *     std::cout << doc["foo"] << std::endl;
+ *   }
+ *   // Prints 1 2 3
+ *
+ * No copy of the input buffer is made.
+ *
+ * The function is lazy: it may be that no more than one JSON document at a time is parsed.
+ *
+ * The caller is responsible for ensuring that the input string data remains unchanged and is
+ * not deleted during the loop.
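+ *
+ * (Editor's note: an illustrative error-code variant of the example above, not upstream text;
+ * it assumes <simdjson.h> and <iostream> are included.)
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto json = R"( {"foo": 1} {"foo": 2} {"foo": 3} )"_padded;
+ * simdjson::ondemand::document_stream stream;
+ * auto error = parser.iterate_many(json).get(stream);
+ * if (error) { std::cerr << error << "\n"; }
+ * for (auto doc : stream) {
+ *   int64_t foo;
+ *   if (!doc["foo"].get(foo)) { std::cout << foo << "\n"; } // skips documents that fail to parse
+ * }
+ * ```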
+ *
+ * ### Format
+ *
+ * The buffer must contain a series of one or more JSON documents, concatenated into a single
+ * buffer, separated by ASCII whitespace. It effectively parses until it has a fully valid document,
+ * then starts parsing the next document at that point. (It does this with more parallelism and
+ * lookahead than you might think, though.)
+ *
+ * Documents that consist of an object or array may omit the whitespace between them, concatenating
+ * with no separator. Documents that consist of a single primitive (i.e. documents that are not
+ * arrays or objects) MUST be separated with ASCII whitespace.
+ *
+ * The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8).
+ *
+ * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
+ * Setting batch_size to excessively large or excessively small values may negatively impact
+ * performance.
+ *
+ * ### REQUIRED: Buffer Padding
+ *
+ * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
+ * those bytes are initialized to, as long as they are allocated.
+ *
+ * ### Threads
+ *
+ * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
+ * hood to do some lookahead.
+ *
+ * ### Parser Capacity
+ *
+ * If the parser's current capacity is less than batch_size, it will allocate enough capacity
+ * to handle it (up to max_capacity).
+ *
+ * @param buf The concatenated JSON to parse.
+ * @param len The length of the concatenated JSON.
+ * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
+ *                   spot is cache-related: small enough to fit in cache, yet big enough to
+ *                   parse as many documents as possible in one tight loop.
+ *                   Defaults to 1MB (DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests.
+ * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
+ *         - MEMALLOC if the parser does not have enough capacity and memory allocation fails
+ *         - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
+ *         - other json errors if parsing fails. You should not rely on these errors always being the same for the
+ *           same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware).
+ */
+ inline simdjson_result<document_stream> iterate_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
+ /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */
+ inline simdjson_result<document_stream> iterate_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
+ /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */
+ inline simdjson_result<document_stream> iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
+ inline simdjson_result<document_stream> iterate_many(const std::string &&s, size_t batch_size) = delete; // unsafe
+ /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */
+ inline simdjson_result<document_stream> iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
+ inline simdjson_result<document_stream> iterate_many(const padded_string &&s, size_t batch_size) = delete; // unsafe
+
+ /** @private We do not want to allow implicit conversion from C string to std::string.
*/ + simdjson_result iterate_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete; + + /** The capacity of this parser (the largest document it can process). */ + simdjson_inline size_t capacity() const noexcept; + /** The maximum capacity of this parser (the largest document it is allowed to process). */ + simdjson_inline size_t max_capacity() const noexcept; + simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; + /** + * The maximum depth of this parser (the most deeply nested objects and arrays it can process). + * This parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. + * The document's instance current_depth() method should be used to monitor the parsing + * depth and limit it if desired. + */ + simdjson_inline size_t max_depth() const noexcept; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * The max_depth parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. + * The document's instance current_depth() method should be used to monitor the parsing + * depth and limit it if desired. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept; + + #ifdef SIMDJSON_THREADS_ENABLED + /** + * The parser instance can use threads when they are available to speed up some + * operations. It is enabled by default. Changing this attribute will change the + * behavior of the parser for future operations. + */ + bool threaded{true}; + #endif + + /** + * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. + * The provided pointer is advanced to the end of the string by reference, and a string_view instance + * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least + * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. + * + * This unescape function is a low-level function. If you want a more user-friendly approach, you should + * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() + * instead of get_raw_json_string()). + * + * ## IMPORTANT: string_view lifetime + * + * The string_view is only valid as long as the bytes in dst. + * + * @param in input + * @param dst A pointer to a buffer at least large enough to write this string as well as + * an additional SIMDJSON_PADDING bytes. + * @return A string_view pointing at the unescaped string in dst + * @error STRING_ERROR if escapes are incorrect. 
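+ *
+ * (Editor's note: an illustrative sketch, not upstream text. The buffer sizing follows the rule
+ * stated above: input size plus SIMDJSON_PADDING. It assumes <simdjson.h> and <memory> are
+ * included and that exceptions are enabled.)
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto json = R"( { "key": "line1\nline2" } )"_padded;
+ * auto doc = parser.iterate(json);
+ * simdjson::ondemand::raw_json_string raw = doc["key"].get_raw_json_string();
+ * std::unique_ptr<uint8_t[]> buffer(new uint8_t[json.size() + simdjson::SIMDJSON_PADDING]);
+ * uint8_t *dst = buffer.get();
+ * std::string_view unescaped = parser.unescape(raw, dst); // dst is advanced past the written bytes
+ * ```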
+ */ + simdjson_inline simdjson_result unescape(raw_json_string in, uint8_t *&dst) const noexcept; +private: + /** @private [for benchmarking access] The implementation to use */ + std::unique_ptr implementation{}; + size_t _capacity{0}; + size_t _max_capacity; + size_t _max_depth{DEFAULT_MAX_DEPTH}; + std::unique_ptr string_buf{}; +#if SIMDJSON_DEVELOPMENT_CHECKS + std::unique_ptr start_positions{}; +#endif + + friend class json_iterator; + friend class document_stream; +}; + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::parser &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/parser.h */ +/* begin file include/simdjson/generic/ondemand/document_stream.h */ +#ifdef SIMDJSON_THREADS_ENABLED +#include +#include +#include +#endif + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +class parser; +class json_iterator; +class document; + +#ifdef SIMDJSON_THREADS_ENABLED +/** @private Custom worker class **/ +struct stage1_worker { + stage1_worker() noexcept = default; + stage1_worker(const stage1_worker&) = delete; + stage1_worker(stage1_worker&&) = delete; + stage1_worker operator=(const stage1_worker&) = delete; + ~stage1_worker(); + /** + * We only start the thread when it is needed, not at object construction, this may throw. + * You should only call this once. + **/ + void start_thread(); + /** + * Start a stage 1 job. You should first call 'run', then 'finish'. + * You must call start_thread once before. + */ + void run(document_stream * ds, parser * stage1, size_t next_batch_start); + /** Wait for the run to finish (blocking). You should first call 'run', then 'finish'. **/ + void finish(); + +private: + + /** + * Normally, we would never stop the thread. But we do in the destructor. + * This function is only safe assuming that you are not waiting for results. You + * should have called run, then finish, and be done. + **/ + void stop_thread(); + + std::thread thread{}; + /** These three variables define the work done by the thread. **/ + ondemand::parser * stage1_thread_parser{}; + size_t _next_batch_start{}; + document_stream * owner{}; + /** + * We have two state variables. This could be streamlined to one variable in the future but + * we use two for clarity. + */ + bool has_work{false}; + bool can_work{true}; + + /** + * We lock using a mutex. + */ + std::mutex locking_mutex{}; + std::condition_variable cond_var{}; + + friend class document_stream; +}; +#endif // SIMDJSON_THREADS_ENABLED + +/** + * A forward-only stream of documents. + * + * Produced by parser::iterate_many. + * + */ +class document_stream { +public: + /** + * Construct an uninitialized document_stream. + * + * ```c++ + * document_stream docs; + * auto error = parser.iterate_many(json).get(docs); + * ``` + */ + simdjson_inline document_stream() noexcept; + /** Move one document_stream to another. */ + simdjson_inline document_stream(document_stream &&other) noexcept = default; + /** Move one document_stream to another. 
*/ + simdjson_inline document_stream &operator=(document_stream &&other) noexcept = default; + + simdjson_inline ~document_stream() noexcept; + + /** + * Returns the input size in bytes. + */ + inline size_t size_in_bytes() const noexcept; + + /** + * After iterating through the stream, this method + * returns the number of bytes that were not parsed at the end + * of the stream. If truncated_bytes() differs from zero, + * then the input was truncated maybe because incomplete JSON + * documents were found at the end of the stream. You + * may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()). + * + * You should only call truncated_bytes() after streaming through all + * documents, like so: + * + * document_stream stream = parser.iterate_many(json,window); + * for(auto & doc : stream) { + * // do something with doc + * } + * size_t truncated = stream.truncated_bytes(); + * + */ + inline size_t truncated_bytes() const noexcept; + + class iterator { + public: + using value_type = simdjson_result; + using reference = value_type; + + using difference_type = std::ptrdiff_t; + + using iterator_category = std::input_iterator_tag; + + /** + * Default constructor. + */ + simdjson_inline iterator() noexcept; + /** + * Get the current document (or error). + */ + simdjson_inline simdjson_result operator*() noexcept; + /** + * Advance to the next document (prefix). + */ + inline iterator& operator++() noexcept; + /** + * Check if we're at the end yet. + * @param other the end iterator to compare to. + */ + simdjson_inline bool operator!=(const iterator &other) const noexcept; + /** + * @private + * + * Gives the current index in the input document in bytes. + * + * document_stream stream = parser.parse_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * auto doc = *i; + * size_t index = i.current_index(); + * } + * + * This function (current_index()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + */ + simdjson_inline size_t current_index() const noexcept; + + /** + * @private + * + * Gives a view of the current document at the current position. + * + * document_stream stream = parser.iterate_many(json,window); + * for(auto i = stream.begin(); i != stream.end(); ++i) { + * std::string_view v = i.source(); + * } + * + * The returned string_view instance is simply a map to the (unparsed) + * source string: it may thus include white-space characters and all manner + * of padding. + * + * This function (source()) is experimental and the usage + * may change in future versions of simdjson: we find the API somewhat + * awkward and we would like to offer something friendlier. + * + */ + simdjson_inline std::string_view source() const noexcept; + + /** + * Returns error of the stream (if any). + */ + inline error_code error() const noexcept; + + private: + simdjson_inline iterator(document_stream *s, bool finished) noexcept; + /** The document_stream we're iterating through. */ + document_stream* stream; + /** Whether we're finished or not. */ + bool finished; + + friend class document; + friend class document_stream; + friend class json_iterator; + }; + + /** + * Start iterating the documents in the stream. + */ + simdjson_inline iterator begin() noexcept; + /** + * The end of the stream, for iterator comparison purposes. 
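+ *
+ * (Editor's note: an illustrative sketch, not upstream text, putting begin()/end() iteration and
+ * truncated_bytes() together; the trailing document is deliberately cut off. It assumes
+ * <simdjson.h> and <iostream> are included.)
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto json = R"( {"k": 1} {"k": 2} {"k": )"_padded; // last document is truncated
+ * simdjson::ondemand::document_stream stream;
+ * if (!parser.iterate_many(json).get(stream)) {
+ *   for (auto doc : stream) {
+ *     // consume doc here
+ *   }
+ *   std::cout << stream.truncated_bytes() << " unparsed bytes at the end\n";
+ * }
+ * ```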
+ */ + simdjson_inline iterator end() noexcept; + +private: + + document_stream &operator=(const document_stream &) = delete; // Disallow copying + document_stream(const document_stream &other) = delete; // Disallow copying + + /** + * Construct a document_stream. Does not allocate or parse anything until the iterator is + * used. + * + * @param parser is a reference to the parser instance used to generate this document_stream + * @param buf is the raw byte buffer we need to process + * @param len is the length of the raw byte buffer in bytes + * @param batch_size is the size of the windows (must be strictly greater or equal to the largest JSON document) + */ + simdjson_inline document_stream( + ondemand::parser &parser, + const uint8_t *buf, + size_t len, + size_t batch_size + ) noexcept; + + /** + * Parse the first document in the buffer. Used by begin(), to handle allocation and + * initialization. + */ + inline void start() noexcept; + + /** + * Parse the next document found in the buffer previously given to document_stream. + * + * The content should be a valid JSON document encoded as UTF-8. If there is a + * UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are + * discouraged. + * + * You do NOT need to pre-allocate a parser. This function takes care of + * pre-allocating a capacity defined by the batch_size defined when creating the + * document_stream object. + * + * The function returns simdjson::EMPTY if there is no more data to be parsed. + * + * The function returns simdjson::SUCCESS (as integer = 0) in case of success + * and indicates that the buffer has successfully been parsed to the end. + * Every document it contained has been parsed without error. + * + * The function returns an error code from simdjson/simdjson.h in case of failure + * such as simdjson::CAPACITY, simdjson::MEMALLOC, simdjson::DEPTH_ERROR and so forth; + * the simdjson::error_message function converts these error codes into a string). + * + * You can also check validity by calling parser.is_valid(). The same parser can + * and should be reused for the other documents in the buffer. + */ + inline void next() noexcept; + + /** Move the json_iterator of the document to the location of the next document in the stream. */ + inline void next_document() noexcept; + + /** Get the next document index. */ + inline size_t next_batch_start() const noexcept; + + /** Pass the next batch through stage 1 with the given parser. */ + inline error_code run_stage1(ondemand::parser &p, size_t batch_start) noexcept; + + // Fields + ondemand::parser *parser; + const uint8_t *buf; + size_t len; + size_t batch_size; + /** + * We are going to use just one document instance. The document owns + * the json_iterator. It implies that we only ever pass a reference + * to the document to the users. + */ + document doc{}; + /** The error (or lack thereof) from the current document. */ + error_code error; + size_t batch_start{0}; + size_t doc_index{}; + + #ifdef SIMDJSON_THREADS_ENABLED + /** Indicates whether we use threads. Note that this needs to be a constant during the execution of the parsing. */ + bool use_thread; + + inline void load_from_stage1_thread() noexcept; + + /** Start a thread to run stage 1 on the next batch. */ + inline void start_stage1_thread() noexcept; + + /** Wait for the stage 1 thread to finish and capture the results. */ + inline void finish_stage1_thread() noexcept; + + /** The error returned from the stage 1 thread. 
*/ + error_code stage1_thread_error{UNINITIALIZED}; + /** The thread used to run stage 1 against the next batch in the background. */ + std::unique_ptr worker{new(std::nothrow) stage1_worker()}; + /** + * The parser used to run stage 1 in the background. Will be swapped + * with the regular parser when finished. + */ + ondemand::parser stage1_thread_parser{}; + + friend struct stage1_worker; + #endif // SIMDJSON_THREADS_ENABLED + + friend class parser; + friend class document; + friend class json_iterator; + friend struct simdjson_result; + friend struct internal::simdjson_result_base; +}; // document_stream + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { +template<> +struct simdjson_result : public SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base { +public: + simdjson_inline simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_stream &&value) noexcept; ///< @private + simdjson_inline simdjson_result(error_code error) noexcept; ///< @private + simdjson_inline simdjson_result() noexcept = default; +}; + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/document_stream.h */ +/* begin file include/simdjson/generic/ondemand/serialization.h */ + +namespace simdjson { +/** + * Create a string-view instance out of a document instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& x) noexcept; +/** + * Create a string-view instance out of a value instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. The value must + * not have been accessed previously. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value& x) noexcept; +/** + * Create a string-view instance out of an object instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object& x) noexcept; +/** + * Create a string-view instance out of an array instance. The string-view instance + * contains JSON text that is suitable to be parsed as JSON again. It does not + * validate the content. + */ +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array& x) noexcept; +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +inline simdjson_result to_json_string(simdjson_result x); +} // namespace simdjson + +/** + * We want to support argument-dependent lookup (ADL). + * Hence we should define operator<< in the namespace + * where the argument (here value, object, etc.) resides. + * Credit: @madhur4127 + * See https://github.com/simdjson/simdjson/issues/1768 + */ +namespace simdjson { namespace SIMDJSON_BUILTIN_IMPLEMENTATION { namespace ondemand { + +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param x The element. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. 
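+ *
+ * (Editor's note: an illustrative sketch, not upstream text; it shows to_json_string() returning
+ * an unvalidated slice of the original buffer, assuming <simdjson.h> and <iostream> are included
+ * and exceptions are enabled.)
+ *
+ * ```c++
+ * simdjson::ondemand::parser parser;
+ * auto json = R"( { "a": [ 1, 2, 3 ] } )"_padded;
+ * simdjson::ondemand::document doc = parser.iterate(json);
+ * std::string_view body = simdjson::to_json_string(doc); // points into 'json', no copy is made
+ * std::cout << body << std::endl;
+ * ```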
+ */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value x); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param value The array. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x); +#endif +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference& value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x); +#endif +/** + * Print JSON to an output stream. It does not + * validate the content. + * + * @param out The output stream. + * @param value The object. + * @throw if there is an error with the underlying output stream. simdjson itself will not throw. + */ +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object value); +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x); +#endif +}}} // namespace simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand +/* end file include/simdjson/generic/ondemand/serialization.h */ +/* end file include/simdjson/generic/ondemand.h */ + +// Inline definitions +/* begin file include/simdjson/generic/implementation_simdjson_result_base-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { + +// +// internal::implementation_simdjson_result_base inline implementation +// + +template +simdjson_inline void implementation_simdjson_result_base::tie(T &value, error_code &error) && noexcept { + error = this->second; + if (!error) { + value = std::forward>(*this).first; + } +} + +template +simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base::get(T &value) && noexcept { + error_code error; + std::forward>(*this).tie(value, error); + return error; +} + +template +simdjson_inline error_code implementation_simdjson_result_base::error() const noexcept { + return this->second; +} + +#if SIMDJSON_EXCEPTIONS + +template +simdjson_inline T& implementation_simdjson_result_base::value() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return this->first; +} + +template +simdjson_inline T&& implementation_simdjson_result_base::value() && noexcept(false) { + return std::forward>(*this).take_value(); +} + +template +simdjson_inline T&& implementation_simdjson_result_base::take_value() && noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return std::forward(this->first); +} + +template +simdjson_inline implementation_simdjson_result_base::operator T&&() && 
noexcept(false) { + return std::forward>(*this).take_value(); +} + +#endif // SIMDJSON_EXCEPTIONS + +template +simdjson_inline const T& implementation_simdjson_result_base::value_unsafe() const& noexcept { + return this->first; +} + +template +simdjson_inline T& implementation_simdjson_result_base::value_unsafe() & noexcept { + return this->first; +} + +template +simdjson_inline T&& implementation_simdjson_result_base::value_unsafe() && noexcept { + return std::forward(this->first); +} + +template +simdjson_inline implementation_simdjson_result_base::implementation_simdjson_result_base(T &&value, error_code error) noexcept + : first{std::forward(value)}, second{error} {} +template +simdjson_inline implementation_simdjson_result_base::implementation_simdjson_result_base(error_code error) noexcept + : implementation_simdjson_result_base(T{}, error) {} +template +simdjson_inline implementation_simdjson_result_base::implementation_simdjson_result_base(T &&value) noexcept + : implementation_simdjson_result_base(std::forward(value), SUCCESS) {} + +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson +/* end file include/simdjson/generic/implementation_simdjson_result_base-inl.h */ +/* begin file include/simdjson/generic/ondemand-inl.h */ +/* begin file include/simdjson/generic/ondemand/json_type-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +inline std::ostream& operator<<(std::ostream& out, json_type type) noexcept { + switch (type) { + case json_type::array: out << "array"; break; + case json_type::object: out << "object"; break; + case json_type::number: out << "number"; break; + case json_type::string: out << "string"; break; + case json_type::boolean: out << "boolean"; break; + case json_type::null: out << "null"; break; + default: SIMDJSON_UNREACHABLE(); + } + return out; +} + +inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept { + switch (type) { + case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break; + case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break; + case number_type::floating_point_number: out << "floating-point number (binary64)"; break; + default: SIMDJSON_UNREACHABLE(); + } + return out; +} +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson_result &type) noexcept(false) { + return out << type.value(); +} +#endif + + + +simdjson_inline number_type number::get_number_type() const noexcept { + return type; +} + +simdjson_inline bool number::is_uint64() const noexcept { + return get_number_type() == number_type::unsigned_integer; +} + +simdjson_inline uint64_t number::get_uint64() const noexcept { + return payload.unsigned_integer; +} + +simdjson_inline number::operator uint64_t() const noexcept { + return get_uint64(); +} + + +simdjson_inline bool number::is_int64() const noexcept { + return get_number_type() == number_type::signed_integer; +} + +simdjson_inline int64_t number::get_int64() const noexcept { + return payload.signed_integer; +} + +simdjson_inline number::operator int64_t() const noexcept { + return get_int64(); +} + +simdjson_inline bool number::is_double() const noexcept { + return get_number_type() == number_type::floating_point_number; +} + +simdjson_inline double number::get_double() const noexcept { + return payload.floating_point_number; +} + +simdjson_inline number::operator double() const noexcept { + return 
get_double(); +} + +simdjson_inline double number::as_double() const noexcept { + if(is_double()) { + return payload.floating_point_number; + } + if(is_int64()) { + return double(payload.signed_integer); + } + return double(payload.unsigned_integer); +} + +simdjson_inline void number::append_s64(int64_t value) noexcept { + payload.signed_integer = value; + type = number_type::signed_integer; +} + +simdjson_inline void number::append_u64(uint64_t value) noexcept { + payload.unsigned_integer = value; + type = number_type::unsigned_integer; +} + +simdjson_inline void number::append_double(double value) noexcept { + payload.floating_point_number = value; + type = number_type::floating_point_number; +} + +simdjson_inline void number::skip_double() noexcept { + type = number_type::floating_point_number; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_type &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/json_type-inl.h */ +/* begin file include/simdjson/generic/ondemand/logger-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { +namespace logger { + +static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; +static constexpr const int LOG_EVENT_LEN = 20; +static constexpr const int LOG_BUFFER_LEN = 30; +static constexpr const int LOG_SMALL_BUFFER_LEN = 10; +static int log_depth = 0; // Not threadsafe. Log only. 
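+
+// Editor's note (illustrative addition, not upstream code): the ondemand::number accessors
+// implemented above are normally reached through value::get_number(), e.g.:
+//
+//   simdjson::ondemand::parser parser;
+//   auto json = R"( { "n": 18446744073709551615 } )"_padded;
+//   auto doc = parser.iterate(json);
+//   simdjson::ondemand::number n = doc["n"].get_number();
+//   if (n.is_uint64()) { uint64_t u = n.get_uint64(); }
+//   double approx = n.as_double(); // lossy for integers this large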
+ +// Helper to turn unprintable or newline characters into spaces +static inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } +} + +inline void log_event(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_line(iter, "", type, detail, delta, depth_delta); +} + +inline void log_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { + log_line(iter, index, depth, "", type, detail); +} +inline void log_value(const json_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_line(iter, "", type, detail, delta, depth_delta); +} + +inline void log_start_value(const json_iterator &iter, token_position index, depth_t depth, const char *type, std::string_view detail) noexcept { + log_line(iter, index, depth, "+", type, detail); + if (LOG_ENABLED) { log_depth++; } +} +inline void log_start_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + log_line(iter, "+", type, "", delta, depth_delta); + if (LOG_ENABLED) { log_depth++; } +} + +inline void log_end_value(const json_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + if (LOG_ENABLED) { log_depth--; } + log_line(iter, "-", type, "", delta, depth_delta); +} + +inline void log_error(const json_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { + log_line(iter, "ERROR: ", error, detail, delta, depth_delta); +} +inline void log_error(const json_iterator &iter, token_position index, depth_t depth, const char *error, const char *detail) noexcept { + log_line(iter, index, depth, "ERROR: ", error, detail); +} + +inline void log_event(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_event(iter.json_iter(), type, detail, delta, depth_delta); +} + +inline void log_value(const value_iterator &iter, const char *type, std::string_view detail, int delta, int depth_delta) noexcept { + log_value(iter.json_iter(), type, detail, delta, depth_delta); +} + +inline void log_start_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + log_start_value(iter.json_iter(), type, delta, depth_delta); +} + +inline void log_end_value(const value_iterator &iter, const char *type, int delta, int depth_delta) noexcept { + log_end_value(iter.json_iter(), type, delta, depth_delta); +} + +inline void log_error(const value_iterator &iter, const char *error, const char *detail, int delta, int depth_delta) noexcept { + log_error(iter.json_iter(), error, detail, delta, depth_delta); +} + +inline void log_headers() noexcept { + if (LOG_ENABLED) { + // Technically a static variable is not thread-safe, but if you are using threads + // and logging... well... + static bool displayed_hint{false}; + log_depth = 0; + printf("\n"); + if(!displayed_hint) { + // We only print this helpful header once. 
+ printf("# Logging provides the depth and position of the iterator user-visible steps:\n"); + printf("# +array says 'this is where we were when we discovered the start array'\n"); + printf("# -array says 'this is where we were when we ended the array'\n"); + printf("# skip says 'this is a structural or value I am skipping'\n"); + printf("# +/-skip says 'this is a start/end array or object I am skipping'\n"); + printf("#\n"); + printf("# The indentation of the terms (array, string,...) indicates the depth,\n"); + printf("# in addition to the depth being displayed.\n"); + printf("#\n"); + printf("# Every token in the document has a single depth determined by the tokens before it,\n"); + printf("# and is not affected by what the token actually is.\n"); + printf("#\n"); + printf("# Not all structural elements are presented as tokens in the logs.\n"); + printf("#\n"); + printf("# We never give control to the user within an empty array or an empty object.\n"); + printf("#\n"); + printf("# Inside an array, having a depth greater than the array's depth means that\n"); + printf("# we are pointing inside a value.\n"); + printf("# Having a depth equal to the array means that we are pointing right before a value.\n"); + printf("# Having a depth smaller than the array means that we have moved beyond the array.\n"); + displayed_hint = true; + } + printf("\n"); + printf("| %-*s ", LOG_EVENT_LEN, "Event"); + printf("| %-*s ", LOG_BUFFER_LEN, "Buffer"); + printf("| %-*s ", LOG_SMALL_BUFFER_LEN, "Next"); + // printf("| %-*s ", 5, "Next#"); + printf("| %-*s ", 5, "Depth"); + printf("| Detail "); + printf("|\n"); + + printf("|%.*s", LOG_EVENT_LEN+2, DASHES); + printf("|%.*s", LOG_BUFFER_LEN+2, DASHES); + printf("|%.*s", LOG_SMALL_BUFFER_LEN+2, DASHES); + // printf("|%.*s", 5+2, DASHES); + printf("|%.*s", 5+2, DASHES); + printf("|--------"); + printf("|\n"); + fflush(stdout); + } +} + +inline void log_line(const json_iterator &iter, const char *title_prefix, const char *title, std::string_view detail, int delta, int depth_delta) noexcept { + log_line(iter, iter.position()+delta, depth_t(iter.depth()+depth_delta), title_prefix, title, detail); +} +inline void log_line(const json_iterator &iter, token_position index, depth_t depth, const char *title_prefix, const char *title, std::string_view detail) noexcept { + if (LOG_ENABLED) { + const int indent = depth*2; + const auto buf = iter.token.buf; + printf("| %*s%s%-*s ", + indent, "", + title_prefix, + LOG_EVENT_LEN - indent - int(strlen(title_prefix)), title + ); + { + // Print the current structural. + printf("| "); + auto current_structural = &buf[*index]; + for (int i=0;i(buf); } + + +simdjson_inline bool raw_json_string::is_free_from_unescaped_quote(std::string_view target) noexcept { + size_t pos{0}; + // if the content has no escape character, just scan through it quickly! + for(;pos < target.size() && target[pos] != '\\';pos++) {} + // slow path may begin. + bool escaping{false}; + for(;pos < target.size();pos++) { + if((target[pos] == '"') && !escaping) { + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + return true; +} + +simdjson_inline bool raw_json_string::is_free_from_unescaped_quote(const char* target) noexcept { + size_t pos{0}; + // if the content has no escape character, just scan through it quickly! + for(;target[pos] && target[pos] != '\\';pos++) {} + // slow path may begin. 
+ bool escaping{false}; + for(;target[pos];pos++) { + if((target[pos] == '"') && !escaping) { + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + return true; +} + + +simdjson_inline bool raw_json_string::unsafe_is_equal(size_t length, std::string_view target) const noexcept { + // If we are going to call memcmp, then we must know something about the length of the raw_json_string. + return (length >= target.size()) && (raw()[target.size()] == '"') && !memcmp(raw(), target.data(), target.size()); +} + +simdjson_inline bool raw_json_string::unsafe_is_equal(std::string_view target) const noexcept { + // Assumptions: does not contain unescaped quote characters, and + // the raw content is quote terminated within a valid JSON string. + if(target.size() <= SIMDJSON_PADDING) { + return (raw()[target.size()] == '"') && !memcmp(raw(), target.data(), target.size()); + } + const char * r{raw()}; + size_t pos{0}; + for(;pos < target.size();pos++) { + if(r[pos] != target[pos]) { return false; } + } + if(r[pos] != '"') { return false; } + return true; +} + +simdjson_inline bool raw_json_string::is_equal(std::string_view target) const noexcept { + const char * r{raw()}; + size_t pos{0}; + bool escaping{false}; + for(;pos < target.size();pos++) { + if(r[pos] != target[pos]) { return false; } + // if target is a compile-time constant and it is free from + // quotes, then the next part could get optimized away through + // inlining. + if((target[pos] == '"') && !escaping) { + // We have reached the end of the raw_json_string but + // the target is not done. + return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + if(r[pos] != '"') { return false; } + return true; +} + + +simdjson_inline bool raw_json_string::unsafe_is_equal(const char * target) const noexcept { + // Assumptions: 'target' does not contain unescaped quote characters, is null terminated and + // the raw content is quote terminated within a valid JSON string. + const char * r{raw()}; + size_t pos{0}; + for(;target[pos];pos++) { + if(r[pos] != target[pos]) { return false; } + } + if(r[pos] != '"') { return false; } + return true; +} + +simdjson_inline bool raw_json_string::is_equal(const char* target) const noexcept { + // Assumptions: does not contain unescaped quote characters, and + // the raw content is quote terminated within a valid JSON string. + const char * r{raw()}; + size_t pos{0}; + bool escaping{false}; + for(;target[pos];pos++) { + if(r[pos] != target[pos]) { return false; } + // if target is a compile-time constant and it is free from + // quotes, then the next part could get optimized away through + // inlining. + if((target[pos] == '"') && !escaping) { + // We have reached the end of the raw_json_string but + // the target is not done. 
+ return false; + } else if(target[pos] == '\\') { + escaping = !escaping; + } else { + escaping = false; + } + } + if(r[pos] != '"') { return false; } + return true; +} + +simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept { + return a.unsafe_is_equal(c); +} + +simdjson_unused simdjson_inline bool operator==(std::string_view c, const raw_json_string &a) noexcept { + return a == c; +} + +simdjson_unused simdjson_inline bool operator!=(const raw_json_string &a, std::string_view c) noexcept { + return !(a == c); +} + +simdjson_unused simdjson_inline bool operator!=(std::string_view c, const raw_json_string &a) noexcept { + return !(a == c); +} + + +simdjson_inline simdjson_warn_unused simdjson_result raw_json_string::unescape(json_iterator &iter) const noexcept { + return iter.unescape(*this); +} + + +simdjson_unused simdjson_inline std::ostream &operator<<(std::ostream &out, const raw_json_string &str) noexcept { + bool in_escape = false; + const char *s = str.raw(); + while (true) { + switch (*s) { + case '\\': in_escape = !in_escape; break; + case '"': if (in_escape) { in_escape = false; } else { return out; } break; + default: if (in_escape) { in_escape = false; } + } + out << *s; + s++; + } +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +simdjson_inline simdjson_result simdjson_result::raw() const noexcept { + if (error()) { return error(); } + return first.raw(); +} +simdjson_inline simdjson_warn_unused simdjson_result simdjson_result::unescape(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_iterator &iter) const noexcept { + if (error()) { return error(); } + return first.unescape(iter); +} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/raw_json_string-inl.h */ +/* begin file include/simdjson/generic/ondemand/token_iterator-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline token_iterator::token_iterator( + const uint8_t *_buf, + token_position position +) noexcept : buf{_buf}, _position{position} +{ +} + +simdjson_inline uint32_t token_iterator::current_offset() const noexcept { + return *(_position); +} + + +simdjson_inline const uint8_t *token_iterator::return_current_and_advance() noexcept { + return &buf[*(_position++)]; +} + +simdjson_inline const uint8_t *token_iterator::peek(token_position position) const noexcept { + return &buf[*position]; +} +simdjson_inline uint32_t token_iterator::peek_index(token_position position) const noexcept { + return *position; +} +simdjson_inline uint32_t token_iterator::peek_length(token_position position) const noexcept { + return *(position+1) - *position; +} + +simdjson_inline const uint8_t *token_iterator::peek(int32_t delta) const noexcept { + return &buf[*(_position+delta)]; +} +simdjson_inline uint32_t token_iterator::peek_index(int32_t delta) const noexcept { + return *(_position+delta); +} +simdjson_inline uint32_t token_iterator::peek_length(int32_t delta) const noexcept { + return *(_position+delta+1) - *(_position+delta); +} + +simdjson_inline token_position token_iterator::position() const 
noexcept { + return _position; +} +simdjson_inline void token_iterator::set_position(token_position target_position) noexcept { + _position = target_position; +} + +simdjson_inline bool token_iterator::operator==(const token_iterator &other) const noexcept { + return _position == other._position; +} +simdjson_inline bool token_iterator::operator!=(const token_iterator &other) const noexcept { + return _position != other._position; +} +simdjson_inline bool token_iterator::operator>(const token_iterator &other) const noexcept { + return _position > other._position; +} +simdjson_inline bool token_iterator::operator>=(const token_iterator &other) const noexcept { + return _position >= other._position; +} +simdjson_inline bool token_iterator::operator<(const token_iterator &other) const noexcept { + return _position < other._position; +} +simdjson_inline bool token_iterator::operator<=(const token_iterator &other) const noexcept { + return _position <= other._position; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::token_iterator &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/token_iterator-inl.h */ +/* begin file include/simdjson/generic/ondemand/json_iterator-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline json_iterator::json_iterator(json_iterator &&other) noexcept + : token(std::forward(other.token)), + parser{other.parser}, + _string_buf_loc{other._string_buf_loc}, + error{other.error}, + _depth{other._depth}, + _root{other._root}, + _streaming{other._streaming} +{ + other.parser = nullptr; +} +simdjson_inline json_iterator &json_iterator::operator=(json_iterator &&other) noexcept { + token = other.token; + parser = other.parser; + _string_buf_loc = other._string_buf_loc; + error = other.error; + _depth = other._depth; + _root = other._root; + _streaming = other._streaming; + other.parser = nullptr; + return *this; +} + +simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser) noexcept + : token(buf, &_parser->implementation->structural_indexes[0]), + parser{_parser}, + _string_buf_loc{parser->string_buf.get()}, + _depth{1}, + _root{parser->implementation->structural_indexes.get()}, + _streaming{false} + +{ + logger::log_headers(); +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif +} + +inline void json_iterator::rewind() noexcept { + token.set_position( root_position() ); + logger::log_headers(); // We start again + _string_buf_loc = parser->string_buf.get(); + _depth = 1; +} + +inline bool json_iterator::balanced() const noexcept { + token_iterator ti(token); + int32_t count{0}; + ti.set_position( root_position() ); + while(ti.peek() <= peek_last()) { + switch (*ti.return_current_and_advance()) + { + case '[': case '{': + count++; + break; + case ']': case '}': + count--; + break; + default: + break; + } + } + return count == 0; +} + + +// GCC 7 warns when the first line of this function is inlined away into oblivion due to the caller +// relating depth and parent_depth, which is a desired effect. 
The warning does not show up if the +// skip_child() function is not marked inline). +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_warn_unused simdjson_inline error_code json_iterator::skip_child(depth_t parent_depth) noexcept { + if (depth() <= parent_depth) { return SUCCESS; } + switch (*return_current_and_advance()) { + // TODO consider whether matching braces is a requirement: if non-matching braces indicates + // *missing* braces, then future lookups are not in the object/arrays they think they are, + // violating the rule "validate enough structure that the user can be confident they are + // looking at the right values." + // PERF TODO we can eliminate the switch here with a lookup of how much to add to depth + + // For the first open array/object in a value, we've already incremented depth, so keep it the same + // We never stop at colon, but if we did, it wouldn't affect depth + case '[': case '{': case ':': + logger::log_start_value(*this, "skip"); + break; + // If there is a comma, we have just finished a value in an array/object, and need to get back in + case ',': + logger::log_value(*this, "skip"); + break; + // ] or } means we just finished a value and need to jump out of the array/object + case ']': case '}': + logger::log_end_value(*this, "skip"); + _depth--; + if (depth() <= parent_depth) { return SUCCESS; } +#if SIMDJSON_CHECK_EOF + // If there are no more tokens, the parent is incomplete. + if (at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "Missing [ or { at start"); } +#endif // SIMDJSON_CHECK_EOF + break; + case '"': + if(*peek() == ':') { + // We are at a key!!! + // This might happen if you just started an object and you skip it immediately. + // Performance note: it would be nice to get rid of this check as it is somewhat + // expensive. + // https://github.com/simdjson/simdjson/issues/1742 + logger::log_value(*this, "key"); + return_current_and_advance(); // eat up the ':' + break; // important!!! + } + simdjson_fallthrough; + // Anything else must be a scalar value + default: + // For the first scalar, we will have incremented depth already, so we decrement it here. + logger::log_value(*this, "skip"); + _depth--; + if (depth() <= parent_depth) { return SUCCESS; } + break; + } + + // Now that we've considered the first value, we only increment/decrement for arrays/objects + while (position() < end_position()) { + switch (*return_current_and_advance()) { + case '[': case '{': + logger::log_start_value(*this, "skip"); + _depth++; + break; + // TODO consider whether matching braces is a requirement: if non-matching braces indicates + // *missing* braces, then future lookups are not in the object/arrays they think they are, + // violating the rule "validate enough structure that the user can be confident they are + // looking at the right values." 
+ // PERF TODO we can eliminate the switch here with a lookup of how much to add to depth + case ']': case '}': + logger::log_end_value(*this, "skip"); + _depth--; + if (depth() <= parent_depth) { return SUCCESS; } + break; + default: + logger::log_value(*this, "skip", ""); + break; + } + } + + return report_error(TAPE_ERROR, "not enough close braces"); +} + +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_inline bool json_iterator::at_root() const noexcept { + return position() == root_position(); +} + +simdjson_inline bool json_iterator::is_single_token() const noexcept { + return parser->implementation->n_structural_indexes == 1; +} + +simdjson_inline bool json_iterator::streaming() const noexcept { + return _streaming; +} + +simdjson_inline token_position json_iterator::root_position() const noexcept { + return _root; +} + +simdjson_inline void json_iterator::assert_at_document_depth() const noexcept { + SIMDJSON_ASSUME( _depth == 1 ); +} + +simdjson_inline void json_iterator::assert_at_root() const noexcept { + SIMDJSON_ASSUME( _depth == 1 ); +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + // Under Visual Studio, the next SIMDJSON_ASSUME fails with: the argument + // has side effects that will be discarded. + SIMDJSON_ASSUME( token.position() == _root ); +#endif +} + +simdjson_inline void json_iterator::assert_more_tokens(uint32_t required_tokens) const noexcept { + assert_valid_position(token._position + required_tokens - 1); +} + +simdjson_inline void json_iterator::assert_valid_position(token_position position) const noexcept { +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + SIMDJSON_ASSUME( position >= &parser->implementation->structural_indexes[0] ); + SIMDJSON_ASSUME( position < &parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] ); +#endif +} + +simdjson_inline bool json_iterator::at_end() const noexcept { + return position() == end_position(); +} +simdjson_inline token_position json_iterator::end_position() const noexcept { + uint32_t n_structural_indexes{parser->implementation->n_structural_indexes}; + return &parser->implementation->structural_indexes[n_structural_indexes]; +} + +inline std::string json_iterator::to_string() const noexcept { + if( !is_alive() ) { return "dead json_iterator instance"; } + const char * current_structural = reinterpret_cast(token.peek()); + return std::string("json_iterator [ depth : ") + std::to_string(_depth) + + std::string(", structural : '") + std::string(current_structural,1) + + std::string("', offset : ") + std::to_string(token.current_offset()) + + std::string("', error : ") + error_message(error) + + std::string(" ]"); +} + +inline simdjson_result json_iterator::current_location() noexcept { + if (!is_alive()) { // Unrecoverable error + if (!at_root()) { + return reinterpret_cast(token.peek(-1)); + } else { + return reinterpret_cast(token.peek()); + } + } + if (at_end()) { + return OUT_OF_BOUNDS; + } + return reinterpret_cast(token.peek()); +} + +simdjson_inline bool json_iterator::is_alive() const noexcept { + return parser; +} + +simdjson_inline void json_iterator::abandon() noexcept { + parser = nullptr; + _depth = 0; +} + +simdjson_inline const uint8_t *json_iterator::return_current_and_advance() noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(); +#endif // SIMDJSON_CHECK_EOF + return token.return_current_and_advance(); +} + +simdjson_inline const uint8_t *json_iterator::unsafe_pointer() const noexcept { + // deliberately done without safety guard: + return token.peek(0); +} + +simdjson_inline const uint8_t 
*json_iterator::peek(int32_t delta) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(delta+1); +#endif // SIMDJSON_CHECK_EOF + return token.peek(delta); +} + +simdjson_inline uint32_t json_iterator::peek_length(int32_t delta) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_more_tokens(delta+1); +#endif // #if SIMDJSON_CHECK_EOF + return token.peek_length(delta); +} + +simdjson_inline const uint8_t *json_iterator::peek(token_position position) const noexcept { + // todo: currently we require end-of-string buffering, but the following + // assert_valid_position should be turned on if/when we lift that condition. + // assert_valid_position(position); + // This is almost surely related to SIMDJSON_CHECK_EOF but given that SIMDJSON_CHECK_EOF + // is ON by default, we have no choice but to disable it for real with a comment. + return token.peek(position); +} + +simdjson_inline uint32_t json_iterator::peek_length(token_position position) const noexcept { +#if SIMDJSON_CHECK_EOF + assert_valid_position(position); +#endif // SIMDJSON_CHECK_EOF + return token.peek_length(position); +} + +simdjson_inline token_position json_iterator::last_position() const noexcept { + // The following line fails under some compilers... + // SIMDJSON_ASSUME(parser->implementation->n_structural_indexes > 0); + // since it has side-effects. + uint32_t n_structural_indexes{parser->implementation->n_structural_indexes}; + SIMDJSON_ASSUME(n_structural_indexes > 0); + return &parser->implementation->structural_indexes[n_structural_indexes - 1]; +} +simdjson_inline const uint8_t *json_iterator::peek_last() const noexcept { + return token.peek(last_position()); +} + +simdjson_inline void json_iterator::ascend_to(depth_t parent_depth) noexcept { + SIMDJSON_ASSUME(parent_depth >= 0 && parent_depth < INT32_MAX - 1); + SIMDJSON_ASSUME(_depth == parent_depth + 1); + _depth = parent_depth; +} + +simdjson_inline void json_iterator::descend_to(depth_t child_depth) noexcept { + SIMDJSON_ASSUME(child_depth >= 1 && child_depth < INT32_MAX); + SIMDJSON_ASSUME(_depth == child_depth - 1); + _depth = child_depth; +} + +simdjson_inline depth_t json_iterator::depth() const noexcept { + return _depth; +} + +simdjson_inline uint8_t *&json_iterator::string_buf_loc() noexcept { + return _string_buf_loc; +} + +simdjson_inline error_code json_iterator::report_error(error_code _error, const char *message) noexcept { + SIMDJSON_ASSUME(_error != SUCCESS && _error != UNINITIALIZED && _error != INCORRECT_TYPE && _error != NO_SUCH_FIELD); + logger::log_error(*this, message); + error = _error; + return error; +} + +simdjson_inline token_position json_iterator::position() const noexcept { + return token.position(); +} + +simdjson_inline simdjson_result json_iterator::unescape(raw_json_string in) noexcept { + return parser->unescape(in, _string_buf_loc); +} + +simdjson_inline void json_iterator::reenter_child(token_position position, depth_t child_depth) noexcept { + SIMDJSON_ASSUME(child_depth >= 1 && child_depth < INT32_MAX); + SIMDJSON_ASSUME(_depth == child_depth - 1); +#if SIMDJSON_DEVELOPMENT_CHECKS +#ifndef SIMDJSON_CLANG_VISUAL_STUDIO + SIMDJSON_ASSUME(size_t(child_depth) < parser->max_depth()); + SIMDJSON_ASSUME(position >= parser->start_positions[child_depth]); +#endif +#endif + token.set_position(position); + _depth = child_depth; +} + +#if SIMDJSON_DEVELOPMENT_CHECKS + +simdjson_inline token_position json_iterator::start_position(depth_t depth) const noexcept { + SIMDJSON_ASSUME(size_t(depth) < parser->max_depth()); + return 
size_t(depth) < parser->max_depth() ? parser->start_positions[depth] : 0; +} + +simdjson_inline void json_iterator::set_start_position(depth_t depth, token_position position) noexcept { + SIMDJSON_ASSUME(size_t(depth) < parser->max_depth()); + if(size_t(depth) < parser->max_depth()) { parser->start_positions[depth] = position; } +} + +#endif + + +simdjson_inline error_code json_iterator::optional_error(error_code _error, const char *message) noexcept { + SIMDJSON_ASSUME(_error == INCORRECT_TYPE || _error == NO_SUCH_FIELD); + logger::log_error(*this, message); + return _error; +} + +template +simdjson_warn_unused simdjson_inline bool json_iterator::copy_to_buffer(const uint8_t *json, uint32_t max_len, uint8_t (&tmpbuf)[N]) noexcept { + // Let us guard against silly cases: + if((N < max_len) || (N == 0)) { return false; } + // Truncate whitespace to fit the buffer. + if (max_len > N-1) { + // if (jsoncharutils::is_not_structural_or_whitespace(json[N-1])) { return false; } + max_len = N-1; + } + + // Copy to the buffer. + std::memcpy(tmpbuf, json, max_len); + tmpbuf[max_len] = ' '; + return true; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_iterator &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/json_iterator-inl.h */ +/* begin file include/simdjson/generic/ondemand/value_iterator-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline value_iterator::value_iterator( + json_iterator *json_iter, + depth_t depth, + token_position start_position +) noexcept : _json_iter{json_iter}, _depth{depth}, _start_position{start_position} +{ +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_object() noexcept { + SIMDJSON_TRY( start_container('{', "Not an object", "object") ); + return started_object(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_root_object() noexcept { + SIMDJSON_TRY( start_container('{', "Not an object", "object") ); + return started_root_object(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_object() noexcept { + assert_at_container_start(); +#if SIMDJSON_DEVELOPMENT_CHECKS + _json_iter->set_start_position(_depth, start_position()); +#endif + if (*_json_iter->peek() == '}') { + logger::log_value(*_json_iter, "empty object"); + _json_iter->return_current_and_advance(); + end_container(); + return false; + } + return true; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_root_object() noexcept { + // When in streaming mode, we cannot expect peek_last() to be the last structural element of the + // current document. It only works in the normal mode where we have indexed a single document. + // Note that adding a check for 'streaming' is not expensive since we only have at most + // one root element. + if ( ! 
_json_iter->streaming() ) { + if (*_json_iter->peek_last() != '}') { + _json_iter->abandon(); + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing } at end"); + } + // If the last character is } *and* the first gibberish character is also '}' + // then on-demand could accidentally go over. So we need additional checks. + // https://github.com/simdjson/simdjson/issues/1834 + // Checking that the document is balanced requires a full scan which is potentially + // expensive, but it only happens in edge cases where the first padding character is + // a closing bracket. + if ((*_json_iter->peek(_json_iter->end_position()) == '}') && (!_json_iter->balanced())) { + _json_iter->abandon(); + // The exact error would require more work. It will typically be an unclosed object. + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "the document is unbalanced"); + } + } + return started_object(); +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::end_container() noexcept { +#if SIMDJSON_CHECK_EOF + if (depth() > 1 && at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing parent ] or }"); } + // if (depth() <= 1 && !at_end()) { return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing [ or { at start"); } +#endif // SIMDJSON_CHECK_EOF + _json_iter->ascend_to(depth()-1); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::has_next_field() noexcept { + assert_at_next(); + + // It's illegal to call this unless there are more tokens: anything that ends in } or ] is + // obligated to verify there are more tokens if they are not the top level. + switch (*_json_iter->return_current_and_advance()) { + case '}': + logger::log_end_value(*_json_iter, "object"); + SIMDJSON_TRY( end_container() ); + return false; + case ',': + return true; + default: + return report_error(TAPE_ERROR, "Missing comma between object fields"); + } +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::find_field_raw(const std::string_view key) noexcept { + error_code error; + bool has_value; + // + // Initially, the object can be in one of a few different places: + // + // 1. The start of the object, at the first field: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2, index 1) + // ``` + if (at_first_field()) { + has_value = true; + + // + // 2. When a previous search did not yield a value or the object is empty: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 0) + // { } + // ^ (depth 0, index 2) + // ``` + // + } else if (!is_open()) { +#if SIMDJSON_DEVELOPMENT_CHECKS + // If we're past the end of the object, we're being iterated out of order. + // Note: this isn't perfect detection. It's possible the user is inside some other object; if so, + // this object iterator will blithely scan that object for fields. + if (_json_iter->depth() < depth() - 1) { return OUT_OF_ORDER_ITERATION; } +#endif + return false; + + // 3. 
When a previous search found a field or an iterator yielded a value: + // + // ``` + // // When a field was not fully consumed (or not even touched at all) + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2) + // // When a field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // // When the last field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // ``` + // + } else { + if ((error = skip_child() )) { abandon(); return error; } + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } +#if SIMDJSON_DEVELOPMENT_CHECKS + if (_json_iter->start_position(_depth) != start_position()) { return OUT_OF_ORDER_ITERATION; } +#endif + } + while (has_value) { + // Get the key and colon, stopping at the value. + raw_json_string actual_key; + // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). + if ((error = field_key().get(actual_key) )) { abandon(); return error; }; + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. + if ((error = field_value() )) { abandon(); return error; } + // If it matches, stop and return + // We could do it this way if we wanted to allow arbitrary + // key content (including escaped quotes). + //if (actual_key.unsafe_is_equal(max_key_length, key)) { + // Instead we do the following which may trigger buffer overruns if the + // user provides an adversarial key (containing a well placed unescaped quote + // character and being longer than the number of bytes remaining in the JSON + // input). + if (actual_key.unsafe_is_equal(key)) { + logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. + return true; + } + + // No match: skip the value and see if , or } is next + logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). + SIMDJSON_TRY( skip_child() ); // Skip the value entirely + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } + } + + // If the loop ended, we're out of fields to look at. + return false; +} + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::find_field_unordered_raw(const std::string_view key) noexcept { + /** + * When find_field_unordered_raw is called, we can either be pointing at the + * first key, pointing outside (at the closing brace) or if a key was matched + * we can be either pointing right afterthe ':' right before the value (that we need skip), + * or we may have consumed the value and we might be at a comma or at the + * final brace (ready for a call to has_next_field()). + */ + error_code error; + bool has_value; + + // First, we scan from that point to the end. 
+ // If we don't find a match, we may loop back around, and scan from the beginning to that point. + token_position search_start = _json_iter->position(); + + // We want to know whether we need to go back to the beginning. + bool at_first = at_first_field(); + /////////////// + // Initially, the object can be in one of a few different places: + // + // 1. At the first key: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2, index 1) + // ``` + // + if (at_first) { + has_value = true; + + // 2. When a previous search did not yield a value or the object is empty: + // + // ``` + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 0) + // { } + // ^ (depth 0, index 2) + // ``` + // + } else if (!is_open()) { + +#if SIMDJSON_DEVELOPMENT_CHECKS + // If we're past the end of the object, we're being iterated out of order. + // Note: this isn't perfect detection. It's possible the user is inside some other object; if so, + // this object iterator will blithely scan that object for fields. + if (_json_iter->depth() < depth() - 1) { return OUT_OF_ORDER_ITERATION; } +#endif + SIMDJSON_TRY(reset_object().get(has_value)); + at_first = true; + // 3. When a previous search found a field or an iterator yielded a value: + // + // ``` + // // When a field was not fully consumed (or not even touched at all) + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 2) + // // When a field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // // When the last field was fully consumed + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // ``` + // + } else { + // If someone queried a key but they not did access the value, then we are left pointing + // at the ':' and we need to move forward through the value... If the value was + // processed then skip_child() does not move the iterator (but may adjust the depth). + if ((error = skip_child() )) { abandon(); return error; } + search_start = _json_iter->position(); + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } +#if SIMDJSON_DEVELOPMENT_CHECKS + if (_json_iter->start_position(_depth) != start_position()) { return OUT_OF_ORDER_ITERATION; } +#endif + } + + // After initial processing, we will be in one of two states: + // + // ``` + // // At the beginning of a field + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 1) + // // At the end of the object + // { "a": [ 1, 2 ], "b": [ 3, 4 ] } + // ^ (depth 0) + // ``` + // + // Next, we find a match starting from the current position. + while (has_value) { + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); // We must be at the start of a field + + // Get the key and colon, stopping at the value. + raw_json_string actual_key; + // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). + if ((error = field_key().get(actual_key) )) { abandon(); return error; }; + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. + if ((error = field_value() )) { abandon(); return error; } + + // If it matches, stop and return + // We could do it this way if we wanted to allow arbitrary + // key content (including escaped quotes). 
+ // if (actual_key.unsafe_is_equal(max_key_length, key)) { + // Instead we do the following which may trigger buffer overruns if the + // user provides an adversarial key (containing a well placed unescaped quote + // character and being longer than the number of bytes remaining in the JSON + // input). + if (actual_key.unsafe_is_equal(key)) { + logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. + return true; + } + + // No match: skip the value and see if , or } is next + logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). + SIMDJSON_TRY( skip_child() ); + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. + if ((error = has_next_field().get(has_value) )) { abandon(); return error; } + } + // Performance note: it maybe wasteful to rewind to the beginning when there might be + // no other query following. Indeed, it would require reskipping the whole object. + // Instead, you can just stay where you are. If there is a new query, there is always time + // to rewind. + if(at_first) { return false; } + + // If we reach the end without finding a match, search the rest of the fields starting at the + // beginning of the object. + // (We have already run through the object before, so we've already validated its structure. We + // don't check errors in this bit.) + SIMDJSON_TRY(reset_object().get(has_value)); + while (true) { + SIMDJSON_ASSUME(has_value); // we should reach search_start before ever reaching the end of the object + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); // We must be at the start of a field + + // Get the key and colon, stopping at the value. + raw_json_string actual_key; + // size_t max_key_length = _json_iter->peek_length() - 2; // -2 for the two quotes + // Note: _json_iter->peek_length() - 2 might overflow if _json_iter->peek_length() < 2. + // field_key() advances the pointer and checks that '"' is found (corresponding to a key). + // The depth is left unchanged by field_key(). + error = field_key().get(actual_key); SIMDJSON_ASSUME(!error); + // field_value() will advance and check that we find a ':' separating the + // key and the value. It will also increment the depth by one. + error = field_value(); SIMDJSON_ASSUME(!error); + + // If it matches, stop and return + // We could do it this way if we wanted to allow arbitrary + // key content (including escaped quotes). + // if (actual_key.unsafe_is_equal(max_key_length, key)) { + // Instead we do the following which may trigger buffer overruns if the + // user provides an adversarial key (containing a well placed unescaped quote + // character and being longer than the number of bytes remaining in the JSON + // input). + if (actual_key.unsafe_is_equal(key)) { + logger::log_event(*this, "match", key, -2); + // If we return here, then we return while pointing at the ':' that we just checked. + return true; + } + + // No match: skip the value and see if , or } is next + logger::log_event(*this, "no match", key, -2); + // The call to skip_child is meant to skip over the value corresponding to the key. + // After skip_child(), we are right before the next comma (',') or the final brace ('}'). 
+ SIMDJSON_TRY( skip_child() ); + // If we reached the end of the key-value pair we started from, then we know + // that the key is not there so we return false. We are either right before + // the next comma or the final brace. + if(_json_iter->position() == search_start) { return false; } + // The has_next_field() advances the pointer and check that either ',' or '}' is found. + // It returns true if ',' is found, false otherwise. If anything other than ',' or '}' is found, + // then we are in error and we abort. + error = has_next_field().get(has_value); SIMDJSON_ASSUME(!error); + // If we make the mistake of exiting here, then we could be left pointing at a key + // in the middle of an object. That's not an allowable state. + } + // If the loop ended, we're out of fields to look at. The program should + // never reach this point. + return false; +} +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::field_key() noexcept { + assert_at_next(); + + const uint8_t *key = _json_iter->return_current_and_advance(); + if (*(key++) != '"') { return report_error(TAPE_ERROR, "Object key is not a string"); } + return raw_json_string(key); +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::field_value() noexcept { + assert_at_next(); + + if (*_json_iter->return_current_and_advance() != ':') { return report_error(TAPE_ERROR, "Missing colon in object field"); } + _json_iter->descend_to(depth()+1); + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_array() noexcept { + SIMDJSON_TRY( start_container('[', "Not an array", "array") ); + return started_array(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::start_root_array() noexcept { + SIMDJSON_TRY( start_container('[', "Not an array", "array") ); + return started_root_array(); +} + +inline std::string value_iterator::to_string() const noexcept { + auto answer = std::string("value_iterator [ depth : ") + std::to_string(_depth) + std::string(", "); + if(_json_iter != nullptr) { answer += _json_iter->to_string(); } + answer += std::string(" ]"); + return answer; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_array() noexcept { + assert_at_container_start(); + if (*_json_iter->peek() == ']') { + logger::log_value(*_json_iter, "empty array"); + _json_iter->return_current_and_advance(); + SIMDJSON_TRY( end_container() ); + return false; + } + _json_iter->descend_to(depth()+1); +#if SIMDJSON_DEVELOPMENT_CHECKS + _json_iter->set_start_position(_depth, start_position()); +#endif + return true; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::started_root_array() noexcept { + // When in streaming mode, we cannot expect peek_last() to be the last structural element of the + // current document. It only works in the normal mode where we have indexed a single document. + // Note that adding a check for 'streaming' is not expensive since we only have at most + // one root element. + if ( ! _json_iter->streaming() ) { + if (*_json_iter->peek_last() != ']') { + _json_iter->abandon(); + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing ] at end"); + } + // If the last character is ] *and* the first gibberish character is also ']' + // then on-demand could accidentally go over. So we need additional checks. 
+ // https://github.com/simdjson/simdjson/issues/1834 + // Checking that the document is balanced requires a full scan which is potentially + // expensive, but it only happens in edge cases where the first padding character is + // a closing bracket. + if ((*_json_iter->peek(_json_iter->end_position()) == ']') && (!_json_iter->balanced())) { + _json_iter->abandon(); + // The exact error would require more work. It will typically be an unclosed array. + return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "the document is unbalanced"); + } + } + return started_array(); +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::has_next_element() noexcept { + assert_at_next(); + + logger::log_event(*this, "has_next_element"); + switch (*_json_iter->return_current_and_advance()) { + case ']': + logger::log_end_value(*_json_iter, "array"); + SIMDJSON_TRY( end_container() ); + return false; + case ',': + _json_iter->descend_to(depth()+1); + return true; + default: + return report_error(TAPE_ERROR, "Missing comma between array elements"); + } +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::parse_bool(const uint8_t *json) const noexcept { + auto not_true = atomparsing::str4ncmp(json, "true"); + auto not_false = atomparsing::str4ncmp(json, "fals") | (json[4] ^ 'e'); + bool error = (not_true && not_false) || jsoncharutils::is_not_structural_or_whitespace(json[not_true ? 5 : 4]); + if (error) { return incorrect_type_error("Not a boolean"); } + return simdjson_result(!not_true); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::parse_null(const uint8_t *json) const noexcept { + bool is_null_string = !atomparsing::str4ncmp(json, "null") && jsoncharutils::is_structural_or_whitespace(json[4]); + // if we start with 'n', we must be a null + if(!is_null_string && json[0]=='n') { return incorrect_type_error("Not a null but starts with n"); } + return is_null_string; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_string() noexcept { + return get_raw_json_string().unescape(json_iter()); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_raw_json_string() noexcept { + auto json = peek_scalar("string"); + if (*json != '"') { return incorrect_type_error("Not a string"); } + advance_scalar("string"); + return raw_json_string(json+1); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_uint64() noexcept { + auto result = numberparsing::parse_unsigned(peek_non_root_scalar("uint64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("uint64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_uint64_in_string() noexcept { + auto result = numberparsing::parse_unsigned_in_string(peek_non_root_scalar("uint64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("uint64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_int64() noexcept { + auto result = numberparsing::parse_integer(peek_non_root_scalar("int64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("int64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_int64_in_string() noexcept { + auto result = numberparsing::parse_integer_in_string(peek_non_root_scalar("int64")); + if(result.error() == SUCCESS) { advance_non_root_scalar("int64"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_double() noexcept { 
+ auto result = numberparsing::parse_double(peek_non_root_scalar("double")); + if(result.error() == SUCCESS) { advance_non_root_scalar("double"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_double_in_string() noexcept { + auto result = numberparsing::parse_double_in_string(peek_non_root_scalar("double")); + if(result.error() == SUCCESS) { advance_non_root_scalar("double"); } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_bool() noexcept { + auto result = parse_bool(peek_non_root_scalar("bool")); + if(result.error() == SUCCESS) { advance_non_root_scalar("bool"); } + return result; +} +simdjson_inline simdjson_result value_iterator::is_null() noexcept { + bool is_null_value; + SIMDJSON_TRY(parse_null(peek_non_root_scalar("null")).get(is_null_value)); + if(is_null_value) { advance_non_root_scalar("null"); } + return is_null_value; +} +simdjson_inline bool value_iterator::is_negative() noexcept { + return numberparsing::is_negative(peek_non_root_scalar("numbersign")); +} +simdjson_inline bool value_iterator::is_root_negative() noexcept { + return numberparsing::is_negative(peek_root_scalar("numbersign")); +} +simdjson_inline simdjson_result value_iterator::is_integer() noexcept { + return numberparsing::is_integer(peek_non_root_scalar("integer")); +} +simdjson_inline simdjson_result value_iterator::get_number_type() noexcept { + return numberparsing::get_number_type(peek_non_root_scalar("integer")); +} +simdjson_inline simdjson_result value_iterator::get_number() noexcept { + number num; + error_code error = numberparsing::parse_number(peek_non_root_scalar("number"), num); + if(error) { return error; } + return num; +} + +simdjson_inline simdjson_result value_iterator::is_root_integer() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("is_root_integer"); + uint8_t tmpbuf[20+1]; // <20 digits> is the longest possible unsigned integer + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + return false; // if there are more than 20 characters, it cannot be represented as an integer. + } + auto answer = numberparsing::is_integer(tmpbuf); + // If the parsing was a success, we must still check that it is + // a single scalar. Note that we parse first because of cases like '[]' where + // getting TRAILING_CONTENT is wrong. + if((answer.error() == SUCCESS) && (!_json_iter->is_single_token())) { return TRAILING_CONTENT; } + return answer; +} + +simdjson_inline simdjson_result value_iterator::get_root_number_type() noexcept { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + auto max_len = peek_start_length(); + auto json = peek_root_scalar("number"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1]; + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + // If the parsing was a success, we must still check that it is + // a single scalar. Note that we parse first because of cases like '[]' where + // getting TRAILING_CONTENT is wrong. 
+ auto answer = numberparsing::get_number_type(tmpbuf); + if((answer.error() == SUCCESS) && (!_json_iter->is_single_token())) { return TRAILING_CONTENT; } + return answer; +} +simdjson_inline simdjson_result value_iterator::get_root_number() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("number"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1]; + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + number num; + error_code error = numberparsing::parse_number(tmpbuf, num); + if(error) { return error; } + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("number"); + return num; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_string() noexcept { + return get_string(); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_raw_json_string() noexcept { + return get_raw_json_string(); +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_uint64() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("uint64"); + uint8_t tmpbuf[20+1]; // <20 digits> is the longest possible unsigned integer + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_unsigned(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("uint64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_uint64_in_string() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("uint64"); + uint8_t tmpbuf[20+1]; // <20 digits> is the longest possible unsigned integer + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_unsigned_in_string(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("uint64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_int64() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("int64"); + uint8_t tmpbuf[20+1]; // -<19 digits> is the longest possible integer + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + + auto result = numberparsing::parse_integer(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("int64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_int64_in_string() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("int64"); + uint8_t tmpbuf[20+1]; // -<19 digits> is the longest possible integer + if 
(!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 20 characters"); + return NUMBER_ERROR; + } + + auto result = numberparsing::parse_integer_in_string(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("int64"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_double() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("double"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1]; + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_double(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("double"); + } + return result; +} + +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_double_in_string() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("double"); + // Per https://www.exploringbinary.com/maximum-number-of-decimal-digits-in-binary-floating-point-numbers/, + // 1074 is the maximum number of significant fractional digits. Add 8 more digits for the biggest + // number: -0.e-308. + uint8_t tmpbuf[1074+8+1]; + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { + logger::log_error(*_json_iter, start_position(), depth(), "Root number more than 1082 characters"); + return NUMBER_ERROR; + } + auto result = numberparsing::parse_double_in_string(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("double"); + } + return result; +} +simdjson_warn_unused simdjson_inline simdjson_result value_iterator::get_root_bool() noexcept { + auto max_len = peek_start_length(); + auto json = peek_root_scalar("bool"); + uint8_t tmpbuf[5+1]; + if (!_json_iter->copy_to_buffer(json, max_len, tmpbuf)) { return incorrect_type_error("Not a boolean"); } + auto result = parse_bool(tmpbuf); + if(result.error() == SUCCESS) { + if (!_json_iter->is_single_token()) { return TRAILING_CONTENT; } + advance_root_scalar("bool"); + } + return result; +} +simdjson_inline bool value_iterator::is_root_null() noexcept { + // If there is trailing content, then the document is not null. 
+ if (!_json_iter->is_single_token()) { return false; } + auto max_len = peek_start_length(); + auto json = peek_root_scalar("null"); + bool result = (max_len >= 4 && !atomparsing::str4ncmp(json, "null") && + (max_len == 4 || jsoncharutils::is_structural_or_whitespace(json[5]))); + if(result) { advance_root_scalar("null"); } + return result; +} + +simdjson_warn_unused simdjson_inline error_code value_iterator::skip_child() noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth >= _depth ); + + return _json_iter->skip_child(depth()); +} + +simdjson_inline value_iterator value_iterator::child() const noexcept { + assert_at_child(); + return { _json_iter, depth()+1, _json_iter->token.position() }; +} + +// GCC 7 warns when the first line of this function is inlined away into oblivion due to the caller +// relating depth and iterator depth, which is a desired effect. It does not happen if is_open is +// marked non-inline. +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_inline bool value_iterator::is_open() const noexcept { + return _json_iter->depth() >= depth(); +} +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_inline bool value_iterator::at_end() const noexcept { + return _json_iter->at_end(); +} + +simdjson_inline bool value_iterator::at_start() const noexcept { + return _json_iter->token.position() == start_position(); +} + +simdjson_inline bool value_iterator::at_first_field() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + return _json_iter->token.position() == start_position() + 1; +} + +simdjson_inline void value_iterator::abandon() noexcept { + _json_iter->abandon(); +} + +simdjson_warn_unused simdjson_inline depth_t value_iterator::depth() const noexcept { + return _depth; +} +simdjson_warn_unused simdjson_inline error_code value_iterator::error() const noexcept { + return _json_iter->error; +} +simdjson_warn_unused simdjson_inline uint8_t *&value_iterator::string_buf_loc() noexcept { + return _json_iter->string_buf_loc(); +} +simdjson_warn_unused simdjson_inline const json_iterator &value_iterator::json_iter() const noexcept { + return *_json_iter; +} +simdjson_warn_unused simdjson_inline json_iterator &value_iterator::json_iter() noexcept { + return *_json_iter; +} + +simdjson_inline const uint8_t *value_iterator::peek_start() const noexcept { + return _json_iter->peek(start_position()); +} +simdjson_inline uint32_t value_iterator::peek_start_length() const noexcept { + return _json_iter->peek_length(start_position()); +} + +simdjson_inline const uint8_t *value_iterator::peek_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. + if (!is_at_start()) { return peek_start(); } + + // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. + assert_at_start(); + return _json_iter->peek(); +} + +simdjson_inline void value_iterator::advance_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. + if (!is_at_start()) { return; } + + // Get the JSON and advance the cursor, decreasing depth to signify that we have retrieved the value. 
+ assert_at_start(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} + +simdjson_inline error_code value_iterator::start_container(uint8_t start_char, const char *incorrect_type_message, const char *type) noexcept { + logger::log_start_value(*_json_iter, start_position(), depth(), type); + // If we're not at the position anymore, we don't want to advance the cursor. + const uint8_t *json; + if (!is_at_start()) { +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } +#endif + json = peek_start(); + if (*json != start_char) { return incorrect_type_error(incorrect_type_message); } + } else { + assert_at_start(); + /** + * We should be prudent. Let us peek. If it is not the right type, we + * return an error. Only once we have determined that we have the right + * type are we allowed to advance! + */ + json = _json_iter->peek(); + if (*json != start_char) { return incorrect_type_error(incorrect_type_message); } + _json_iter->return_current_and_advance(); + } + + + return SUCCESS; +} + + +simdjson_inline const uint8_t *value_iterator::peek_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return peek_start(); } + + assert_at_root(); + return _json_iter->peek(); +} +simdjson_inline const uint8_t *value_iterator::peek_non_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return peek_start(); } + + assert_at_non_root_start(); + return _json_iter->peek(); +} + +simdjson_inline void value_iterator::advance_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return; } + + assert_at_root(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} +simdjson_inline void value_iterator::advance_non_root_scalar(const char *type) noexcept { + logger::log_value(*_json_iter, start_position(), depth(), type); + if (!is_at_start()) { return; } + + assert_at_non_root_start(); + _json_iter->return_current_and_advance(); + _json_iter->ascend_to(depth()-1); +} + +simdjson_inline error_code value_iterator::incorrect_type_error(const char *message) const noexcept { + logger::log_error(*_json_iter, start_position(), depth(), message); + return INCORRECT_TYPE; +} + +simdjson_inline bool value_iterator::is_at_start() const noexcept { + return position() == start_position(); +} + +simdjson_inline bool value_iterator::is_at_key() const noexcept { + // Keys are at the same depth as the object. + // Note here that we could be safer and check that we are within an object, + // but we do not. + return _depth == _json_iter->_depth && *_json_iter->peek() == '"'; +} + +simdjson_inline bool value_iterator::is_at_iterator_start() const noexcept { + // We can legitimately be either at the first value ([1]), or after the array if it's empty ([]). 
+ auto delta = position() - start_position(); + return delta == 1 || delta == 2; +} + +inline void value_iterator::assert_at_start() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position == _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +inline void value_iterator::assert_at_container_start() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position == _start_position + 1 ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +inline void value_iterator::assert_at_next() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +simdjson_inline void value_iterator::move_at_start() noexcept { + _json_iter->_depth = _depth; + _json_iter->token.set_position(_start_position); +} + +simdjson_inline void value_iterator::move_at_container_start() noexcept { + _json_iter->_depth = _depth; + _json_iter->token.set_position(_start_position + 1); +} + +simdjson_inline simdjson_result value_iterator::reset_array() noexcept { + move_at_container_start(); + return started_array(); +} + +simdjson_inline simdjson_result value_iterator::reset_object() noexcept { + move_at_container_start(); + return started_object(); +} + +inline void value_iterator::assert_at_child() const noexcept { + SIMDJSON_ASSUME( _json_iter->token._position > _start_position ); + SIMDJSON_ASSUME( _json_iter->_depth == _depth + 1 ); + SIMDJSON_ASSUME( _depth > 0 ); +} + +inline void value_iterator::assert_at_root() const noexcept { + assert_at_start(); + SIMDJSON_ASSUME( _depth == 1 ); +} + +inline void value_iterator::assert_at_non_root_start() const noexcept { + assert_at_start(); + SIMDJSON_ASSUME( _depth > 1 ); +} + +inline void value_iterator::assert_is_valid() const noexcept { + SIMDJSON_ASSUME( _json_iter != nullptr ); +} + +simdjson_inline bool value_iterator::is_valid() const noexcept { + return _json_iter != nullptr; +} + +simdjson_inline simdjson_result value_iterator::type() const noexcept { + switch (*peek_start()) { + case '{': + return json_type::object; + case '[': + return json_type::array; + case '"': + return json_type::string; + case 'n': + return json_type::null; + case 't': case 'f': + return json_type::boolean; + case '-': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return json_type::number; + default: + return TAPE_ERROR; + } +} + +simdjson_inline token_position value_iterator::start_position() const noexcept { + return _start_position; +} + +simdjson_inline token_position value_iterator::position() const noexcept { + return _json_iter->position(); +} + +simdjson_inline token_position value_iterator::end_position() const noexcept { + return _json_iter->end_position(); +} + +simdjson_inline token_position value_iterator::last_position() const noexcept { + return _json_iter->last_position(); +} + +simdjson_inline error_code value_iterator::report_error(error_code error, const char *message) noexcept { + return _json_iter->report_error(error, message); +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value_iterator &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code 
error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/value_iterator-inl.h */ +/* begin file include/simdjson/generic/ondemand/array_iterator-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline array_iterator::array_iterator(const value_iterator &_iter) noexcept + : iter{_iter} +{} + +simdjson_inline simdjson_result array_iterator::operator*() noexcept { + if (iter.error()) { iter.abandon(); return iter.error(); } + return value(iter.child()); +} +simdjson_inline bool array_iterator::operator==(const array_iterator &other) const noexcept { + return !(*this != other); +} +simdjson_inline bool array_iterator::operator!=(const array_iterator &) const noexcept { + return iter.is_open(); +} +simdjson_inline array_iterator &array_iterator::operator++() noexcept { + error_code error; + // PERF NOTE this is a safety rail ... users should exit loops as soon as they receive an error, so we'll never get here. + // However, it does not seem to make a perf difference, so we add it out of an abundance of caution. + if (( error = iter.error() )) { return *this; } + if (( error = iter.skip_child() )) { return *this; } + if (( error = iter.has_next_element().error() )) { return *this; } + return *this; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array_iterator &&value +) noexcept + : SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base(std::forward(value)) +{ + first.iter.assert_is_valid(); +} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : SIMDJSON_BUILTIN_IMPLEMENTATION::implementation_simdjson_result_base({}, error) +{ +} + +simdjson_inline simdjson_result simdjson_result::operator*() noexcept { + if (error()) { return error(); } + return *first; +} +simdjson_inline bool simdjson_result::operator==(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return !error(); } + return first == other.first; +} +simdjson_inline bool simdjson_result::operator!=(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return error(); } + return first != other.first; +} +simdjson_inline simdjson_result &simdjson_result::operator++() noexcept { + // Clear the error if there is one, so we don't yield it twice + if (error()) { second = SUCCESS; return *this; } + ++(first); + return *this; +} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/array_iterator-inl.h */ +/* begin file include/simdjson/generic/ondemand/object_iterator-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +// +// object_iterator +// + +simdjson_inline object_iterator::object_iterator(const value_iterator &_iter) noexcept + : iter{_iter} +{} + +simdjson_inline simdjson_result object_iterator::operator*() noexcept { + error_code error = iter.error(); + if (error) { iter.abandon(); return error; } + auto result = field::start(iter); + // TODO this is a safety rail ... users should exit loops as soon as they receive an error. + // Nonetheless, let's see if performance is OK with this if statement--the compiler may give it to us for free. 
+ if (result.error()) { iter.abandon(); } + return result; +} +simdjson_inline bool object_iterator::operator==(const object_iterator &other) const noexcept { + return !(*this != other); +} +simdjson_inline bool object_iterator::operator!=(const object_iterator &) const noexcept { + return iter.is_open(); +} + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_inline object_iterator &object_iterator::operator++() noexcept { + // TODO this is a safety rail ... users should exit loops as soon as they receive an error. + // Nonetheless, let's see if performance is OK with this if statement--the compiler may give it to us for free. + if (!iter.is_open()) { return *this; } // Iterator will be released if there is an error + + simdjson_unused error_code error; + if ((error = iter.skip_child() )) { return *this; } + + simdjson_unused bool has_value; + if ((error = iter.has_next_field().get(has_value) )) { return *this; }; + return *this; +} +SIMDJSON_POP_DISABLE_WARNINGS + +// +// ### Live States +// +// While iterating or looking up values, depth >= iter.depth. at_start may vary. Error is +// always SUCCESS: +// +// - Start: This is the state when the object is first found and the iterator is just past the {. +// In this state, at_start == true. +// - Next: After we hand a scalar value to the user, or an array/object which they then fully +// iterate over, the iterator is at the , or } before the next value. In this state, +// depth == iter.depth, at_start == false, and error == SUCCESS. +// - Unfinished Business: When we hand an array/object to the user which they do not fully +// iterate over, we need to finish that iteration by skipping child values until we reach the +// Next state. In this state, depth > iter.depth, at_start == false, and error == SUCCESS. +// +// ## Error States +// +// In error states, we will yield exactly one more value before stopping. iter.depth == depth +// and at_start is always false. We decrement after yielding the error, moving to the Finished +// state. +// +// - Chained Error: When the object iterator is part of an error chain--for example, in +// `for (auto tweet : doc["tweets"])`, where the tweet field may be missing or not be an +// object--we yield that error in the loop, exactly once. In this state, error != SUCCESS and +// iter.depth == depth, and at_start == false. We decrement depth when we yield the error. +// - Missing Comma Error: When the iterator ++ method discovers there is no comma between fields, +// we flag that as an error and treat it exactly the same as a Chained Error. In this state, +// error == TAPE_ERROR, iter.depth == depth, and at_start == false. +// +// Errors that occur while reading a field to give to the user (such as when the key is not a +// string or the field is missing a colon) are yielded immediately. Depth is then decremented, +// moving to the Finished state without transitioning through an Error state at all. +// +// ## Terminal State +// +// The terminal state has iter.depth < depth. at_start is always false. +// +// - Finished: When we have reached a }, we are finished. We signal this by decrementing depth. +// In this state, iter.depth < depth, at_start == false, and error == SUCCESS. 
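As a rough usage sketch of the error-chaining behavior described in the state comments above (this is an illustrative aside, not part of the vendored header; it assumes the public ondemand API shipped in this amalgamation, a local "simdjson.h" include path, and reuses the "tweets" name from the comment's own example):

#include <cstdint>
#include <iostream>
#include "simdjson.h"

using namespace simdjson;

int main() {
  padded_string json = R"({ "a": [ 1, 2 ], "b": [ 3, 4 ] })"_padded;
  ondemand::parser parser;
  ondemand::document doc;
  if (parser.iterate(json).get(doc)) { return 1; }

  // Normal iteration (Start -> Next states): look up a field and walk its array.
  for (auto v : doc["a"]) {
    int64_t x;
    if (v.get_int64().get(x)) { break; }  // exit the loop as soon as an error is seen
    std::cout << x << "\n";
  }

  // Chained Error case from the comments above: "tweets" is missing, so the loop
  // yields NO_SUCH_FIELD exactly once, then the ++ operator clears it and the
  // iterator compares equal to end(), reaching the Finished state.
  doc.rewind();
  for (auto tweet : doc["tweets"]) {
    if (tweet.error()) { std::cerr << error_message(tweet.error()) << "\n"; }
  }
  return 0;
}

The loop over the missing field terminates after a single error-bearing iteration; that is exactly the "yield the error once, then decrement depth" contract that the operator*, operator++ and operator!= overloads above implement.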
+// + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object_iterator &&value +) noexcept + : implementation_simdjson_result_base(std::forward(value)) +{ + first.iter.assert_is_valid(); +} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base({}, error) +{ +} + +simdjson_inline simdjson_result simdjson_result::operator*() noexcept { + if (error()) { return error(); } + return *first; +} +// If we're iterating and there is an error, return the error once. +simdjson_inline bool simdjson_result::operator==(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return !error(); } + return first == other.first; +} +// If we're iterating and there is an error, return the error once. +simdjson_inline bool simdjson_result::operator!=(const simdjson_result &other) const noexcept { + if (!first.iter.is_valid()) { return error(); } + return first != other.first; +} +// Checks for ']' and ',' +simdjson_inline simdjson_result &simdjson_result::operator++() noexcept { + // Clear the error if there is one, so we don't yield it twice + if (error()) { second = SUCCESS; return *this; } + ++first; + return *this; +} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/object_iterator-inl.h */ +/* begin file include/simdjson/generic/ondemand/array-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +// +// ### Live States +// +// While iterating or looking up values, depth >= iter->depth. at_start may vary. Error is +// always SUCCESS: +// +// - Start: This is the state when the array is first found and the iterator is just past the `{`. +// In this state, at_start == true. +// - Next: After we hand a scalar value to the user, or an array/object which they then fully +// iterate over, the iterator is at the `,` before the next value (or `]`). In this state, +// depth == iter->depth, at_start == false, and error == SUCCESS. +// - Unfinished Business: When we hand an array/object to the user which they do not fully +// iterate over, we need to finish that iteration by skipping child values until we reach the +// Next state. In this state, depth > iter->depth, at_start == false, and error == SUCCESS. +// +// ## Error States +// +// In error states, we will yield exactly one more value before stopping. iter->depth == depth +// and at_start is always false. We decrement after yielding the error, moving to the Finished +// state. +// +// - Chained Error: When the array iterator is part of an error chain--for example, in +// `for (auto tweet : doc["tweets"])`, where the tweet element may be missing or not be an +// array--we yield that error in the loop, exactly once. In this state, error != SUCCESS and +// iter->depth == depth, and at_start == false. We decrement depth when we yield the error. +// - Missing Comma Error: When the iterator ++ method discovers there is no comma between elements, +// we flag that as an error and treat it exactly the same as a Chained Error. In this state, +// error == TAPE_ERROR, iter->depth == depth, and at_start == false. +// +// ## Terminal State +// +// The terminal state has iter->depth < depth. at_start is always false. +// +// - Finished: When we have reached a `]` or have reported an error, we are finished. We signal this +// by decrementing depth. 
In this state, iter->depth < depth, at_start == false, and +// error == SUCCESS. +// + +simdjson_inline array::array(const value_iterator &_iter) noexcept + : iter{_iter} +{ +} + +simdjson_inline simdjson_result array::start(value_iterator &iter) noexcept { + // We don't need to know if the array is empty to start iteration, but we do want to know if there + // is an error--thus `simdjson_unused`. + simdjson_unused bool has_value; + SIMDJSON_TRY( iter.start_array().get(has_value) ); + return array(iter); +} +simdjson_inline simdjson_result array::start_root(value_iterator &iter) noexcept { + simdjson_unused bool has_value; + SIMDJSON_TRY( iter.start_root_array().get(has_value) ); + return array(iter); +} +simdjson_inline simdjson_result array::started(value_iterator &iter) noexcept { + bool has_value; + SIMDJSON_TRY(iter.started_array().get(has_value)); + return array(iter); +} + +simdjson_inline simdjson_result array::begin() noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!iter.is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } +#endif + return array_iterator(iter); +} +simdjson_inline simdjson_result array::end() noexcept { + return array_iterator(iter); +} +simdjson_inline error_code array::consume() noexcept { + auto error = iter.json_iter().skip_child(iter.depth()-1); + if(error) { iter.abandon(); } + return error; +} + +simdjson_inline simdjson_result array::raw_json() noexcept { + const uint8_t * starting_point{iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + // After 'consume()', we could be left pointing just beyond the document, but that + // is ok because we are not going to dereference the final pointer position, we just + // use it to compute the length in bytes. + const uint8_t * final_point{iter._json_iter->unsafe_pointer()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +SIMDJSON_PUSH_DISABLE_WARNINGS +SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING +simdjson_inline simdjson_result array::count_elements() & noexcept { + size_t count{0}; + // Important: we do not consume any of the values. + for(simdjson_unused auto v : *this) { count++; } + // The above loop will always succeed, but we want to report errors. + if(iter.error()) { return iter.error(); } + // We need to move back at the start because we expect users to iterate through + // the array after counting the number of elements. + iter.reset_array(); + return count; +} +SIMDJSON_POP_DISABLE_WARNINGS + +simdjson_inline simdjson_result array::is_empty() & noexcept { + bool is_not_empty; + auto error = iter.reset_array().get(is_not_empty); + if(error) { return error; } + return !is_not_empty; +} + +inline simdjson_result array::reset() & noexcept { + return iter.reset_array(); +} + +inline simdjson_result array::at_pointer(std::string_view json_pointer) noexcept { + if (json_pointer[0] != '/') { return INVALID_JSON_POINTER; } + json_pointer = json_pointer.substr(1); + // - means "the append position" or "the element after the end of the array" + // We don't support this, because we're returning a real element, not a position. + if (json_pointer == "-") { return INDEX_OUT_OF_BOUNDS; } + + // Read the array index + size_t array_index = 0; + size_t i; + for (i = 0; i < json_pointer.length() && json_pointer[i] != '/'; i++) { + uint8_t digit = uint8_t(json_pointer[i] - '0'); + // Check for non-digit in array index. 
If it's there, we're trying to get a field in an object + if (digit > 9) { return INCORRECT_TYPE; } + array_index = array_index*10 + digit; + } + + // 0 followed by other digits is invalid + if (i > 1 && json_pointer[0] == '0') { return INVALID_JSON_POINTER; } // "JSON pointer array index has other characters after 0" + + // Empty string is invalid; so is a "/" with no digits before it + if (i == 0) { return INVALID_JSON_POINTER; } // "Empty string in JSON pointer array index" + // Get the child + auto child = at(array_index); + // If there is an error, it ends here + if(child.error()) { + return child; + } + + // If there is a /, we're not done yet, call recursively. + if (i < json_pointer.length()) { + child = child.at_pointer(json_pointer.substr(i)); + } + return child; +} + +simdjson_inline simdjson_result array::at(size_t index) noexcept { + size_t i = 0; + for (auto value : *this) { + if (i == index) { return value; } + i++; + } + return INDEX_OUT_OF_BOUNDS; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array &&value +) noexcept + : implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept + : implementation_simdjson_result_base(error) +{ +} + +simdjson_inline simdjson_result simdjson_result::begin() noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() noexcept { + if (error()) { return error(); } + return first.end(); +} +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::is_empty() & noexcept { + if (error()) { return error(); } + return first.is_empty(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/array-inl.h */ +/* begin file include/simdjson/generic/ondemand/document-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline document::document(ondemand::json_iterator &&_iter) noexcept + : iter{std::forward(_iter)} +{ + logger::log_start_value(iter, "document"); +} + +simdjson_inline document document::start(json_iterator &&iter) noexcept { + return document(std::forward(iter)); +} + +inline void document::rewind() noexcept { + iter.rewind(); +} + +inline std::string document::to_debug_string() noexcept { + return iter.to_string(); +} + +inline simdjson_result document::current_location() noexcept { + return iter.current_location(); +} + +inline int32_t document::current_depth() const noexcept { + return iter.depth(); +} + +inline bool document::is_alive() noexcept { + return iter.is_alive(); +} +simdjson_inline value_iterator document::resume_value_iterator() noexcept { + return value_iterator(&iter, 1, iter.root_position()); +} +simdjson_inline value_iterator document::get_root_value_iterator() noexcept { + return resume_value_iterator(); +} +simdjson_inline simdjson_result 
document::start_or_resume_object() noexcept { + if (iter.at_root()) { + return get_object(); + } else { + return object::resume(resume_value_iterator()); + } +} +simdjson_inline simdjson_result document::get_value() noexcept { + // Make sure we start any arrays or objects before returning, so that start_root_() + // gets called. + iter.assert_at_document_depth(); + switch (*iter.peek()) { + case '[': + case '{': + return value(get_root_value_iterator()); + default: + // Unfortunately, scalar documents are a special case in simdjson and they cannot + // be safely converted to value instances. + return SCALAR_DOCUMENT_AS_VALUE; + // return value(get_root_value_iterator()); + } +} +simdjson_inline simdjson_result document::get_array() & noexcept { + auto value = get_root_value_iterator(); + return array::start_root(value); +} +simdjson_inline simdjson_result document::get_object() & noexcept { + auto value = get_root_value_iterator(); + return object::start_root(value); +} +simdjson_inline simdjson_result document::get_uint64() noexcept { + return get_root_value_iterator().get_root_uint64(); +} +simdjson_inline simdjson_result document::get_uint64_in_string() noexcept { + return get_root_value_iterator().get_root_uint64_in_string(); +} +simdjson_inline simdjson_result document::get_int64() noexcept { + return get_root_value_iterator().get_root_int64(); +} +simdjson_inline simdjson_result document::get_int64_in_string() noexcept { + return get_root_value_iterator().get_root_int64_in_string(); +} +simdjson_inline simdjson_result document::get_double() noexcept { + return get_root_value_iterator().get_root_double(); +} +simdjson_inline simdjson_result document::get_double_in_string() noexcept { + return get_root_value_iterator().get_root_double_in_string(); +} +simdjson_inline simdjson_result document::get_string() noexcept { + return get_root_value_iterator().get_root_string(); +} +simdjson_inline simdjson_result document::get_raw_json_string() noexcept { + return get_root_value_iterator().get_root_raw_json_string(); +} +simdjson_inline simdjson_result document::get_bool() noexcept { + return get_root_value_iterator().get_root_bool(); +} +simdjson_inline simdjson_result document::is_null() noexcept { + return get_root_value_iterator().is_root_null(); +} + +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_array(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_object(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_raw_json_string(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_string(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_double(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_uint64(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_int64(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_bool(); } +template<> simdjson_inline simdjson_result document::get() & noexcept { return get_value(); } + +template<> simdjson_inline simdjson_result document::get() && noexcept { return get_raw_json_string(); } +template<> simdjson_inline simdjson_result document::get() && noexcept { return get_string(); } +template<> simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_double(); } +template<> simdjson_inline simdjson_result document::get() && noexcept { 
return std::forward(*this).get_uint64(); } +template<> simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_int64(); } +template<> simdjson_inline simdjson_result document::get() && noexcept { return std::forward(*this).get_bool(); } +template<> simdjson_inline simdjson_result document::get() && noexcept { return get_value(); } + +template simdjson_inline error_code document::get(T &out) & noexcept { + return get().get(out); +} +template simdjson_inline error_code document::get(T &out) && noexcept { + return std::forward(*this).get().get(out); +} + +#if SIMDJSON_EXCEPTIONS +simdjson_inline document::operator array() & noexcept(false) { return get_array(); } +simdjson_inline document::operator object() & noexcept(false) { return get_object(); } +simdjson_inline document::operator uint64_t() noexcept(false) { return get_uint64(); } +simdjson_inline document::operator int64_t() noexcept(false) { return get_int64(); } +simdjson_inline document::operator double() noexcept(false) { return get_double(); } +simdjson_inline document::operator std::string_view() noexcept(false) { return get_string(); } +simdjson_inline document::operator raw_json_string() noexcept(false) { return get_raw_json_string(); } +simdjson_inline document::operator bool() noexcept(false) { return get_bool(); } +simdjson_inline document::operator value() noexcept(false) { return get_value(); } + +#endif +simdjson_inline simdjson_result document::count_elements() & noexcept { + auto a = get_array(); + simdjson_result answer = a.count_elements(); + /* If there was an array, we are now left pointing at its first element. */ + if(answer.error() == SUCCESS) { rewind(); } + return answer; +} +simdjson_inline simdjson_result document::count_fields() & noexcept { + auto a = get_object(); + simdjson_result answer = a.count_fields(); + /* If there was an object, we are now left pointing at its first element. 
*/ + if(answer.error() == SUCCESS) { rewind(); } + return answer; +} +simdjson_inline simdjson_result document::at(size_t index) & noexcept { + auto a = get_array(); + return a.at(index); +} +simdjson_inline simdjson_result document::begin() & noexcept { + return get_array().begin(); +} +simdjson_inline simdjson_result document::end() & noexcept { + return {}; +} + +simdjson_inline simdjson_result document::find_field(std::string_view key) & noexcept { + return start_or_resume_object().find_field(key); +} +simdjson_inline simdjson_result document::find_field(const char *key) & noexcept { + return start_or_resume_object().find_field(key); +} +simdjson_inline simdjson_result document::find_field_unordered(std::string_view key) & noexcept { + return start_or_resume_object().find_field_unordered(key); +} +simdjson_inline simdjson_result document::find_field_unordered(const char *key) & noexcept { + return start_or_resume_object().find_field_unordered(key); +} +simdjson_inline simdjson_result document::operator[](std::string_view key) & noexcept { + return start_or_resume_object()[key]; +} +simdjson_inline simdjson_result document::operator[](const char *key) & noexcept { + return start_or_resume_object()[key]; +} + +simdjson_inline error_code document::consume() noexcept { + auto error = iter.skip_child(0); + if(error) { iter.abandon(); } + return error; +} + +simdjson_inline simdjson_result document::raw_json() noexcept { + auto _iter = get_root_value_iterator(); + const uint8_t * starting_point{_iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + // After 'consume()', we could be left pointing just beyond the document, but that + // is ok because we are not going to dereference the final pointer position, we just + // use it to compute the length in bytes. + const uint8_t * final_point{iter.unsafe_pointer()}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +simdjson_inline simdjson_result document::type() noexcept { + return get_root_value_iterator().type(); +} + +simdjson_inline simdjson_result document::is_scalar() noexcept { + json_type this_type; + auto error = type().get(this_type); + if(error) { return error; } + return ! 
((this_type == json_type::array) || (this_type == json_type::object)); +} + +simdjson_inline bool document::is_negative() noexcept { + return get_root_value_iterator().is_root_negative(); +} + +simdjson_inline simdjson_result document::is_integer() noexcept { + return get_root_value_iterator().is_root_integer(); +} + +simdjson_inline simdjson_result document::get_number_type() noexcept { + return get_root_value_iterator().get_root_number_type(); +} + +simdjson_inline simdjson_result document::get_number() noexcept { + return get_root_value_iterator().get_root_number(); +} + + +simdjson_inline simdjson_result document::raw_json_token() noexcept { + auto _iter = get_root_value_iterator(); + return std::string_view(reinterpret_cast(_iter.peek_start()), _iter.peek_start_length()); +} + +simdjson_inline simdjson_result document::at_pointer(std::string_view json_pointer) noexcept { + rewind(); // Rewind the document each time at_pointer is called + if (json_pointer.empty()) { + return this->get_value(); + } + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: + return (*this).get_array().at_pointer(json_pointer); + case json_type::object: + return (*this).get_object().at_pointer(json_pointer); + default: + return INVALID_JSON_POINTER; + } +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base( + error + ) +{ +} +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) & noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline error_code simdjson_result::rewind() noexcept { + if (error()) { return error(); } + first.rewind(); + return SUCCESS; +} +simdjson_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() & noexcept { + return {}; +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} 
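+// A minimal sketch of document::at_pointer() usage (illustrative only; it assumes
+// a parsed ondemand document `doc` over {"tweets":[{"id":10}]}). As noted above,
+// document::at_pointer() rewinds the document on every call, so repeated lookups
+// restart from the root:
+//
+//   uint64_t id;
+//   auto error = doc.at_pointer("/tweets/0/id").get(id); // id == 10 on success
+//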
+simdjson_inline simdjson_result simdjson_result::get_array() & noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() & noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_double() noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_string() noexcept { + if (error()) { return error(); } + return first.get_string(); +} +simdjson_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_inline simdjson_result simdjson_result::get_value() noexcept { + if (error()) { return error(); } + return first.get_value(); +} +simdjson_inline simdjson_result simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} + +template +simdjson_inline simdjson_result simdjson_result::get() & noexcept { + if (error()) { return error(); } + return first.get(); +} +template +simdjson_inline simdjson_result simdjson_result::get() && noexcept { + if (error()) { return error(); } + return std::forward(first).get(); +} +template +simdjson_inline error_code simdjson_result::get(T &out) & noexcept { + if (error()) { return error(); } + return first.get(out); +} +template +simdjson_inline error_code simdjson_result::get(T &out) && noexcept { + if (error()) { return error(); } + return std::forward(first).get(out); +} + +template<> simdjson_inline simdjson_result simdjson_result::get() & noexcept = delete; +template<> simdjson_inline simdjson_result simdjson_result::get() && noexcept { + if (error()) { return error(); } + return std::forward(first); +} +template<> simdjson_inline error_code simdjson_result::get(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document &out) & noexcept = delete; +template<> simdjson_inline error_code simdjson_result::get(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document &out) && noexcept { + if (error()) { return error(); } + out = std::forward(first); + return SUCCESS; +} + +simdjson_inline simdjson_result simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} + +simdjson_inline simdjson_result simdjson_result::is_scalar() noexcept { + if (error()) { return error(); } + return first.is_scalar(); +} + + +simdjson_inline bool simdjson_result::is_negative() noexcept { + if (error()) { return error(); } + return first.is_negative(); +} + +simdjson_inline simdjson_result simdjson_result::is_integer() noexcept { + if (error()) { return error(); } + return first.is_integer(); +} + +simdjson_inline simdjson_result simdjson_result::get_number_type() noexcept { + if (error()) { return error(); } + return first.get_number_type(); +} + +simdjson_inline simdjson_result simdjson_result::get_number() noexcept { + if (error()) { return error(); } + return first.get_number(); +} + + +#if SIMDJSON_EXCEPTIONS +simdjson_inline simdjson_result::operator 
SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + + +simdjson_inline simdjson_result simdjson_result::current_location() noexcept { + if (error()) { return error(); } + return first.current_location(); +} + +simdjson_inline int32_t simdjson_result::current_depth() const noexcept { + if (error()) { return error(); } + return first.current_depth(); +} + +simdjson_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + + +} // namespace simdjson + + +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline document_reference::document_reference() noexcept : doc{nullptr} {} +simdjson_inline document_reference::document_reference(document &d) noexcept : doc(&d) {} +simdjson_inline void document_reference::rewind() noexcept { doc->rewind(); } +simdjson_inline simdjson_result document_reference::get_array() & noexcept { return doc->get_array(); } +simdjson_inline simdjson_result document_reference::get_object() & noexcept { return doc->get_object(); } +simdjson_inline simdjson_result document_reference::get_uint64() noexcept { return doc->get_uint64(); } +simdjson_inline simdjson_result document_reference::get_int64() noexcept { return doc->get_int64(); } +simdjson_inline simdjson_result document_reference::get_double() noexcept { return doc->get_double(); } +simdjson_inline simdjson_result document_reference::get_string() noexcept { return doc->get_string(); } +simdjson_inline simdjson_result document_reference::get_raw_json_string() noexcept { return doc->get_raw_json_string(); } +simdjson_inline simdjson_result document_reference::get_bool() noexcept { return doc->get_bool(); } +simdjson_inline simdjson_result document_reference::get_value() noexcept { return doc->get_value(); } +simdjson_inline simdjson_result document_reference::is_null() noexcept { return doc->is_null(); } + +#if SIMDJSON_EXCEPTIONS +simdjson_inline 
document_reference::operator array() & noexcept(false) { return array(*doc); } +simdjson_inline document_reference::operator object() & noexcept(false) { return object(*doc); } +simdjson_inline document_reference::operator uint64_t() noexcept(false) { return uint64_t(*doc); } +simdjson_inline document_reference::operator int64_t() noexcept(false) { return int64_t(*doc); } +simdjson_inline document_reference::operator double() noexcept(false) { return double(*doc); } +simdjson_inline document_reference::operator std::string_view() noexcept(false) { return std::string_view(*doc); } +simdjson_inline document_reference::operator raw_json_string() noexcept(false) { return raw_json_string(*doc); } +simdjson_inline document_reference::operator bool() noexcept(false) { return bool(*doc); } +simdjson_inline document_reference::operator value() noexcept(false) { return value(*doc); } +#endif +simdjson_inline simdjson_result document_reference::count_elements() & noexcept { return doc->count_elements(); } +simdjson_inline simdjson_result document_reference::count_fields() & noexcept { return doc->count_fields(); } +simdjson_inline simdjson_result document_reference::at(size_t index) & noexcept { return doc->at(index); } +simdjson_inline simdjson_result document_reference::begin() & noexcept { return doc->begin(); } +simdjson_inline simdjson_result document_reference::end() & noexcept { return doc->end(); } +simdjson_inline simdjson_result document_reference::find_field(std::string_view key) & noexcept { return doc->find_field(key); } +simdjson_inline simdjson_result document_reference::find_field(const char *key) & noexcept { return doc->find_field(key); } +simdjson_inline simdjson_result document_reference::operator[](std::string_view key) & noexcept { return (*doc)[key]; } +simdjson_inline simdjson_result document_reference::operator[](const char *key) & noexcept { return (*doc)[key]; } +simdjson_inline simdjson_result document_reference::find_field_unordered(std::string_view key) & noexcept { return doc->find_field_unordered(key); } +simdjson_inline simdjson_result document_reference::find_field_unordered(const char *key) & noexcept { return doc->find_field_unordered(key); } +simdjson_inline simdjson_result document_reference::type() noexcept { return doc->type(); } +simdjson_inline simdjson_result document_reference::is_scalar() noexcept { return doc->is_scalar(); } +simdjson_inline simdjson_result document_reference::current_location() noexcept { return doc->current_location(); } +simdjson_inline int32_t document_reference::current_depth() const noexcept { return doc->current_depth(); } +simdjson_inline bool document_reference::is_negative() noexcept { return doc->is_negative(); } +simdjson_inline simdjson_result document_reference::is_integer() noexcept { return doc->is_integer(); } +simdjson_inline simdjson_result document_reference::get_number_type() noexcept { return doc->get_number_type(); } +simdjson_inline simdjson_result document_reference::get_number() noexcept { return doc->get_number(); } +simdjson_inline simdjson_result document_reference::raw_json_token() noexcept { return doc->raw_json_token(); } +simdjson_inline simdjson_result document_reference::at_pointer(std::string_view json_pointer) noexcept { return doc->at_pointer(json_pointer); } +simdjson_inline simdjson_result document_reference::raw_json() noexcept { return doc->raw_json();} +simdjson_inline document_reference::operator document&() const noexcept { return *doc; } + +} // namespace ondemand +} // namespace 
SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + + + +namespace simdjson { +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference value, error_code error) + noexcept : implementation_simdjson_result_base(std::forward(value), error) {} + + +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) & noexcept { + if (error()) { return error(); } + return first.at(index); +} +simdjson_inline error_code simdjson_result::rewind() noexcept { + if (error()) { return error(); } + first.rewind(); + return SUCCESS; +} +simdjson_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() & noexcept { + return {}; +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(const char *key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::get_array() & noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() & noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_double() noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_string() noexcept { + if (error()) { return error(); } + return first.get_string(); +} +simdjson_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_inline simdjson_result simdjson_result::get_value() noexcept { + if (error()) { return error(); } + return first.get_value(); +} +simdjson_inline simdjson_result simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} +simdjson_inline simdjson_result 
simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} +simdjson_inline simdjson_result simdjson_result::is_scalar() noexcept { + if (error()) { return error(); } + return first.is_scalar(); +} +simdjson_inline simdjson_result simdjson_result::is_negative() noexcept { + if (error()) { return error(); } + return first.is_negative(); +} +simdjson_inline simdjson_result simdjson_result::is_integer() noexcept { + if (error()) { return error(); } + return first.is_integer(); +} +simdjson_inline simdjson_result simdjson_result::get_number_type() noexcept { + if (error()) { return error(); } + return first.get_number_type(); +} +simdjson_inline simdjson_result simdjson_result::get_number() noexcept { + if (error()) { return error(); } + return first.get_number(); +} +#if SIMDJSON_EXCEPTIONS +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() & noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + +simdjson_inline simdjson_result simdjson_result::current_location() noexcept { + if (error()) { return error(); } + return first.current_location(); +} + +simdjson_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/document-inl.h */ +/* begin file include/simdjson/generic/ondemand/value-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline value::value(const value_iterator &_iter) noexcept + : iter{_iter} +{ +} +simdjson_inline value value::start(const value_iterator &iter) noexcept { + return iter; +} +simdjson_inline value value::resume(const value_iterator &iter) noexcept { + return iter; +} + +simdjson_inline simdjson_result value::get_array() noexcept { + return array::start(iter); +} +simdjson_inline simdjson_result value::get_object() noexcept { + return object::start(iter); +} +simdjson_inline simdjson_result 
value::start_or_resume_object() noexcept { + if (iter.at_start()) { + return get_object(); + } else { + return object::resume(iter); + } +} + +simdjson_inline simdjson_result value::get_raw_json_string() noexcept { + return iter.get_raw_json_string(); +} +simdjson_inline simdjson_result value::get_string() noexcept { + return iter.get_string(); +} +simdjson_inline simdjson_result value::get_double() noexcept { + return iter.get_double(); +} +simdjson_inline simdjson_result value::get_double_in_string() noexcept { + return iter.get_double_in_string(); +} +simdjson_inline simdjson_result value::get_uint64() noexcept { + return iter.get_uint64(); +} +simdjson_inline simdjson_result value::get_uint64_in_string() noexcept { + return iter.get_uint64_in_string(); +} +simdjson_inline simdjson_result value::get_int64() noexcept { + return iter.get_int64(); +} +simdjson_inline simdjson_result value::get_int64_in_string() noexcept { + return iter.get_int64_in_string(); +} +simdjson_inline simdjson_result value::get_bool() noexcept { + return iter.get_bool(); +} +simdjson_inline simdjson_result value::is_null() noexcept { + return iter.is_null(); +} +template<> simdjson_inline simdjson_result value::get() noexcept { return get_array(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_object(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_raw_json_string(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_string(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_number(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_double(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_uint64(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_int64(); } +template<> simdjson_inline simdjson_result value::get() noexcept { return get_bool(); } + +template simdjson_inline error_code value::get(T &out) noexcept { + return get().get(out); +} + +#if SIMDJSON_EXCEPTIONS +simdjson_inline value::operator array() noexcept(false) { + return get_array(); +} +simdjson_inline value::operator object() noexcept(false) { + return get_object(); +} +simdjson_inline value::operator uint64_t() noexcept(false) { + return get_uint64(); +} +simdjson_inline value::operator int64_t() noexcept(false) { + return get_int64(); +} +simdjson_inline value::operator double() noexcept(false) { + return get_double(); +} +simdjson_inline value::operator std::string_view() noexcept(false) { + return get_string(); +} +simdjson_inline value::operator raw_json_string() noexcept(false) { + return get_raw_json_string(); +} +simdjson_inline value::operator bool() noexcept(false) { + return get_bool(); +} +#endif + +simdjson_inline simdjson_result value::begin() & noexcept { + return get_array().begin(); +} +simdjson_inline simdjson_result value::end() & noexcept { + return {}; +} +simdjson_inline simdjson_result value::count_elements() & noexcept { + simdjson_result answer; + auto a = get_array(); + answer = a.count_elements(); + // count_elements leaves you pointing inside the array, at the first element. + // We need to move back so that the user can create a new array (which requires that + // we point at '['). 
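+  // For example (illustrative only, assuming `val` is an ondemand::value holding an
+  // array and exceptions are disabled), counting does not consume the array, so it
+  // can still be iterated afterwards:
+  //
+  //   size_t count;
+  //   if (val.count_elements().get(count) == SUCCESS) {
+  //     for (auto element : val.get_array()) { /* still starts at the first element */ }
+  //   }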
+ iter.move_at_start(); + return answer; +} +simdjson_inline simdjson_result value::count_fields() & noexcept { + simdjson_result answer; + auto a = get_object(); + answer = a.count_fields(); + iter.move_at_start(); + return answer; +} +simdjson_inline simdjson_result value::at(size_t index) noexcept { + auto a = get_array(); + return a.at(index); +} + +simdjson_inline simdjson_result value::find_field(std::string_view key) noexcept { + return start_or_resume_object().find_field(key); +} +simdjson_inline simdjson_result value::find_field(const char *key) noexcept { + return start_or_resume_object().find_field(key); +} + +simdjson_inline simdjson_result value::find_field_unordered(std::string_view key) noexcept { + return start_or_resume_object().find_field_unordered(key); +} +simdjson_inline simdjson_result value::find_field_unordered(const char *key) noexcept { + return start_or_resume_object().find_field_unordered(key); +} + +simdjson_inline simdjson_result value::operator[](std::string_view key) noexcept { + return start_or_resume_object()[key]; +} +simdjson_inline simdjson_result value::operator[](const char *key) noexcept { + return start_or_resume_object()[key]; +} + +simdjson_inline simdjson_result value::type() noexcept { + return iter.type(); +} + +simdjson_inline simdjson_result value::is_scalar() noexcept { + json_type this_type; + auto error = type().get(this_type); + if(error) { return error; } + return ! ((this_type == json_type::array) || (this_type == json_type::object)); +} + +simdjson_inline bool value::is_negative() noexcept { + return iter.is_negative(); +} + +simdjson_inline simdjson_result value::is_integer() noexcept { + return iter.is_integer(); +} +simdjson_warn_unused simdjson_inline simdjson_result value::get_number_type() noexcept { + return iter.get_number_type(); +} +simdjson_warn_unused simdjson_inline simdjson_result value::get_number() noexcept { + return iter.get_number(); +} + +simdjson_inline std::string_view value::raw_json_token() noexcept { + return std::string_view(reinterpret_cast(iter.peek_start()), iter.peek_start_length()); +} + +simdjson_inline simdjson_result value::current_location() noexcept { + return iter.json_iter().current_location(); +} + +simdjson_inline int32_t value::current_depth() const noexcept{ + return iter.json_iter().depth(); +} + +simdjson_inline simdjson_result value::at_pointer(std::string_view json_pointer) noexcept { + json_type t; + SIMDJSON_TRY(type().get(t)); + switch (t) + { + case json_type::array: + return (*this).get_array().at_pointer(json_pointer); + case json_type::object: + return (*this).get_object().at_pointer(json_pointer); + default: + return INVALID_JSON_POINTER; + } +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} +simdjson_inline simdjson_result simdjson_result::count_elements() & noexcept { + if (error()) { return error(); } + return first.count_elements(); +} +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} +simdjson_inline simdjson_result simdjson_result::at(size_t index) noexcept { + if (error()) { 
return error(); } + return first.at(index); +} +simdjson_inline simdjson_result simdjson_result::begin() & noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() & noexcept { + if (error()) { return error(); } + return {}; +} + +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(const char *key) noexcept { + if (error()) { return error(); } + return first.find_field(key); +} + +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(const char *key) noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} + +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](const char *key) noexcept { + if (error()) { return error(); } + return first[key]; +} + +simdjson_inline simdjson_result simdjson_result::get_array() noexcept { + if (error()) { return error(); } + return first.get_array(); +} +simdjson_inline simdjson_result simdjson_result::get_object() noexcept { + if (error()) { return error(); } + return first.get_object(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64() noexcept { + if (error()) { return error(); } + return first.get_uint64(); +} +simdjson_inline simdjson_result simdjson_result::get_uint64_in_string() noexcept { + if (error()) { return error(); } + return first.get_uint64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_int64() noexcept { + if (error()) { return error(); } + return first.get_int64(); +} +simdjson_inline simdjson_result simdjson_result::get_int64_in_string() noexcept { + if (error()) { return error(); } + return first.get_int64_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_double() noexcept { + if (error()) { return error(); } + return first.get_double(); +} +simdjson_inline simdjson_result simdjson_result::get_double_in_string() noexcept { + if (error()) { return error(); } + return first.get_double_in_string(); +} +simdjson_inline simdjson_result simdjson_result::get_string() noexcept { + if (error()) { return error(); } + return first.get_string(); +} +simdjson_inline simdjson_result simdjson_result::get_raw_json_string() noexcept { + if (error()) { return error(); } + return first.get_raw_json_string(); +} +simdjson_inline simdjson_result simdjson_result::get_bool() noexcept { + if (error()) { return error(); } + return first.get_bool(); +} +simdjson_inline simdjson_result simdjson_result::is_null() noexcept { + if (error()) { return error(); } + return first.is_null(); +} + +template simdjson_inline simdjson_result simdjson_result::get() noexcept { + if (error()) { return error(); } + return first.get(); +} +template simdjson_inline error_code simdjson_result::get(T &out) noexcept { + if (error()) { return error(); } + return first.get(out); +} + +template<> simdjson_inline simdjson_result simdjson_result::get() noexcept { + if (error()) { return error(); } + return std::move(first); +} +template<> simdjson_inline error_code 
simdjson_result::get(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value &out) noexcept { + if (error()) { return error(); } + out = first; + return SUCCESS; +} + +simdjson_inline simdjson_result simdjson_result::type() noexcept { + if (error()) { return error(); } + return first.type(); +} +simdjson_inline simdjson_result simdjson_result::is_scalar() noexcept { + if (error()) { return error(); } + return first.is_scalar(); +} +simdjson_inline simdjson_result simdjson_result::is_negative() noexcept { + if (error()) { return error(); } + return first.is_negative(); +} +simdjson_inline simdjson_result simdjson_result::is_integer() noexcept { + if (error()) { return error(); } + return first.is_integer(); +} +simdjson_inline simdjson_result simdjson_result::get_number_type() noexcept { + if (error()) { return error(); } + return first.get_number_type(); +} +simdjson_inline simdjson_result simdjson_result::get_number() noexcept { + if (error()) { return error(); } + return first.get_number(); +} +#if SIMDJSON_EXCEPTIONS +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator uint64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator int64_t() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator double() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator std::string_view() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::raw_json_string() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +simdjson_inline simdjson_result::operator bool() noexcept(false) { + if (error()) { throw simdjson_error(error()); } + return first; +} +#endif + +simdjson_inline simdjson_result simdjson_result::raw_json_token() noexcept { + if (error()) { return error(); } + return first.raw_json_token(); +} + +simdjson_inline simdjson_result simdjson_result::current_location() noexcept { + if (error()) { return error(); } + return first.current_location(); +} + +simdjson_inline simdjson_result simdjson_result::current_depth() const noexcept { + if (error()) { return error(); } + return first.current_depth(); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } + return first.at_pointer(json_pointer); +} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/value-inl.h */ +/* begin file include/simdjson/generic/ondemand/field-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +// clang 6 doesn't think the default constructor can be noexcept, so we make it explicit +simdjson_inline field::field() noexcept : std::pair() {} + +simdjson_inline field::field(raw_json_string key, ondemand::value &&value) noexcept + : std::pair(key, std::forward(value)) +{ +} + +simdjson_inline simdjson_result field::start(value_iterator &parent_iter) noexcept { 
+ raw_json_string key; + SIMDJSON_TRY( parent_iter.field_key().get(key) ); + SIMDJSON_TRY( parent_iter.field_value() ); + return field::start(parent_iter, key); +} + +simdjson_inline simdjson_result field::start(const value_iterator &parent_iter, raw_json_string key) noexcept { + return field(key, parent_iter.child()); +} + +simdjson_inline simdjson_warn_unused simdjson_result field::unescaped_key() noexcept { + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() but Visual Studio won't let us. + simdjson_result answer = first.unescape(second.iter.json_iter()); + first.consume(); + return answer; +} + +simdjson_inline raw_json_string field::key() const noexcept { + SIMDJSON_ASSUME(first.buf != nullptr); // We would like to call .alive() by Visual Studio won't let us. + return first; +} + +simdjson_inline value &field::value() & noexcept { + return second; +} + +simdjson_inline value field::value() && noexcept { + return std::forward(*this).second; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::field &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} + +simdjson_inline simdjson_result simdjson_result::key() noexcept { + if (error()) { return error(); } + return first.key(); +} +simdjson_inline simdjson_result simdjson_result::unescaped_key() noexcept { + if (error()) { return error(); } + return first.unescaped_key(); +} +simdjson_inline simdjson_result simdjson_result::value() noexcept { + if (error()) { return error(); } + return std::move(first.value()); +} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/field-inl.h */ +/* begin file include/simdjson/generic/ondemand/object-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline simdjson_result object::find_field_unordered(const std::string_view key) & noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_unordered_raw(key).get(has_value) ); + if (!has_value) { return NO_SUCH_FIELD; } + return value(iter.child()); +} +simdjson_inline simdjson_result object::find_field_unordered(const std::string_view key) && noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_unordered_raw(key).get(has_value) ); + if (!has_value) { return NO_SUCH_FIELD; } + return value(iter.child()); +} +simdjson_inline simdjson_result object::operator[](const std::string_view key) & noexcept { + return find_field_unordered(key); +} +simdjson_inline simdjson_result object::operator[](const std::string_view key) && noexcept { + return std::forward(*this).find_field_unordered(key); +} +simdjson_inline simdjson_result object::find_field(const std::string_view key) & noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_raw(key).get(has_value) ); + if (!has_value) { return NO_SUCH_FIELD; } + return value(iter.child()); +} +simdjson_inline simdjson_result object::find_field(const std::string_view key) && noexcept { + bool has_value; + SIMDJSON_TRY( iter.find_field_raw(key).get(has_value) ); + if (!has_value) { return NO_SUCH_FIELD; } + return value(iter.child()); +} + +simdjson_inline simdjson_result object::start(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.start_object().error() ); + 
return object(iter); +} +simdjson_inline simdjson_result object::start_root(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.start_root_object().error() ); + return object(iter); +} +simdjson_inline error_code object::consume() noexcept { + if(iter.is_at_key()) { + /** + * whenever you are pointing at a key, calling skip_child() is + * unsafe because you will hit a string and you will assume that + * it is string value, and this mistake will lead you to make bad + * depth computation. + */ + /** + * We want to 'consume' the key. We could really + * just do _json_iter->return_current_and_advance(); at this + * point, but, for clarity, we will use the high-level API to + * eat the key. We assume that the compiler optimizes away + * most of the work. + */ + simdjson_unused raw_json_string actual_key; + auto error = iter.field_key().get(actual_key); + if (error) { iter.abandon(); return error; }; + // Let us move to the value while we are at it. + if ((error = iter.field_value())) { iter.abandon(); return error; } + } + auto error_skip = iter.json_iter().skip_child(iter.depth()-1); + if(error_skip) { iter.abandon(); } + return error_skip; +} + +simdjson_inline simdjson_result object::raw_json() noexcept { + const uint8_t * starting_point{iter.peek_start()}; + auto error = consume(); + if(error) { return error; } + const uint8_t * final_point{iter._json_iter->peek(0)}; + return std::string_view(reinterpret_cast(starting_point), size_t(final_point - starting_point)); +} + +simdjson_inline simdjson_result object::started(value_iterator &iter) noexcept { + SIMDJSON_TRY( iter.started_object().error() ); + return object(iter); +} + +simdjson_inline object object::resume(const value_iterator &iter) noexcept { + return iter; +} + +simdjson_inline object::object(const value_iterator &_iter) noexcept + : iter{_iter} +{ +} + +simdjson_inline simdjson_result object::begin() noexcept { +#if SIMDJSON_DEVELOPMENT_CHECKS + if (!iter.is_at_iterator_start()) { return OUT_OF_ORDER_ITERATION; } +#endif + return object_iterator(iter); +} +simdjson_inline simdjson_result object::end() noexcept { + return object_iterator(iter); +} + +inline simdjson_result object::at_pointer(std::string_view json_pointer) noexcept { + if (json_pointer[0] != '/') { return INVALID_JSON_POINTER; } + json_pointer = json_pointer.substr(1); + size_t slash = json_pointer.find('/'); + std::string_view key = json_pointer.substr(0, slash); + // Grab the child with the given key + simdjson_result child; + + // If there is an escape character in the key, unescape it and then get the child. 
+ size_t escape = key.find('~'); + if (escape != std::string_view::npos) { + // Unescape the key + std::string unescaped(key); + do { + switch (unescaped[escape+1]) { + case '0': + unescaped.replace(escape, 2, "~"); + break; + case '1': + unescaped.replace(escape, 2, "/"); + break; + default: + return INVALID_JSON_POINTER; // "Unexpected ~ escape character in JSON pointer"); + } + escape = unescaped.find('~', escape+1); + } while (escape != std::string::npos); + child = find_field(unescaped); // Take note find_field does not unescape keys when matching + } else { + child = find_field(key); + } + if(child.error()) { + return child; // we do not continue if there was an error + } + // If there is a /, we have to recurse and look up more of the path + if (slash != std::string_view::npos) { + child = child.at_pointer(json_pointer.substr(slash)); + } + return child; +} + +simdjson_inline simdjson_result object::count_fields() & noexcept { + size_t count{0}; + // Important: we do not consume any of the values. + for(simdjson_unused auto v : *this) { count++; } + // The above loop will always succeed, but we want to report errors. + if(iter.error()) { return iter.error(); } + // We need to move back at the start because we expect users to iterate through + // the object after counting the number of elements. + iter.reset_object(); + return count; +} + +simdjson_inline simdjson_result object::is_empty() & noexcept { + bool is_not_empty; + auto error = iter.reset_object().get(is_not_empty); + if(error) { return error; } + return !is_not_empty; +} + +simdjson_inline simdjson_result object::reset() & noexcept { + return iter.reset_object(); +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +simdjson_inline simdjson_result simdjson_result::begin() noexcept { + if (error()) { return error(); } + return first.begin(); +} +simdjson_inline simdjson_result simdjson_result::end() noexcept { + if (error()) { return error(); } + return first.end(); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::find_field_unordered(std::string_view key) && noexcept { + if (error()) { return error(); } + return std::forward(first).find_field_unordered(key); +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) & noexcept { + if (error()) { return error(); } + return first[key]; +} +simdjson_inline simdjson_result simdjson_result::operator[](std::string_view key) && noexcept { + if (error()) { return error(); } + return std::forward(first)[key]; +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) & noexcept { + if (error()) { return error(); } + return first.find_field(key); +} +simdjson_inline simdjson_result simdjson_result::find_field(std::string_view key) && noexcept { + if (error()) { return error(); } + return std::forward(first).find_field(key); +} + +simdjson_inline simdjson_result simdjson_result::at_pointer(std::string_view json_pointer) noexcept { + if (error()) { return error(); } 
+ return first.at_pointer(json_pointer); +} + +inline simdjson_result simdjson_result::reset() noexcept { + if (error()) { return error(); } + return first.reset(); +} + +inline simdjson_result simdjson_result::is_empty() noexcept { + if (error()) { return error(); } + return first.is_empty(); +} + +simdjson_inline simdjson_result simdjson_result::count_fields() & noexcept { + if (error()) { return error(); } + return first.count_fields(); +} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/object-inl.h */ +/* begin file include/simdjson/generic/ondemand/parser-inl.h */ +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +simdjson_inline parser::parser(size_t max_capacity) noexcept + : _max_capacity{max_capacity} { +} + +simdjson_warn_unused simdjson_inline error_code parser::allocate(size_t new_capacity, size_t new_max_depth) noexcept { + if (new_capacity > max_capacity()) { return CAPACITY; } + if (string_buf && new_capacity == capacity() && new_max_depth == max_depth()) { return SUCCESS; } + + // string_capacity copied from document::allocate + _capacity = 0; + size_t string_capacity = SIMDJSON_ROUNDUP_N(5 * new_capacity / 3 + SIMDJSON_PADDING, 64); + string_buf.reset(new (std::nothrow) uint8_t[string_capacity]); +#if SIMDJSON_DEVELOPMENT_CHECKS + start_positions.reset(new (std::nothrow) token_position[new_max_depth]); +#endif + if (implementation) { + SIMDJSON_TRY( implementation->set_capacity(new_capacity) ); + SIMDJSON_TRY( implementation->set_max_depth(new_max_depth) ); + } else { + SIMDJSON_TRY( simdjson::get_active_implementation()->create_dom_parser_implementation(new_capacity, new_max_depth, implementation) ); + } + _capacity = new_capacity; + _max_depth = new_max_depth; + return SUCCESS; +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(padded_string_view json) & noexcept { + if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; } + + // Allocate if needed + if (capacity() < json.length() || !string_buf) { + SIMDJSON_TRY( allocate(json.length(), max_depth()) ); + } + + // Run stage 1. 
+ SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular) ); + return document::start({ reinterpret_cast(json.data()), this }); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const char *json, size_t len, size_t allocated) & noexcept { + return iterate(padded_string_view(json, len, allocated)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const uint8_t *json, size_t len, size_t allocated) & noexcept { + return iterate(padded_string_view(json, len, allocated)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(std::string_view json, size_t allocated) & noexcept { + return iterate(padded_string_view(json, allocated)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const std::string &json) & noexcept { + return iterate(padded_string_view(json)); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const simdjson_result &result) & noexcept { + // We don't presently have a way to temporarily get a const T& from a simdjson_result without throwing an exception + SIMDJSON_TRY( result.error() ); + padded_string_view json = result.value_unsafe(); + return iterate(json); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate(const simdjson_result &result) & noexcept { + // We don't presently have a way to temporarily get a const T& from a simdjson_result without throwing an exception + SIMDJSON_TRY( result.error() ); + const padded_string &json = result.value_unsafe(); + return iterate(json); +} + +simdjson_warn_unused simdjson_inline simdjson_result parser::iterate_raw(padded_string_view json) & noexcept { + if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; } + + // Allocate if needed + if (capacity() < json.length()) { + SIMDJSON_TRY( allocate(json.length(), max_depth()) ); + } + + // Run stage 1. 
+ SIMDJSON_TRY( implementation->stage1(reinterpret_cast(json.data()), json.length(), stage1_mode::regular) ); + return json_iterator(reinterpret_cast(json.data()), this); +} + +inline simdjson_result parser::iterate_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept { + if(batch_size < MINIMAL_BATCH_SIZE) { batch_size = MINIMAL_BATCH_SIZE; } + return document_stream(*this, buf, len, batch_size); +} +inline simdjson_result parser::iterate_many(const char *buf, size_t len, size_t batch_size) noexcept { + return iterate_many(reinterpret_cast(buf), len, batch_size); +} +inline simdjson_result parser::iterate_many(const std::string &s, size_t batch_size) noexcept { + return iterate_many(s.data(), s.length(), batch_size); +} +inline simdjson_result parser::iterate_many(const padded_string &s, size_t batch_size) noexcept { + return iterate_many(s.data(), s.length(), batch_size); +} + +simdjson_inline size_t parser::capacity() const noexcept { + return _capacity; +} +simdjson_inline size_t parser::max_capacity() const noexcept { + return _max_capacity; +} +simdjson_inline size_t parser::max_depth() const noexcept { + return _max_depth; +} + +simdjson_inline void parser::set_max_capacity(size_t max_capacity) noexcept { + if(max_capacity < dom::MINIMAL_DOCUMENT_CAPACITY) { + _max_capacity = max_capacity; + } else { + _max_capacity = dom::MINIMAL_DOCUMENT_CAPACITY; + } +} + +simdjson_inline simdjson_warn_unused simdjson_result parser::unescape(raw_json_string in, uint8_t *&dst) const noexcept { + uint8_t *end = implementation->parse_string(in.buf, dst); + if (!end) { return STRING_ERROR; } + std::string_view result(reinterpret_cast(dst), end-dst); + dst = end; + return result; +} + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::parser &&value) noexcept + : implementation_simdjson_result_base(std::forward(value)) {} +simdjson_inline simdjson_result::simdjson_result(error_code error) noexcept + : implementation_simdjson_result_base(error) {} + +} // namespace simdjson +/* end file include/simdjson/generic/ondemand/parser-inl.h */ +/* begin file include/simdjson/generic/ondemand/document_stream-inl.h */ +#include +#include +#include +namespace simdjson { +namespace SIMDJSON_BUILTIN_IMPLEMENTATION { +namespace ondemand { + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void stage1_worker::finish() { + // After calling "run" someone would call finish() to wait + // for the end of the processing. + // This function will wait until either the thread has done + // the processing or, else, the destructor has been called. + std::unique_lock lock(locking_mutex); + cond_var.wait(lock, [this]{return has_work == false;}); +} + +inline stage1_worker::~stage1_worker() { + // The thread may never outlive the stage1_worker instance + // and will always be stopped/joined before the stage1_worker + // instance is gone. + stop_thread(); +} + +inline void stage1_worker::start_thread() { + std::unique_lock lock(locking_mutex); + if(thread.joinable()) { + return; // This should never happen but we never want to create more than one thread. + } + thread = std::thread([this]{ + while(true) { + std::unique_lock thread_lock(locking_mutex); + // We wait for either "run" or "stop_thread" to be called. 
+ cond_var.wait(thread_lock, [this]{return has_work || !can_work;}); + // If, for some reason, the stop_thread() method was called (i.e., the + // destructor of stage1_worker is called, then we want to immediately destroy + // the thread (and not do any more processing). + if(!can_work) { + break; + } + this->owner->stage1_thread_error = this->owner->run_stage1(*this->stage1_thread_parser, + this->_next_batch_start); + this->has_work = false; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify "finish" + thread_lock.unlock(); + } + } + ); +} + + +inline void stage1_worker::stop_thread() { + std::unique_lock lock(locking_mutex); + // We have to make sure that all locks can be released. + can_work = false; + has_work = false; + cond_var.notify_all(); + lock.unlock(); + if(thread.joinable()) { + thread.join(); + } +} + +inline void stage1_worker::run(document_stream * ds, parser * stage1, size_t next_batch_start) { + std::unique_lock lock(locking_mutex); + owner = ds; + _next_batch_start = next_batch_start; + stage1_thread_parser = stage1; + has_work = true; + // The condition variable call should be moved after thread_lock.unlock() for performance + // reasons but thread sanitizers may report it as a data race if we do. + // See https://stackoverflow.com/questions/35775501/c-should-condition-variable-be-notified-under-lock + cond_var.notify_one(); // will notify the thread lock that we have work + lock.unlock(); +} + +#endif // SIMDJSON_THREADS_ENABLED + +simdjson_inline document_stream::document_stream( + ondemand::parser &_parser, + const uint8_t *_buf, + size_t _len, + size_t _batch_size +) noexcept + : parser{&_parser}, + buf{_buf}, + len{_len}, + batch_size{_batch_size <= MINIMAL_BATCH_SIZE ? 
MINIMAL_BATCH_SIZE : _batch_size}, + error{SUCCESS} + #ifdef SIMDJSON_THREADS_ENABLED + , use_thread(_parser.threaded) // we need to make a copy because _parser.threaded can change + #endif +{ +#ifdef SIMDJSON_THREADS_ENABLED + if(worker.get() == nullptr) { + error = MEMALLOC; + } +#endif +} + +simdjson_inline document_stream::document_stream() noexcept + : parser{nullptr}, + buf{nullptr}, + len{0}, + batch_size{0}, + error{UNINITIALIZED} + #ifdef SIMDJSON_THREADS_ENABLED + , use_thread(false) + #endif +{ +} + +simdjson_inline document_stream::~document_stream() noexcept +{ + #ifdef SIMDJSON_THREADS_ENABLED + worker.reset(); + #endif +} + +inline size_t document_stream::size_in_bytes() const noexcept { + return len; +} + +inline size_t document_stream::truncated_bytes() const noexcept { + if(error == CAPACITY) { return len - batch_start; } + return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1]; +} + +simdjson_inline document_stream::iterator::iterator() noexcept + : stream{nullptr}, finished{true} { +} + +simdjson_inline document_stream::iterator::iterator(document_stream* _stream, bool is_end) noexcept + : stream{_stream}, finished{is_end} { +} + +simdjson_inline simdjson_result document_stream::iterator::operator*() noexcept { + //if(stream->error) { return stream->error; } + return simdjson_result(stream->doc, stream->error); +} + +simdjson_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { + // If there is an error, then we want the iterator + // to be finished, no matter what. (E.g., we do not + // keep generating documents with errors, or go beyond + // a document with errors.) + // + // Users do not have to call "operator*()" when they use operator++, + // so we need to end the stream in the operator++ function. + // + // Note that setting finished = true is essential otherwise + // we would enter an infinite loop. + if (stream->error) { finished = true; } + // Note that stream->error() is guarded against error conditions + // (it will immediately return if stream->error casts to false). + // In effect, this next function does nothing when (stream->error) + // is true (hence the risk of an infinite loop). + stream->next(); + // If that was the last document, we're finished. + // It is the only type of error we do not want to appear + // in operator*. + if (stream->error == EMPTY) { finished = true; } + // If we had any other kind of error (not EMPTY) then we want + // to pass it along to the operator* and we cannot mark the result + // as "finished" just yet. + return *this; +} + +simdjson_inline bool document_stream::iterator::operator!=(const document_stream::iterator &other) const noexcept { + return finished != other.finished; +} + +simdjson_inline document_stream::iterator document_stream::begin() noexcept { + start(); + // If there are no documents, we're finished. 
+ return iterator(this, error == EMPTY); +} + +simdjson_inline document_stream::iterator document_stream::end() noexcept { + return iterator(this, true); +} + +inline void document_stream::start() noexcept { + if (error) { return; } + error = parser->allocate(batch_size); + if (error) { return; } + // Always run the first stage 1 parse immediately + batch_start = 0; + error = run_stage1(*parser, batch_start); + while(error == EMPTY) { + // In exceptional cases, we may start with an empty block + batch_start = next_batch_start(); + if (batch_start >= len) { return; } + error = run_stage1(*parser, batch_start); + } + if (error) { return; } + doc_index = batch_start; + doc = document(json_iterator(&buf[batch_start], parser)); + doc.iter._streaming = true; + + #ifdef SIMDJSON_THREADS_ENABLED + if (use_thread && next_batch_start() < len) { + // Kick off the first thread on next batch if needed + error = stage1_thread_parser.allocate(batch_size); + if (error) { return; } + worker->start_thread(); + start_stage1_thread(); + if (error) { return; } + } + #endif // SIMDJSON_THREADS_ENABLED +} + +inline void document_stream::next() noexcept { + // We always enter at once once in an error condition. + if (error) { return; } + next_document(); + if (error) { return; } + auto cur_struct_index = doc.iter._root - parser->implementation->structural_indexes.get(); + doc_index = batch_start + parser->implementation->structural_indexes[cur_struct_index]; + + // Check if at end of structural indexes (i.e. at end of batch) + if(cur_struct_index >= static_cast(parser->implementation->n_structural_indexes)) { + error = EMPTY; + // Load another batch (if available) + while (error == EMPTY) { + batch_start = next_batch_start(); + if (batch_start >= len) { break; } + #ifdef SIMDJSON_THREADS_ENABLED + if(use_thread) { + load_from_stage1_thread(); + } else { + error = run_stage1(*parser, batch_start); + } + #else + error = run_stage1(*parser, batch_start); + #endif + /** + * Whenever we move to another window, we need to update all pointers to make + * it appear as if the input buffer started at the beginning of the window. + * + * Take this input: + * + * {"z":5} {"1":1,"2":2,"4":4} [7, 10, 9] [15, 11, 12, 13] [154, 110, 112, 1311] + * + * Say you process the following window... + * + * '{"z":5} {"1":1,"2":2,"4":4} [7, 10, 9]' + * + * When you do so, the json_iterator has a pointer at the beginning of the memory region + * (pointing at the beginning of '{"z"...'. + * + * When you move to the window that starts at... + * + * '[7, 10, 9] [15, 11, 12, 13] ... + * + * then it is not sufficient to just run stage 1. You also need to re-anchor the + * json_iterator so that it believes we are starting at '[7, 10, 9]...'. + * + * Under the DOM front-end, this gets done automatically because the parser owns + * the pointer the data, and when you call stage1 and then stage2 on the same + * parser, then stage2 will run on the pointer acquired by stage1. + * + * That is, stage1 calls "this->buf = _buf" so the parser remembers the buffer that + * we used. But json_iterator has no callback when stage1 is called on the parser. + * In fact, I think that the parser is unaware of json_iterator. + * + * + * So we need to re-anchor the json_iterator after each call to stage 1 so that + * all of the pointers are in sync. + */ + doc.iter = json_iterator(&buf[batch_start], parser); + doc.iter._streaming = true; + /** + * End of resync. + */ + + if (error) { continue; } // If the error was EMPTY, we may want to load another batch. 
+ doc_index = batch_start; + } + } +} + +inline void document_stream::next_document() noexcept { + // Go to next place where depth=0 (document depth) + error = doc.iter.skip_child(0); + if (error) { return; } + // Always set depth=1 at the start of document + doc.iter._depth = 1; + // Resets the string buffer at the beginning, thus invalidating the strings. + doc.iter._string_buf_loc = parser->string_buf.get(); + doc.iter._root = doc.iter.position(); +} + +inline size_t document_stream::next_batch_start() const noexcept { + return batch_start + parser->implementation->structural_indexes[parser->implementation->n_structural_indexes]; +} + +inline error_code document_stream::run_stage1(ondemand::parser &p, size_t _batch_start) noexcept { + // This code only updates the structural index in the parser, it does not update any json_iterator + // instance. + size_t remaining = len - _batch_start; + if (remaining <= batch_size) { + return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final); + } else { + return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial); + } +} + +simdjson_inline size_t document_stream::iterator::current_index() const noexcept { + return stream->doc_index; +} + +simdjson_inline std::string_view document_stream::iterator::source() const noexcept { + auto depth = stream->doc.iter.depth(); + auto cur_struct_index = stream->doc.iter._root - stream->parser->implementation->structural_indexes.get(); + + // If at root, process the first token to determine if scalar value + if (stream->doc.iter.at_root()) { + switch (stream->buf[stream->batch_start + stream->parser->implementation->structural_indexes[cur_struct_index]]) { + case '{': case '[': // Depth=1 already at start of document + break; + case '}': case ']': + depth--; + break; + default: // Scalar value document + // TODO: Remove any trailing whitespaces + // This returns a string spanning from start of value to the beginning of the next document (excluded) + return std::string_view(reinterpret_cast(stream->buf) + current_index(), stream->parser->implementation->structural_indexes[++cur_struct_index] - current_index() - 1); + } + cur_struct_index++; + } + + while (cur_struct_index <= static_cast(stream->parser->implementation->n_structural_indexes)) { + switch (stream->buf[stream->batch_start + stream->parser->implementation->structural_indexes[cur_struct_index]]) { + case '{': case '[': + depth++; + break; + case '}': case ']': + depth--; + break; + } + if (depth == 0) { break; } + cur_struct_index++; + } + + return std::string_view(reinterpret_cast(stream->buf) + current_index(), stream->parser->implementation->structural_indexes[cur_struct_index] - current_index() + stream->batch_start + 1);; +} + +inline error_code document_stream::iterator::error() const noexcept { + return stream->error; +} + +#ifdef SIMDJSON_THREADS_ENABLED + +inline void document_stream::load_from_stage1_thread() noexcept { + worker->finish(); + // Swap to the parser that was loaded up in the thread. Make sure the parser has + // enough memory to swap to, as well. + std::swap(stage1_thread_parser,*parser); + error = stage1_thread_error; + if (error) { return; } + + // If there's anything left, start the stage 1 thread! 
+ if (next_batch_start() < len) { + start_stage1_thread(); + } +} + +inline void document_stream::start_stage1_thread() noexcept { + // we call the thread on a lambda that will update + // this->stage1_thread_error + // there is only one thread that may write to this value + // TODO this is NOT exception-safe. + this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error + size_t _next_batch_start = this->next_batch_start(); + + worker->run(this, & this->stage1_thread_parser, _next_batch_start); +} + +#endif // SIMDJSON_THREADS_ENABLED + +} // namespace ondemand +} // namespace SIMDJSON_BUILTIN_IMPLEMENTATION +} // namespace simdjson + +namespace simdjson { + +simdjson_inline simdjson_result::simdjson_result( + error_code error +) noexcept : + implementation_simdjson_result_base(error) +{ +} +simdjson_inline simdjson_result::simdjson_result( + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_stream &&value +) noexcept : + implementation_simdjson_result_base( + std::forward(value) + ) +{ +} + +} +/* end file include/simdjson/generic/ondemand/document_stream-inl.h */ +/* begin file include/simdjson/generic/ondemand/serialization-inl.h */ + + +namespace simdjson { + +inline std::string_view trim(const std::string_view str) noexcept { + // We can almost surely do better by rolling our own find_first_not_of function. + size_t first = str.find_first_not_of(" \t\n\r"); + // If we have the empty string (just white space), then no trimming is possible, and + // we return the empty string_view. + if (std::string_view::npos == first) { return std::string_view(); } + size_t last = str.find_last_not_of(" \t\n\r"); + return str.substr(first, (last - first + 1)); +} + + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value& x) noexcept { + /** + * If we somehow receive a value that has already been consumed, + * then the following code could be in trouble. E.g., we create + * an array as needed, but if an array was already created, then + * it could be bad. 
+ */ + using namespace SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand; + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::json_type t; + auto error = x.type().get(t); + if(error != SUCCESS) { return error; } + switch (t) + { + case json_type::array: + { + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array array; + error = x.get_array().get(array); + if(error) { return error; } + return to_json_string(array); + } + case json_type::object: + { + SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object object; + error = x.get_object().get(object); + if(error) { return error; } + return to_json_string(object); + } + default: + return trim(x.raw_json_token()); + } +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array& x) noexcept { + std::string_view v; + auto error = x.raw_json().get(v); + if(error) {return error; } + return trim(v); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} + +inline simdjson_result to_json_string(simdjson_result x) { + if (x.error()) { return x.error(); } + return to_json_string(x.value_unsafe()); +} +} // namespace simdjson + +namespace simdjson { namespace SIMDJSON_BUILTIN_IMPLEMENTATION { namespace ondemand { + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value x) { + std::string_view v; + auto error = simdjson::to_json_string(x).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::value x) { + std::string_view v; + auto error = simdjson::to_json_string(x).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::array value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); 
+ } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document_reference& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result&& x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::document& value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif + +#if SIMDJSON_EXCEPTIONS +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + throw simdjson::simdjson_error(error); + } +} +inline std::ostream& operator<<(std::ostream& out, simdjson::simdjson_result x) { + if (x.error()) { throw simdjson::simdjson_error(x.error()); } + return (out << x.value()); +} +#else +inline std::ostream& operator<<(std::ostream& out, simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand::object value) { + std::string_view v; + auto error = simdjson::to_json_string(value).get(v); + if(error == simdjson::SUCCESS) { + return (out << v); + } else { + return (out << error); + } +} +#endif +}}} // namespace simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand +/* end file include/simdjson/generic/ondemand/serialization-inl.h */ +/* end file include/simdjson/generic/ondemand-inl.h */ + + +namespace simdjson { + /** + * Represents the best statically linked simdjson implementation that can be used by the compiling + * program. + * + * Detects what options the program is compiled against, and picks the minimum implementation that + * will work on any computer that can run the program. For example, if you compile with g++ + * -march=westmere, it will pick the westmere implementation. The haswell implementation will + * still be available, and can be selected at runtime, but the builtin implementation (and any + * code that uses it) will use westmere. + */ + namespace builtin = SIMDJSON_BUILTIN_IMPLEMENTATION; + /** + * @copydoc simdjson::SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand + */ + namespace ondemand = SIMDJSON_BUILTIN_IMPLEMENTATION::ondemand; + /** + * Function which returns a pointer to an implementation matching the "builtin" implementation. + * The builtin implementation is the best statically linked simdjson implementation that can be used by the compiling + * program. If you compile with g++ -march=haswell, this will return the haswell implementation. 
+ * It is handy to be able to check what builtin was used: builtin_implementation()->name(). + */ + const implementation * builtin_implementation(); +} // namespace simdjson + +#endif // SIMDJSON_BUILTIN_H +/* end file include/simdjson/builtin.h */ + +#endif // SIMDJSON_H +/* end file include/simdjson.h */ diff --git a/kram-profile/README.md b/kram-profile/README.md index dbd18959..6e7bfea3 100644 --- a/kram-profile/README.md +++ b/kram-profile/README.md @@ -1,20 +1,45 @@ kram-profile ========== -kram-profile wraps SwiftUI atop a WKWebView running the Perfetto TraceViewer. A dev can open directories or files of traces. Supported files are added to a list to quickly view these in Perfetto. The app is multidocument. Each window is a single instance of Pefertto TraceViewer that is loaded once. The sandboxed SwiftUI acts as the bridge to the native file system, which the TraceViewer browser sandbox lacks. +kram-profile wraps SwiftUI atop a WKWebView running the Perfetto TraceViewer. A dev can open directories or files of traces. Supported files are added to a list to quickly view these in Perfetto. The app is multi-document. Each window is a single instance of Perfetto TraceViewer that is loaded once. The sandboxed SwiftUI acts as the bridge to the native file system, which the TraceViewer browser sandbox lacks. Flamegraphs are key to all profiling. Why look at giant table of numbers when you can see them visually. Flamegraphs also need to be dynamic and display hover tips and details. Fortunately there are several tools now supporting flamegraphs. Perfetto is one such tool. -This is also a discussion of profilers and optimizing. +kram-profile fixes up build traces to reflect the name of the file/function. And it demangles function names from clang. + +Files can be dragged onto the list view, double-clicked from Finder if the filenames below are associated with the app, or there is an Open and Refresh command. Supported files * .memtrace - memory report generated by Kram scripts folder. * .trace/.perftrace - performance timings in the form catapult trace json files * .json/.buildtrace - clang timing output generated using -ftime-trace - -There are pre-built version of kram-profile for macOS 13.0 and higher. - +* .zip archives of above +* .gzip compressed files of above +* folders of loose files or archives + +There is a pre-built version of kram-profile for macOS 13.0 and higher. + +List view + File type, name, duration + Up/down arrow keys or cmd+N/cmd+shift+N to advance through list + Hover for path of filename + Can collapse/restore the list + Type-search in the list + +Navigation Title + Filename (archive) + Info button (memtrace) - shows max of tracks for heap size + cmd+T search by duration + cmd+shift+T search by name + +WKWebView + Perfetto Flamegraph + Tracknames on left + cmd+S to search for a named entry in flamegraph + cmd+shift+P to parse command + Cannot hide the tracknames + ---------------- TODO: (x are done) @@ -25,9 +50,9 @@ TODO: (x are done) * x Support gzip trace files * x Add sort by range (useful for mem/build traces) * x Add zip archive support, can drop archive of 1+ traces +* x Tie in with the excellent ClangBuildAnalyzer tool * Add frame type for perf traces for vsync ticker (binary format prob has it) -* Tie in with the excellent ClangBuildAnalyzer tool * Scale specific traces to a single duration. That way the next file comes in at that scale. * Move away from Catapult json to own binary format. 
Can then translate to json or use the Perfetto SDK to convert to protobufs. @@ -87,17 +112,19 @@ Perfetto This is a web-based profiling and flame-graph tool. It's fast on desktop, and continues to evolve. Only has second and timecode granularity which isn't enough. For example, performance profiling for games is in milliseconds. The team is mostly focused on Chrome profiling which apparently is in seconds. But the visuals are nice, and it now has hover tips with size/name, and also has an Issues list that the devs are responsive to. Flutter is using this profiler, and kram-profile does too. -Perfetto lives inside a sandbox due to the browser, so feeding files to Perfetto is it's weekness. As a result kram-profile's file list is a nice complement, and can send the file data across via Javascript. This is not unlike an Electron wrapper, but in much less memory. +Perfetto lives inside a sandbox due to the browser, so feeding files to Perfetto is one weakness. As a result kram-profile's file list is a nice complement, and can send the file data across via Javascript. This is not unlike an Electron wrapper, but in much less memory. One limitation is that traces must be nested. So timestamps cannot overlap. Make sure to honor this, or traces will overlap verticall and become confused. There is a C++ SDK to help with writing out traces, and that is a much more compact format than the json. But more languages can write to the json format. The Perfetto team is doing no further work on the json format. And fields like "color" are unsupported, and Perfetto uses it's own coloration for blocks instead. This coloration is nice and consistent and tied to name. +There are lots of issues trying to reuse the Perfetto web page to load more than one profile. The web app gets into a bad state, and then won't load any content afterwards. + Orbit --------- * https://orbitprofiler.com/ -This profiler uses dynamic instrumentation of code via dtrace and trampolines. Note that Win, macOS can use this sort of system. Apple blocks access to dtrace on iOS, but there are mentions of ktrace. So you inject/remove traces dynamically by patching the dll sources directly. This used to run on macOS, Win, and Linux. Google Stadio adopted this project, and not it is limited to Linux support. +This profiler uses dynamic instrumentation of code via dtrace and trampolines. Note that Win, macOS can use this sort of system. Apple blocks access to dtrace on iOS, but there are mentions of ktrace. So you inject/remove traces dynamically by patching the dll sources directly. This used to run on macOS, Win, and Linux. Google Stadia adopted this project, and now it is limited to Linux support. -This avoids the need to blindly instrument code or inject scopes into high-frequency routines. But this patching may not be compatible by the security theater adopted by some mobile devices. +This avoids the need to blindly instrument code or inject scopes into high-frequency routines. But this patching is not compatible with the security theater adopted by iOS devices. ClangBuildAnalyzer -------- @@ -148,7 +175,7 @@ This is a minimal version of Make. But code must generate the Ninja file. Cmak Unity builds ----------- -Not to be confused with the Unity game engine. But unity builds combine several .cpp files into a single .cpp. This works around problems with slow linkers, and multile template and inline code instantations. But code and macros from one .cpp spill into the next. To facilitate this, be careful about undeffing at the bottoms of files. 
kram also uses a common namespaces across headers and source files. This allows "using namespace" in both, and keeps the sources compartmentalized. +Not to be confused with the Unity game engine. But unity builds combine several .cpp files into a single .cpp. This works around problems with slow linkers, and multiple template and inline code instantiations. But code and macros from one .cpp spill into the next. To avoid this, be careful about undeffing at the bottoms of files. kram also uses a common namespace across headers and source files. This allows "using namespace" in both, and keeps the namespaces compartmentalized. Precompiled headers (PCH) ----------- @@ -159,39 +186,67 @@ pch spread headers into files. So the build can break if some don't use it, or There are broken examples of setting up pch for Makefiles all over the internet. Maybe cmake has a valid setup, but the jist is below for gcc/clang. Make sure to verify the parse time is gone in kram-profile by looking at the clang build profiles. +Clang has options to generate a pch .o file. This must be linked separately into the library. This is something MSVC pch has supported for a long time. gcc doesn't support this. See the link below, and the pchObj in the makefile example below. + +Advanced clang pch usage +https://maskray.me/blog/2023-07-16-precompiled-headers + + # gen the .d file, written to tmp and only replaces if it changes - cppFlags = ... -MMD -MP (or -MD) + cppFlags = ... + + cppDepFlags = -MMD -MP (or -MD) + # header must be unique to build (f.e. defines, etc) + cppBuild = $(platform)($config) + # setup the files involved, only get 1 pch per DLL/App since pchSrc = Precompile.h - pchHdr = Precompile-$(platform)($config).h + pchHdrSrc = Precompile-$(cppBuild).h pchDeps = $(pchHdr).d - pchObj = $(pchHdr).gch + pchHdr = $(pchHdrSrc).pch + pchObj = $(pchHdr).o pchIncludesDirs = -Idir1 -Idir2 - + + # this does code gen, templates, and debuginfo into the h.pch.o file + pchFlags = -fpch-codegen -fpch-instantiate-templates -fpch-debuginfo + # important - only copy the hdr if it changes, don't want full rebuild every time - $(pchHdr): $(pchSrc) - $cp $< $@ + # linux (cp -u), win (xcopy), macOS (shell compare then cp) + $(pchHdrSrc): $(pchSrc) + cp $< $@ # this will output the .d and .gch file - $(pchObj): $(pchHdr) - clang++ -x c++header $(cppFlags) -c $< -o $@ -$(pchIncludesDirs) + $(pchHdr): $(pchHdrSrc) + clang++ -x c++-header $(cppFlags) $(cppDepFlags) $(pchFlags) $(pchIncludesDirs) -c $< -o $@ # this makes sure that the pch is rebuilt if hdrs within pchHdr changee # the - sign ignores the deps file on the first run where it does not exist. $(pchDeps): ; -include $(pchDeps) + # optional code to build .o from .pch + # must link this in with the lib/exe, don't use "-x c++" here - it's an AST, not C++ code + # speeds the build, since code isn't prepended to each .o file, and then linked. + $(pchObj): $(pchHdr) + clang++ $(cppFlags) -c $< -o $@ + .... - # force include Precompile.h, - # and then use the pch obj to avoid parsing (appends to top of .o) - cppPchFlags = -include $(pchHdr) -include-pch $(pchObj)) + # prefix Precompile.h.pch to each .o file + cppPchFlags = -include-pch $(pchHdr) # now build the files *.cpp: ... 
$(pchHdr) - clang++ $(cppFlags) -c $< -o $@ $(cppPchFlags) + clang++ $(cppFlags) $(cppPchFlags) -c $< -o $@ + + # link the pchObj into the lib or exe + allObjs = *.o $(pchObj) + $(libOrExe): $(allObjs) + clang++ $< -o $@ + + SIMD ----------- diff --git a/kram-profile/Source/KramZipHelper.cpp b/kram-profile/Source/KramZipHelper.cpp index 42c175e9..ddf9b889 100644 --- a/kram-profile/Source/KramZipHelper.cpp +++ b/kram-profile/Source/KramZipHelper.cpp @@ -2,16 +2,19 @@ #include //#include // for copy_if on Win -#include +#include #include +#include +#include #include "miniz.h" // test for perf of this compared to one in miniz also see // comments about faster algs. // libcompress can only encode lvl 5, but here it's only decompress. +// This seems to fail when used for kramv zip archives, so disable for now #ifndef USE_LIBCOMPRESSION -#define USE_LIBCOMPRESSION (KRAM_MAC || KRAM_IOS) +#define USE_LIBCOMPRESSION 0 // (KRAM_MAC || KRAM_IOS) #endif #if USE_LIBCOMPRESSION @@ -21,31 +24,137 @@ // Throwing this in for now, since it's the only .cpp file #if KRAM_MAC || KRAM_IOS #include // demangle -#include -#include #endif -extern "C" const char* _Nonnull demangleSymbolName(const char* _Nonnull symbolName_) { - using namespace NAMESPACE_STL; - +using namespace STL_NAMESPACE; + +// copied out of KramLog.cpp +static int32_t append_vsprintf(string& str, const char* format, va_list args) +{ + // for KLOGE("group", "%s", "text") + if (strcmp(format, "%s") == 0) { + const char* firstArg = va_arg(args, const char*); + str += firstArg; + return (int32_t)strlen(firstArg); + } + + // This is important for the case where ##__VA_ARGS__ only leaves the format. + // In this case "text" must be a compile-time constant string to avoid the security warning needed for the case above. + // for KLOGE("group", "text") + if (strrchr(format, '%') == nullptr) { + str += format; + return (int32_t)strlen(format); + } + + // format once to get length (without NULL at end) + va_list argsCopy; + va_copy(argsCopy, args); + int32_t len = vsnprintf(NULL, 0, format, argsCopy); + va_end(argsCopy); + + if (len > 0) { + size_t existingLen = str.length(); + + // resize and format again into string + str.resize(existingLen + len, 0); + + vsnprintf((char*)str.c_str() + existingLen, len + 1, format, args); + } + + return len; +} + +int32_t append_sprintf(string& str, const char* format, ...) +{ + va_list args; + va_start(args, format); + int32_t len = append_vsprintf(str, format, args); + va_end(args); + + return len; +} + +// This is extracted from CBA Analysis.cpp +extern "C" const char* _Nullable collapseFunctionName(const char* _Nonnull name_) +{ + // Adapted from code in Analysis. Really the only call needed from CBA. + // serialize to multiple threads + static mutex sMutex; + static unordered_map sMap; + lock_guard lock(sMutex); + + string elt(name_); + auto it = sMap.find(elt); + if (it != sMap.end()) { + return it->second.c_str(); + } + + // Parsing op<, op<<, op>, and op>> seems hard. 
Just skip'm all + if (strstr(name_, "operator") != nullptr) + return nullptr; + + std::string retval; + retval.reserve(elt.size()); + auto b_range = elt.begin(); + auto e_range = elt.begin(); + while (b_range != elt.end()) { + e_range = std::find(b_range, elt.end(), '<'); + if (e_range == elt.end()) + break; + ++e_range; + retval.append(b_range, e_range); + retval.append("$"); + b_range = e_range; + int open_count = 1; + // find the matching close angle bracket + for (; b_range != elt.end(); ++b_range) { + if (*b_range == '<') { + ++open_count; + continue; + } + if (*b_range == '>') { + if (--open_count == 0) { + break; + } + continue; + } + } + // b_range is now pointing at a close angle, or it is at the end of the string + } + if (b_range > e_range) { + // we are in a wacky case where something like op> showed up in a mangled name. + // just bail. + // TODO: this still isn't correct, but it avoids crashes. + return nullptr; + } + // append the footer + retval.append(b_range, e_range); + + // add it to the map + sMap[elt] = std::move(retval); + + return sMap[elt].c_str(); +} + +extern "C" const char* _Nullable demangleSymbolName(const char* _Nonnull symbolName_) +{ // serialize to multiple threads static mutex sMutex; static unordered_map sSymbolToDemangleName; lock_guard lock(sMutex); - + string symbolName(symbolName_); auto it = sSymbolToDemangleName.find(symbolName); if (it != sSymbolToDemangleName.end()) { return it->second; } - + // see CBA if want a generalized demangle for Win/Linux size_t size = 0; int status = 0; char* symbol = abi::__cxa_demangle(symbolName.c_str(), nullptr, &size, &status); const char* result = nullptr; if (status == 0) { - sSymbolToDemangleName[symbolName] = symbol; result = symbol; // not freeing the symbols here @@ -53,14 +162,20 @@ extern "C" const char* _Nonnull demangleSymbolName(const char* _Nonnull symbolNa } else { // This will do repeated demangle though. Maybe should add to table? - result = symbolName_; + // Swift fails when returning back the string it marshalled back to stuff back + // into String(cstring: ...). Ugh. So return empty string. + // status = -2 on most of the mangled Win clang-cli symbols. Nice one + // Microsoft. 
+ //result = symbolName_; + + result = nullptr; } - + return result; } namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // Copied out of KramLog.cpp inline bool endsWithExtension(const char* str, const string& substring) @@ -166,13 +281,13 @@ void ZipHelper::initZipEntryTables() ZipEntry& zipEntry = _zipEntrys[index]; zipEntry.fileIndex = stat.m_file_index; - zipEntry.filename = filename; // can alias + zipEntry.filename = filename; // can alias zipEntry.uncompressedSize = stat.m_uncomp_size; zipEntry.compressedSize = stat.m_comp_size; - zipEntry.modificationDate = (int32_t)stat.m_time; // really a time_t - #undef crc32 + zipEntry.modificationDate = (int32_t)stat.m_time; // really a time_t +#undef crc32 zipEntry.crc32 = stat.m_crc32; - + // TODO: stat.m_time, state.m_crc32 index++; @@ -236,7 +351,7 @@ bool ZipHelper::extract(const char* filename, uint8_t* bufferData, uint64_t buff if (bufferDataSize < entry->uncompressedSize) { return false; } - + if (!extract(*entry, bufferData, bufferDataSize)) { return false; } @@ -244,7 +359,6 @@ bool ZipHelper::extract(const char* filename, uint8_t* bufferData, uint64_t buff return true; } - bool ZipHelper::extractPartial(const char* filename, vector& buffer) const { if (buffer.empty()) { @@ -279,29 +393,29 @@ bool ZipHelper::extract(const ZipEntry& entry, void* buffer, uint64_t bufferSize // https://dougallj.wordpress.com/2022/08/20/faster-zlib-deflate-decompression-on-the-apple-m1-and-x86/ // https://developer.apple.com/documentation/compression/1481000-compression_decode_buffer?language=objc - + // This call is internal, so caller has already tested failure cases. - + #if USE_LIBCOMPRESSION const uint8_t* data = mz_zip_reader_get_raw_data(zip.get(), entry.fileIndex); if (!data) { return false; } // need to extra data and header - + char scratchBuffer[compression_decode_scratch_buffer_size(COMPRESSION_ZLIB)]; + uint64_t bytesDecoded = compression_decode_buffer( (uint8_t*)buffer, entry.uncompressedSize, (const uint8_t*)data, entry.compressedSize, - NULL, // scratch-buffer that could speed up to pass + scratchBuffer, COMPRESSION_ZLIB); - + bool success = false; - if (bytesDecoded == entry.uncompressedSize) - { + if (bytesDecoded == entry.uncompressedSize) { success = true; } #else - + // this pulls pages from mmap, no allocations mz_bool success = mz_zip_reader_extract_to_mem( zip.get(), entry.fileIndex, buffer, bufferSize, 0); @@ -331,7 +445,7 @@ bool ZipHelper::extractRaw(const char* filename, const uint8_t** bufferData, uin } *bufferData = data; - + // This isn't correct, need to return comp_size. // Caller may need the uncompressed size though to decompress fully into. 
//bufferDataSize = stat.m_uncomp_size; @@ -340,4 +454,4 @@ bool ZipHelper::extractRaw(const char* filename, const uint8_t** bufferData, uin return true; } -} // namespace kram +} // namespace kram diff --git a/kram-profile/Source/KramZipHelper.h b/kram-profile/Source/KramZipHelper.h index b6618fe6..0dd7d13c 100644 --- a/kram-profile/Source/KramZipHelper.h +++ b/kram-profile/Source/KramZipHelper.h @@ -3,13 +3,13 @@ // TODO: move to KramConfig.h #define KRAM_MAC 1 #define KRAM_IOS 0 -#define NAMESPACE_STL std +#define STL_NAMESPACE std #include #include -#include #include +#include // from miniz // had to change miniz from anonymous struct typedef, or can't fwd declare @@ -18,10 +18,10 @@ struct mz_zip_archive; namespace kram { //struct MmapHelper; -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; struct ZipEntry { - const char* filename; // max 512, aliased + const char* filename; // max 512, aliased int32_t fileIndex; // attributes @@ -75,8 +75,8 @@ struct ZipHelper { std::unique_ptr zip; vector _zipEntrys; - const uint8_t* zipData; // aliased + const uint8_t* zipData; // aliased vector allFilenames; }; -} // namespace kram +} // namespace kram diff --git a/kram-profile/Source/KramZipHelperW.h b/kram-profile/Source/KramZipHelperW.h index bb5659b7..72cfe2c2 100644 --- a/kram-profile/Source/KramZipHelperW.h +++ b/kram-profile/Source/KramZipHelperW.h @@ -3,7 +3,7 @@ #import typedef struct ZipEntryW { - const char* _Nonnull filename; // max 512, aliased + const char* _Nonnull filename; // max 512, aliased int32_t fileIndex; // attributes @@ -13,28 +13,30 @@ typedef struct ZipEntryW { uint32_t crc32; } ZipEntryW; - // Use this to bridge the C++ over to Swift for now // TODO: form a clang module and reference C++ directly @interface ZipHelperW : NSObject - - (nonnull instancetype)initWithData:(nonnull NSData*)data; - - // extract the data. Can alias into the file. - - (nullable NSData*)extract:(nonnull const char*)filename; - - // pass back vector this way for now, should be property - - (nonnull const ZipEntryW*)zipEntrys; +- (nonnull instancetype)initWithData:(nonnull NSData*)data; + +// extract the data. Can alias into the file. +- (nullable NSData*)extract:(nonnull const char*)filename; + +// pass back vector this way for now, should be property +- (nonnull const ZipEntryW*)zipEntrys; - - (NSInteger)zipEntrysCount; +- (NSInteger)zipEntrysCount; - // This isn't the fileIndex, but uses count above to avoid needing to do unsafe - - (ZipEntryW)zipEntry:(NSInteger)index; +// This isn't the fileIndex, but uses count above to avoid needing to do unsafe +- (ZipEntryW)zipEntry:(NSInteger)index; - // retrieve an entry by filename - - (ZipEntryW)zipEntryByName:(nonnull const char*)name; +// retrieve an entry by filename +- (ZipEntryW)zipEntryByName:(nonnull const char*)name; @end // This is only needed for OptFunction and backend names -const char* _Nonnull demangleSymbolName(const char* _Nonnull symbolName_); +const char* _Nullable demangleSymbolName(const char* _Nonnull symbolName_); +// This is really the only call needed out of CBA +// Convert templated code to collapsed name so get more correspondence in map. 
+const char* _Nullable collapseFunctionName(const char* _Nonnull name_); diff --git a/kram-profile/Source/KramZipHelperW.mm b/kram-profile/Source/KramZipHelperW.mm index 2ad4da7d..fa232d5e 100644 --- a/kram-profile/Source/KramZipHelperW.mm +++ b/kram-profile/Source/KramZipHelperW.mm @@ -1,4 +1,5 @@ #include "KramZipHelperW.h" + #include "KramZipHelper.h" using namespace kram; @@ -7,20 +8,21 @@ @implementation ZipHelperW { ZipHelper _helper; } -- (nonnull instancetype)initWithData:(nonnull NSData*)data { +- (nonnull instancetype)initWithData:(nonnull NSData*)data +{ _helper.openForRead((const uint8_t*)data.bytes, data.length); return self; } -- (nullable NSData*)extract:(nonnull const char*)filename { - +- (nullable NSData*)extract:(nonnull const char*)filename +{ NSData* data = nil; - + auto entry = _helper.zipEntry(filename); if (!entry) { return nil; } - + bool isCompressed = entry->uncompressedSize != entry->compressedSize; if (isCompressed) { // this allocates memory @@ -30,31 +32,35 @@ - (nullable NSData*)extract:(nonnull const char*)filename { else { const uint8_t* bytes = nullptr; uint64_t bytesLength = 0; - + // this aliases the archive _helper.extractRaw(filename, &bytes, bytesLength); data = [NSData dataWithBytesNoCopy:(void*)bytes length:bytesLength freeWhenDone:NO]; } - + return data; } // Need this for the list data -- (nonnull const ZipEntryW*)zipEntrys { +- (nonnull const ZipEntryW*)zipEntrys +{ return (const ZipEntryW*)_helper.zipEntrys().data(); } -- (NSInteger)zipEntrysCount { +- (NSInteger)zipEntrysCount +{ return _helper.zipEntrys().size(); } -- (ZipEntryW)zipEntry:(NSInteger)index { +- (ZipEntryW)zipEntry:(NSInteger)index +{ return *(const ZipEntryW*)&_helper.zipEntrys()[index]; } -- (ZipEntryW)zipEntryByName:(nonnull const char*)name { +- (ZipEntryW)zipEntryByName:(nonnull const char*)name +{ // DONE: fix to return a dummy type, since zips can be missing files // from one iteration to the next. - static ZipEntryW nilEntry = { "" }; + static ZipEntryW nilEntry = {""}; const ZipEntry* entry = _helper.zipEntry(name); if (entry) { return *(const ZipEntryW*)entry; @@ -64,6 +70,4 @@ - (ZipEntryW)zipEntryByName:(nonnull const char*)name { } } - @end - diff --git a/kram-profile/Source/kram-profile-Bridging-Header.h b/kram-profile/Source/kram-profile-Bridging-Header.h index f5f1741e..b99d6e26 100644 --- a/kram-profile/Source/kram-profile-Bridging-Header.h +++ b/kram-profile/Source/kram-profile-Bridging-Header.h @@ -2,4 +2,5 @@ // Use this file to import your target's public headers that you would like to expose to Swift. // +#include "CBA.h" #include "KramZipHelperW.h" diff --git a/kram-profile/Source/miniz.h b/kram-profile/Source/miniz.h index 8867c3c6..3100819f 100644 --- a/kram-profile/Source/miniz.h +++ b/kram-profile/Source/miniz.h @@ -112,8 +112,11 @@ */ #pragma once -#if 1 // Alec add this for now (move to define on projects?) +#if 1 + +// Make sure large file calls are used. Should be set across app. +#define _LARGEFILE64_SOURCE 1 // skip crc read checks to speed up reads #define MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS @@ -124,6 +127,9 @@ // handling file io separately #define MINIZ_NO_STDIO +// These defines annoying conflict with everything (f.e. 
compress) +#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES + #endif diff --git a/kram-profile/Source/track_event_parser.cpp b/kram-profile/Source/track_event_parser.cpp index d1291c86..1da63ca1 100644 --- a/kram-profile/Source/track_event_parser.cpp +++ b/kram-profile/Source/track_event_parser.cpp @@ -25,22 +25,6 @@ #include "perfetto/ext/base/base64.h" #include "perfetto/ext/base/string_writer.h" #include "perfetto/trace_processor/status.h" -#include "src/trace_processor/importers/common/args_tracker.h" -#include "src/trace_processor/importers/common/args_translation_table.h" -#include "src/trace_processor/importers/common/event_tracker.h" -#include "src/trace_processor/importers/common/flow_tracker.h" -#include "src/trace_processor/importers/common/process_tracker.h" -#include "src/trace_processor/importers/common/track_tracker.h" -#include "src/trace_processor/importers/json/json_utils.h" -#include "src/trace_processor/importers/proto/packet_analyzer.h" -#include "src/trace_processor/importers/proto/packet_sequence_state.h" -#include "src/trace_processor/importers/proto/profile_packet_utils.h" -#include "src/trace_processor/importers/proto/stack_profile_sequence_state.h" -#include "src/trace_processor/importers/proto/track_event_tracker.h" -#include "src/trace_processor/util/debug_annotation_parser.h" -#include "src/trace_processor/util/proto_to_args_parser.h" -#include "src/trace_processor/util/status_macros.h" - #include "protos/perfetto/common/android_log_constants.pbzero.h" #include "protos/perfetto/trace/extension_descriptor.pbzero.h" #include "protos/perfetto/trace/interned_data/interned_data.pbzero.h" @@ -59,6 +43,21 @@ #include "protos/perfetto/trace/track_event/thread_descriptor.pbzero.h" #include "protos/perfetto/trace/track_event/track_descriptor.pbzero.h" #include "protos/perfetto/trace/track_event/track_event.pbzero.h" +#include "src/trace_processor/importers/common/args_tracker.h" +#include "src/trace_processor/importers/common/args_translation_table.h" +#include "src/trace_processor/importers/common/event_tracker.h" +#include "src/trace_processor/importers/common/flow_tracker.h" +#include "src/trace_processor/importers/common/process_tracker.h" +#include "src/trace_processor/importers/common/track_tracker.h" +#include "src/trace_processor/importers/json/json_utils.h" +#include "src/trace_processor/importers/proto/packet_analyzer.h" +#include "src/trace_processor/importers/proto/packet_sequence_state.h" +#include "src/trace_processor/importers/proto/profile_packet_utils.h" +#include "src/trace_processor/importers/proto/stack_profile_sequence_state.h" +#include "src/trace_processor/importers/proto/track_event_tracker.h" +#include "src/trace_processor/util/debug_annotation_parser.h" +#include "src/trace_processor/util/proto_to_args_parser.h" +#include "src/trace_processor/util/status_macros.h" namespace perfetto { namespace trace_processor { diff --git a/kram-profile/kram-profile.xcodeproj/project.pbxproj b/kram-profile/kram-profile.xcodeproj/project.pbxproj index 18fc85ba..fb29e9a0 100644 --- a/kram-profile/kram-profile.xcodeproj/project.pbxproj +++ b/kram-profile/kram-profile.xcodeproj/project.pbxproj @@ -22,6 +22,12 @@ 705F69072BA2ED1300437FAA /* track_event_parser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 705F69042BA2ED1200437FAA /* track_event_parser.cpp */; }; 705F690A2BA3801D00437FAA /* KramZipHelperW.mm in Sources */ = {isa = PBXBuildFile; fileRef = 705F69092BA3801D00437FAA /* KramZipHelperW.mm */; }; 705F690C2BA3CDEC00437FAA /* File.swift in Sources */ = {isa = 
PBXBuildFile; fileRef = 705F690B2BA3CDEC00437FAA /* File.swift */; }; + 7061C76F2BAFB715003EC937 /* Utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7061C7672BAFB715003EC937 /* Utils.cpp */; }; + 7061C7702BAFB715003EC937 /* Analysis.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7061C7682BAFB715003EC937 /* Analysis.cpp */; }; + 7061C7712BAFB715003EC937 /* Arena.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7061C7692BAFB715003EC937 /* Arena.cpp */; }; + 7061C7722BAFB715003EC937 /* CBA.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7061C76B2BAFB715003EC937 /* CBA.mm */; }; + 7061C7732BAFB715003EC937 /* BuildEvents.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7061C76C2BAFB715003EC937 /* BuildEvents.cpp */; }; + 7061C7762BAFC323003EC937 /* simdjson.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7061C7742BAFC323003EC937 /* simdjson.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -47,6 +53,18 @@ 705F69082BA3801D00437FAA /* KramZipHelperW.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramZipHelperW.h; sourceTree = ""; }; 705F69092BA3801D00437FAA /* KramZipHelperW.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = KramZipHelperW.mm; sourceTree = ""; }; 705F690B2BA3CDEC00437FAA /* File.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = File.swift; sourceTree = ""; }; + 7061C7662BAFB715003EC937 /* Utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Utils.h; sourceTree = ""; }; + 7061C7672BAFB715003EC937 /* Utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Utils.cpp; sourceTree = ""; }; + 7061C7682BAFB715003EC937 /* Analysis.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Analysis.cpp; sourceTree = ""; }; + 7061C7692BAFB715003EC937 /* Arena.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Arena.cpp; sourceTree = ""; }; + 7061C76A2BAFB715003EC937 /* BuildEvents.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BuildEvents.h; sourceTree = ""; }; + 7061C76B2BAFB715003EC937 /* CBA.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CBA.mm; sourceTree = ""; }; + 7061C76C2BAFB715003EC937 /* BuildEvents.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BuildEvents.cpp; sourceTree = ""; }; + 7061C76D2BAFB715003EC937 /* Arena.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Arena.h; sourceTree = ""; }; + 7061C76E2BAFB715003EC937 /* Analysis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Analysis.h; sourceTree = ""; }; + 7061C7742BAFC323003EC937 /* simdjson.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = simdjson.cpp; sourceTree = ""; }; + 7061C7752BAFC323003EC937 /* simdjson.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simdjson.h; sourceTree = ""; }; + 7061C7772BAFD82E003EC937 /* CBA.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CBA.h; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -63,6 +81,7 @@ 
705F68C02B820AD100437FAA = { isa = PBXGroup; children = ( + 7061C7652BAFB715003EC937 /* CBA */, 705F68FD2BA2ED1200437FAA /* Source */, 705F68CB2B820AD100437FAA /* kram-profile */, 705F68CA2B820AD100437FAA /* Products */, @@ -120,6 +139,25 @@ path = Source; sourceTree = ""; }; + 7061C7652BAFB715003EC937 /* CBA */ = { + isa = PBXGroup; + children = ( + 7061C7662BAFB715003EC937 /* Utils.h */, + 7061C7672BAFB715003EC937 /* Utils.cpp */, + 7061C76D2BAFB715003EC937 /* Arena.h */, + 7061C7692BAFB715003EC937 /* Arena.cpp */, + 7061C76E2BAFB715003EC937 /* Analysis.h */, + 7061C7682BAFB715003EC937 /* Analysis.cpp */, + 7061C76A2BAFB715003EC937 /* BuildEvents.h */, + 7061C76C2BAFB715003EC937 /* BuildEvents.cpp */, + 7061C7772BAFD82E003EC937 /* CBA.h */, + 7061C76B2BAFB715003EC937 /* CBA.mm */, + 7061C7742BAFC323003EC937 /* simdjson.cpp */, + 7061C7752BAFC323003EC937 /* simdjson.h */, + ); + path = CBA; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXNativeTarget section */ @@ -194,16 +232,22 @@ files = ( 705F68E12B87EB8000437FAA /* AnyDecodable.swift in Sources */, 705F690C2BA3CDEC00437FAA /* File.swift in Sources */, + 7061C7712BAFB715003EC937 /* Arena.cpp in Sources */, 705F68E32B87EB8000437FAA /* AnyEncodable.swift in Sources */, 705F68E92B9451CC00437FAA /* Log.swift in Sources */, 705F68EB2B94E33800437FAA /* Keycode.swift in Sources */, 705F69062BA2ED1300437FAA /* miniz.cpp in Sources */, 705F69052BA2ED1300437FAA /* KramZipHelper.cpp in Sources */, 705F68E22B87EB8000437FAA /* AnyCodable.swift in Sources */, + 7061C7732BAFB715003EC937 /* BuildEvents.cpp in Sources */, + 7061C7762BAFC323003EC937 /* simdjson.cpp in Sources */, 705F69072BA2ED1300437FAA /* track_event_parser.cpp in Sources */, + 7061C76F2BAFB715003EC937 /* Utils.cpp in Sources */, 705F68CD2B820AD100437FAA /* kram_profileApp.swift in Sources */, 705F68E72B8BEB7100437FAA /* DataCompression.swift in Sources */, 705F690A2BA3801D00437FAA /* KramZipHelperW.mm in Sources */, + 7061C7722BAFB715003EC937 /* CBA.mm in Sources */, + 7061C7702BAFB715003EC937 /* Analysis.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -214,6 +258,7 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; CLANG_ANALYZER_GCD_PERFORMANCE = YES; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; @@ -273,11 +318,12 @@ MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; ONLY_ACTIVE_ARCH = YES; + OTHER_CFLAGS = "-ftime-trace"; SDKROOT = macosx; SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; SWIFT_OBJC_INTEROP_MODE = objc; SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - SWIFT_VERSION = 5.0; + SWIFT_VERSION = 6.0; }; name = Debug; }; @@ -285,6 +331,7 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; + ARCHS = arm64; ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; CLANG_ANALYZER_GCD_PERFORMANCE = YES; CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; @@ -321,6 +368,7 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; @@ -337,11 +385,12 @@ MACOSX_DEPLOYMENT_TARGET = 13.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; + OTHER_CFLAGS = "-ftime-trace"; SDKROOT = macosx; SWIFT_ACTIVE_COMPILATION_CONDITIONS = ""; SWIFT_COMPILATION_MODE = wholemodule; SWIFT_OBJC_INTEROP_MODE = objc; - 
SWIFT_VERSION = 5.0; + SWIFT_VERSION = 6.0; }; name = Release; }; @@ -350,15 +399,25 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_CXX_STANDARD_LIBRARY_HARDENING = none; CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = "kram-profile/kram_profile.entitlements"; - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Developer ID Application"; + CODE_SIGN_STYLE = Manual; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; DEVELOPMENT_ASSET_PATHS = "\"kram-profile/Preview Content\""; + DEVELOPMENT_TEAM = ""; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; + ENABLE_APP_SANDBOX = YES; + ENABLE_HARDENED_RUNTIME = NO; ENABLE_PREVIEWS = YES; + ENABLE_USER_SELECTED_FILES = readonly; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = "kram-profile/Info.plist"; + INFOPLIST_KEY_CFBundleDisplayName = "Kram Profiler"; + INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.developer-tools"; INFOPLIST_KEY_NSHumanReadableCopyright = ""; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -367,10 +426,11 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.hialec.kram-profile"; PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramProfilerProvision; SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_OBJC_BRIDGING_HEADER = "Source/kram-profile-Bridging-Header.h"; SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - SWIFT_VERSION = 5.0; }; name = Debug; }; @@ -379,15 +439,25 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_CXX_STANDARD_LIBRARY_HARDENING = fast; CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = "kram-profile/kram_profile.entitlements"; - CODE_SIGN_STYLE = Automatic; + CODE_SIGN_IDENTITY = "Developer ID Application: Alec Miller (LDJ95E4NS8)"; + "CODE_SIGN_IDENTITY[sdk=macosx*]" = "Developer ID Application"; + CODE_SIGN_STYLE = Manual; COMBINE_HIDPI_IMAGES = YES; CURRENT_PROJECT_VERSION = 1; DEVELOPMENT_ASSET_PATHS = "\"kram-profile/Preview Content\""; + DEVELOPMENT_TEAM = ""; + "DEVELOPMENT_TEAM[sdk=macosx*]" = LDJ95E4NS8; + ENABLE_APP_SANDBOX = YES; + ENABLE_HARDENED_RUNTIME = YES; ENABLE_PREVIEWS = YES; + ENABLE_USER_SELECTED_FILES = readonly; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = "kram-profile/Info.plist"; + INFOPLIST_KEY_CFBundleDisplayName = "Kram Profiler"; + INFOPLIST_KEY_LSApplicationCategoryType = "public.app-category.developer-tools"; INFOPLIST_KEY_NSHumanReadableCopyright = ""; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -396,9 +466,10 @@ MARKETING_VERSION = 1.0; PRODUCT_BUNDLE_IDENTIFIER = "com.hialec.kram-profile"; PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = KramProfilerProvision; SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_OBJC_BRIDGING_HEADER = "Source/kram-profile-Bridging-Header.h"; - SWIFT_VERSION = 5.0; }; name = Release; }; diff --git a/kram-profile/kram-profile/File.swift b/kram-profile/kram-profile/File.swift index 7b0995c6..16b5ed04 100644 --- a/kram-profile/kram-profile/File.swift +++ b/kram-profile/kram-profile/File.swift @@ -33,6 +33,7 @@ class BuildStats { var totalInstantiateFunction = 0 var totalInstantiateClass = 0 var totalCodeGenFunction = 0 + var totalDebugType = 0 var totalBackend = 0 var totalOptimizer = 0 @@ -47,6 +48,7 @@ class BuildStats { 
totalInstantiateFunction += rhs.totalInstantiateFunction totalInstantiateClass += rhs.totalInstantiateClass totalCodeGenFunction += rhs.totalCodeGenFunction + totalDebugType += rhs.totalDebugType totalBackend += rhs.totalBackend totalOptimizer += rhs.totalOptimizer @@ -65,6 +67,7 @@ class BuildStats { totalInstantiateFunction /= s totalInstantiateClass /= s totalCodeGenFunction /= s + totalDebugType /= s totalBackend /= s totalOptimizer /= s @@ -73,7 +76,7 @@ class BuildStats { } } -class File: Identifiable, Hashable, Equatable, Comparable +class File: Identifiable, Hashable, Equatable, Comparable, @unchecked Sendable { // TODO: archive url relative to archive so not unique if multiple archives dropped // but currently all lookup is by url, and not url + archive. Just make sure to @@ -98,7 +101,8 @@ class File: Identifiable, Hashable, Equatable, Comparable // This is only updated for Build fileType var buildTimings: [String:BuildTiming] = [:] - var buildStats: BuildStats! + var buildFunctionTimings = BuildFunctionTimings() + var buildStats: BuildStats? // only available for memory file type right now var threadInfo = "" @@ -132,6 +136,29 @@ class File: Identifiable, Hashable, Equatable, Comparable return modStamp != loadStamp } + public func eraseFileContent() { + // fileContent should get reloaded + fileContent = nil + + // Perfetto should reload the fileContent + loadStamp = nil + } + + public func eraseCaches() { + duration = 0.0 + + if fileType == .Build { + // for build fileType + buildTimings.removeAll() + buildFunctionTimings.reset() + buildStats = nil + } + else if fileType == .Memory { + // for memory fileType + threadInfo.removeAll() + } + } + public static func fileModificationDate(url: URL) -> Date? { do { let attr = try FileManager.default.attributesOfItem(atPath: url.path) @@ -201,11 +228,41 @@ func generateDuration(file: File) -> String { return "\(double:file.duration, decimals:3)\(unitText)" } -func generateNavigationTitle(_ sel: String?) -> String { +func generateTotalDuration(_ file: File, _ buildFiles: [File]) -> String { + if buildFiles.isEmpty { return "" } + + // add up duration of all files related to selection + var totalDuration = 0.0 + for buildFile in buildFiles { + totalDuration += buildFile.duration + } + + if totalDuration == 0.0 { return "" } + var text = "/ \(double:totalDuration, decimals:3)s " + + // only show percent if high enough + let totalPercent = file.duration * 100.0 / totalDuration + if totalPercent >= 1 { + text += "\(double:totalPercent, decimals:0)% " + } + return text +} + +@MainActor +func generateNavigationTitle(_ sel: String?, _ files: [File]) -> String { if sel == nil { return "" } let f = lookupFile(selection: sel!) - var text = generateDuration(file: f) + " " + f.name + var text = generateDuration(file: f) + " " + + // total the durations matching the selection + if f.fileType == .Build { + let buildFiles = findFilesForBuildTimings(files: files, selection: sel!) + text += generateTotalDuration(f, buildFiles) + } + + // add the shortened filename + text += f.name // add the archive name if let fileArchive = f.archive { @@ -221,12 +278,13 @@ func generateNavigationTitle(_ sel: String?) -> String { // indicate the item is gone, and await its return. 
// Holds supported files dropped or opened from Finder, reload reparses this -var droppedFileCache : [URL] = [] +@MainActor var droppedFileCache : [URL] = [] // Flattened list of supported files from folders and archives -var fileCache : [URL:File] = [:] +@MainActor var fileCache : [URL:File] = [:] -func lookupFile(url: URL) -> File { +@MainActor +func updateFile(url: URL) -> File { let file = File(url:url) // This preseves the duration previously parsed and stored @@ -241,11 +299,26 @@ func lookupFile(url: URL) -> File { // This wipes the duration, so it can be recomputed // TODO: may want to check crc32 if present before wiping all data + if file.archive == nil { + file.eraseFileContent() + file.eraseCaches() + } + fileCache[file.url] = file return file } +@MainActor +func lookupFile(url: URL) -> File { + let file = File(url:url) + if let fileOld = fileCache[file.url] { + return fileOld + } + return file +} + +@MainActor func lookupFile(selection: String) -> File { return lookupFile(url:URL(string:selection)!) } @@ -301,8 +374,9 @@ class Archive: Identifiable, /*Hashable, */ Equatable, Comparable { } // cache of archives to avoid creating these each time -var archiveCache: [URL:Archive] = [:] +@MainActor var archiveCache: [URL:Archive] = [:] +@MainActor func lookupArchive(_ url: URL) -> Archive { let archive = Archive(url) @@ -342,8 +416,7 @@ func lookupArchive(_ url: URL) -> Archive { if !isNewEntryMissing && (oldEntry.crc32 == newEntry.crc32) { // erase fileContent since it may alias mmap going away - file.loadStamp = nil - file.fileContent = nil + file.eraseFileContent() // keep any caches } @@ -352,17 +425,8 @@ func lookupArchive(_ url: URL) -> Archive { file.loadStamp = nil file.fileContent = nil - file.duration = 0.0 - - if file.fileType == .Build { - // for build fileType - file.buildTimings.removeAll() - file.buildStats = nil - } - else if file.fileType == .Memory { - // for memory fileType - file.threadInfo.removeAll() - } + file.eraseFileContent() + file.eraseCaches() } } } @@ -404,10 +468,17 @@ func loadFileContent(_ file: File) -> Data { func isSupportedFilename(_ url: URL) -> Bool { let ext = url.pathExtension - + // what ext does trace.zip, or trace.gz come in as ? // should this limit compressed files to the names supported below + // Apple and Microsoft store resource fork data in "._Filename.trace" files + // so need to ignore these in the lists. These don't occur from CLI zip, + // only from using Finder "Compress" + if url.lastPathComponent.starts(with: "._") { + return false + } + if ext == "gz" { return true } @@ -446,6 +517,7 @@ func isSupportedFilename(_ url: URL) -> Bool { return false } +@MainActor func listFilesFromArchive(_ urlArchive: URL) -> [File] { let archive = lookupArchive(urlArchive) @@ -469,7 +541,7 @@ func listFilesFromArchive(_ urlArchive: URL) -> [File] { // TODO: archives don't have full paths, so lookup can get confused // if there are multiple archives with same paths. 
- let file = lookupFile(url:url) + let file = updateFile(url:url) if file.archive != archive { file.archive = archive } @@ -478,6 +550,7 @@ func listFilesFromArchive(_ urlArchive: URL) -> [File] { return files } +@MainActor func listFilesFromURLs(_ urls: [URL]) -> [File] { var files: [File] = [] @@ -505,7 +578,7 @@ func listFilesFromURLs(_ urls: [URL]) -> [File] files += listFilesFromArchive(fileURL) } else { - files.append(lookupFile(url:fileURL)); + files.append(updateFile(url:fileURL)); } } } @@ -518,7 +591,7 @@ func listFilesFromURLs(_ urls: [URL]) -> [File] files += listFilesFromArchive(url) } else { - files.append(lookupFile(url:url)) + files.append(updateFile(url:url)) } } } diff --git a/kram-profile/kram-profile/Log.swift b/kram-profile/kram-profile/Log.swift index 83169af8..6c278c9b 100644 --- a/kram-profile/kram-profile/Log.swift +++ b/kram-profile/kram-profile/Log.swift @@ -48,15 +48,62 @@ import Darwin 2018-04-11 14:59:07.122186-0700 SwiftShot[581:21310] [GameSceneViewController] error text */ -class Log { +struct LogState { // verbose: Whether os_log or print is used to report logs. - static var prints = false + let prints = false // stacktrace: Whether stack trace is logged on errors. - static var stacktraces = false + let stacktraces = false // timestamp: Show timestamps on all entries when printing statements. - static var timestamps = false + let timestamps = false // absoluteTimestamps: Show relative or absolute timestampes. - static var absoluteTimestamps = true + let absoluteTimestamps = true + + // Store data for timestamps. + let timestampToSeconds = initTimestampToSeconds() + let timestampStart: Double + let timestampStartDate = Date() + let timestampFormatter = initTimestampFormatter() + + // This can be filtered from command line arguments. + let subsystem = Bundle.main.bundleIdentifier! + + init() { + timestampStart = LogState.timestampStartTime(timestampToSeconds) + } + private static func initTimestampFormatter() -> DateFormatter { + let formatter = DateFormatter() + formatter.locale = Locale.current + formatter.setLocalizedDateFormatFromTemplate("HH:mm:ss.SSS") // ms resolution + return formatter + } + + private static func initTimestampToSeconds() -> Double { + // Cache the conversion. Note that clock rate can change with cpu throttling. + // These are high-resolution timestamps taken from the system timer. + var info = mach_timebase_info(numer: 0, denom: 0) + mach_timebase_info(&info) + let numer = Double(info.numer) + let denom = Double(info.denom) + return 1e-9 * (numer / denom) // inverse so we can multiply + } + + static func timestampStartTime(_ timestampToSeconds: Double) -> Double { + let timestamp = Double(mach_absolute_time()) + let time = timestamp * timestampToSeconds + return time + } + + // need timestamps in other parts of the app + func timestamp() -> Double { + let timestamp = Double(mach_absolute_time()) + let time = timestamp * timestampToSeconds + return time + } +} + +let logState = LogState() + +class Log: @unchecked Sendable { // Custom logging group - usually based on source filename. // This has a very limited output, but does go to console @@ -66,15 +113,7 @@ class Log { private var file: String // All logs go to this category for filtering. private var category: String - // This can be filtered from command line arguments. - private static var subsystem = Bundle.main.bundleIdentifier! - - // Store data for timestamps. 
- private static var timestampToSeconds: Double = 0 - private static var timestampStart = timestamp() - private static var timestampStartDate = Date() - private static var timestampFormatter = initTimestampFormatter() - + init(_ category: String = #file, file: String = #file) { // Strip the path, but allow hierachical category f.e. "Group/Subgroup" wihtout .swift. self.category = category @@ -85,7 +124,7 @@ class Log { // Compute once for use in logs. self.file = Log.stripFilePathAndExtension(file) - self.log = OSLog(subsystem: Log.subsystem, category: self.category) + self.log = OSLog(subsystem: logState.subsystem, category: self.category) } // Test whether messages are logged for the given levels @@ -111,7 +150,7 @@ class Log { func error(_ message: @autoclosure () -> String, _ function: String = #function, _ line: Int = #line) { let text = formatMessage(message(), .error, function, line) - if Log.prints { + if logState.prints { print(text) } else { logToOSLog(text, .error) @@ -121,7 +160,7 @@ class Log { // os_log left out warnings, so reuse default type for that func warn(_ message: @autoclosure () -> String, _ function: String = #function, _ line: Int = #line) { let text = formatMessage(message(), .default, function, line) - if Log.prints { + if logState.prints { print(text) } else { logToOSLog(text, .default) // this doesn't get colored yellow like a warning @@ -130,7 +169,7 @@ class Log { func info(_ message: @autoclosure () -> String) { let text = formatMessage(message(), .info) - if Log.prints { + if logState.prints { print(text) } else { logToOSLog(text, .info) @@ -141,7 +180,7 @@ class Log { // debug logs are stripped from release builds #if DEBUG let text = formatMessage(message(), .debug) - if Log.prints { + if logState.prints { print(text) } else { logToOSLog(text, .debug) @@ -165,7 +204,7 @@ class Log { let levelText = formatLevel(level) - if Log.prints { + if logState.prints { let timestamp = Log.formatTimestamp() // These messages never go out to the system console, just the debugger. 
@@ -199,7 +238,7 @@ class Log { } } - if Log.stacktraces && (level == .error) { + if logState.stacktraces && (level == .error) { text += "\n" // Improve this - these are mangled symbols without file/line of where @@ -218,8 +257,10 @@ class Log { queueName = ":" + queueName } - text += " at \(file):\(line)@\(function)\n" - text += " on \(threadName)\(queueName)" + text += "\n at \(file):\(line)@\(function)" + if !threadName.isEmpty || !queueName.isEmpty { + text += "\n on \(threadName)\(queueName)" + } return text } @@ -229,27 +270,22 @@ class Log { } // timestamp support - private static func initTimestampFormatter() -> DateFormatter { - let formatter = DateFormatter() - formatter.locale = Locale.current - formatter.setLocalizedDateFormatFromTemplate("HH:mm:ss.SSS") // ms resolution - return formatter - } + private static func timeFromStart() -> Double { - return max(0.0, Log.timestamp() - Log.timestampStart) + return max(0.0, Log.timestamp() - logState.timestampStart) } private static func timeAbsolute() -> String { let timestamp = Log.timeFromStart() - let date = Date(timeInterval: timestamp, since: Log.timestampStartDate) - return timestampFormatter.string(from: date) + let date = Date(timeInterval: timestamp, since: logState.timestampStartDate) + return logState.timestampFormatter.string(from: date) } private static func formatTimestamp() -> String { var timestamp = "" - if Log.timestamps { - if Log.absoluteTimestamps { + if logState.timestamps { + if logState.absoluteTimestamps { timestamp = Log.timeAbsolute() + " " } else { timestamp = String(format: "%.3fs ", Log.timeFromStart()) @@ -258,21 +294,8 @@ class Log { return timestamp } - // need timestamps in other parts of the app static func timestamp() -> Double { - if Log.timestampToSeconds == 0 { - // Cache the conversion. Note that clock rate can change with cpu throttling. - // These are high-resolution timestamps taken from the system timer. - var info = mach_timebase_info(numer: 0, denom: 0) - mach_timebase_info(&info) - let numer = Double(info.numer) - let denom = Double(info.denom) - Log.timestampToSeconds = 1e-9 * (numer / denom) // inverse so we can multiply - } - - let timestamp = Double(mach_absolute_time()) - let time = timestamp * Log.timestampToSeconds - return time + return logState.timestamp() } } diff --git a/kram-profile/kram-profile/kram_profile.entitlements b/kram-profile/kram-profile/kram_profile.entitlements index 625af03d..e00d8415 100644 --- a/kram-profile/kram-profile/kram_profile.entitlements +++ b/kram-profile/kram-profile/kram_profile.entitlements @@ -3,7 +3,7 @@ com.apple.security.app-sandbox - + com.apple.security.files.user-selected.read-only com.apple.security.network.client diff --git a/kram-profile/kram-profile/kram_profileApp.swift b/kram-profile/kram-profile/kram_profileApp.swift index 59937398..7e0e4e0f 100644 --- a/kram-profile/kram-profile/kram_profileApp.swift +++ b/kram-profile/kram-profile/kram_profileApp.swift @@ -38,8 +38,13 @@ import UniformTypeIdentifiers // Build traces // DONE: build hierarchy and self times // DONE: background process to compute buildTimings across all files -// TODO: parse totals from build traces, what CBA is doing -// TODO: present total time, and % of total in the nav panel +// DONE: add a total time, and show that in the nav panel, and % of total +// then know for a summary what the total time spend compiling is. 
+// DONE: parse instantiateFunction totals from build traces, what CBA is doing +// avoid InstatiateClass since it's a child +// DONE: parse optFunction totals from build traces, what CBA is doing +// TODO: duration may not updating properly when doing Reload on loose files, but thought this fixed +// TODO: add children of each archive, so those show in the list and can collapse // Perf traces // TODO: build per-thread hierarchy and self times @@ -51,10 +56,20 @@ import UniformTypeIdentifiers // DONE: add/update recent document list (need to hold onto dropped/opened folder) // DONE: can't mmap web link, but can load zip off server with timings +// DONE: drop anything in the build report < 1% in one sub-track +// Could display on subtract but would have to fit under the parent timing (but should). +// Could stop when remaining total child time < parent. +// This stops the long tail. So just total the results, and start setting ts once reached. + // TODO: run cba on files, mmap and decompress each can use incremental mode? // TODO: save/load the duration and modstamps for File at quit, and any other metadata (totals per section) // TODO: add jump to source/header, but path would need to be correct (sandbox block?) +// TODO: look into fast crc32 ops on M1 +// can use this on loose fils as a hash, and also compare to zip files +// already have a crc32 in the zip lib +// https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/ + // Build traces // DONE: OptFunction needs demangled. All backend strings are still mangled. // Don’t need the library CBA uses just use api::__cxa_demangle() on macOS. @@ -66,7 +81,7 @@ import UniformTypeIdentifiers // table if global would need to use same index across all files. // Can rebuild references on JS side to send less data. JS can then alias strings ftw. // Just add special ph type that is ignored by web to specify the alias. -// TODO: work on sending a more efficient form. Could use Perfetto SDK to write to prototbuf. The Catapult json format is overly verbose. Need some thread and scope strings, some open/close timings that reference a scope string and thread. +// TODO: work on sending a more efficient form. Could use Perfetto SDK to write to prototbuf. The perfetto json format is overly verbose. Need some thread and scope strings, some open/close timings that reference a scope string and thread. // TODO: add compressed format, build up Pefetto json or binary from this // may need one for mmap, other for super compact deltas // can still alias strings from mmap @@ -106,6 +121,12 @@ import UniformTypeIdentifiers // only then can data be handed off toe Pefertto or CBA. And CBA needs all files. // Maybe extend CBA to read a zip file. Can just use ZipHelper. +// TODO: use refreshable on the list to allow await on an async tasks +// so could refresh the durations off that. +// TODO: passing children field to the List ctor creates a hierarchical list. +// so could have dropped file, archive, folder in the list to collapse the view +// Each file array would be a child. Parent would be clearer then. + // TODO: fix duration update modding the List item and nav title after it updates // Currently select list, it updates, then duration is calcualated. // there is a objectWillChange.send() Could maybe send that from The File @@ -274,11 +295,12 @@ extension String.StringInterpolation { } /// Formats the *elapsed time* since the specified start time. 
- mutating func appendInterpolation(timeSince startTime: TimeInterval, decimals: UInt = 2) { - let elapsedTime = CACurrentMediaTime() - startTime - let elapsedTimeDescription = String(format: "%.\(decimals)fs", elapsedTime) - appendLiteral(elapsedTimeDescription) - } +// don't use logging for profiling +// mutating func appendInterpolation(timeSince startTime: TimeInterval, decimals: UInt = 2) { +// let elapsedTime = CACurrentMediaTime() - startTime +// let elapsedTimeDescription = String(format: "%.\(decimals)fs", elapsedTime) +// appendLiteral(elapsedTimeDescription) +// } } /* usage @@ -368,7 +390,7 @@ class MyWebView : WKWebView { */ } - +@MainActor func newWebView(request: URLRequest) -> WKWebView { // set preference to run javascript on the view, can then do PostMessage let preferences = WKPreferences() @@ -470,7 +492,7 @@ struct MTKViewWrapper: NSViewRepresentable { */ // https to work for some reason, but all data is previewed locally -var ORIGIN = "https://ui.perfetto.dev" +let ORIGIN = "https://ui.perfetto.dev" // https://gist.github.com/pwightman/64c57076b89c5d7f8e8c extension String { @@ -542,6 +564,8 @@ func filenameToTimeRange(_ filename: String) -> TimeRange { case .Unknown: duration = 1.0 } + duration = 10.0 + return TimeRange(timeStart:0.0, timeEnd:duration) } @@ -566,6 +590,7 @@ func buildTimeRangeJson(_ timeRange:TimeRange) -> String? { let script = """ var objTime = { perfetto:{ + keepApiOpen: true, timeStart:\(timeStartInt)n, timeEnd:\(timeEndInt)n, viewPercentage:\(timeRange.viewPercentage) @@ -627,7 +652,8 @@ func showTimeRangeJS(objTimeScript: String) -> String? { return objTimeScript + script } -struct CatapultEvent: Codable { +// These are really json events from perfetto profile. +struct PerfettoEvent: Codable { var cat: String? var pid: Int? var tid: Int? @@ -670,10 +696,10 @@ struct CatapultEvent: Codable { } } -struct CatapultProfile: Codable { - var traceEvents: [CatapultEvent]? +struct PerfettoProfile: Codable { + var traceEvents: [PerfettoEvent]? - // not a part of the Catapult spec, but clang writes this when it zeros + // not a part of the perfetto spec, but clang writes this when it zeros // out the startTime var beginningOfTime: Int? } @@ -742,14 +768,117 @@ class ThreadInfo : Hashable, Equatable, Comparable { } +class BuildFunctionTiming { + var count = 0 + var duration = 0 + + func combine(_ duration: Int) { + self.duration += duration + self.count += 1 + } + func combine(_ timing: BuildFunctionTiming) { + self.duration += timing.duration + self.count += timing.count + } +} + +class BuildFunctionTimings { + var optFunctions: [String:BuildFunctionTiming] = [:] + var instantiateFunctions: [String:BuildFunctionTiming] = [:] + + func combine(_ event: PerfettoEvent) { + if event.name == "OptFunction" { + let detail = event.args!["detail"]!.value as! String + let dur = event.dur! + + // With classes need to create a new one to combine into + if let f = optFunctions[detail] { + f.combine(dur) + } + else { + let f = BuildFunctionTiming() + f.combine(dur) + optFunctions[detail] = f + } + } + else if event.name == "InstantiateFunction" { + let detail = event.args!["detail"]!.value as! String + let dur = event.dur! 
+ + if let f = instantiateFunctions[detail] { + f.combine(dur) + } + else { + let f = BuildFunctionTiming() + f.combine(dur) + instantiateFunctions[detail] = f + } + } + } + + func combine(_ timings: BuildFunctionTimings, _ collapseNames: Bool = false) { + for pair in timings.optFunctions { + var detail = pair.key + let timing = pair.value + + // go out to CBA to collapse the names + if collapseNames { + // skip non-templates + if detail.firstIndex(of: "<") == nil { continue } + + if let newDetail = collapseFunctionName(detail) { + detail = String(cString: newDetail) + } + } + + if let f = optFunctions[detail] { + f.combine(timing) + } + else { + let f = BuildFunctionTiming() + f.combine(timing) + optFunctions[detail] = f + } + } + for pair in timings.instantiateFunctions { + var detail = pair.key + let timing = pair.value + + // go out to CBA to collapse the names + if collapseNames { + // skip non-templates + if detail.firstIndex(of: "<") == nil { continue } + + if let newDetail = collapseFunctionName(detail) { + detail = String(cString: newDetail) + } + } + + if let f = instantiateFunctions[detail] { + f.combine(timing) + } + else { + let f = BuildFunctionTiming() + f.combine(timing) + instantiateFunctions[detail] = f + } + } + } + + func reset() { + optFunctions.removeAll() + instantiateFunctions.removeAll() + } +} + // Could also process each build timings in a threaded task. That what CBA is doing. class BuildTiming: NSCopying { - var name = "" + var name = "" // duped with key of map + var type = "" var count = 0 var duration = 0 var durationSub = 0 var durationSelf: Int { return max(0, duration - durationSub) } - var type = "" func combine(_ duration: Int, _ durationSub: Int) { self.duration += duration @@ -757,6 +886,12 @@ class BuildTiming: NSCopying { self.count += 1 } + func combine(_ timing: BuildTiming) { + self.duration += timing.duration + self.durationSub += timing.durationSub + self.count += timing.count + } + // This is annoying in Swift func copy(with zone: NSZone? = nil) -> Any { let copy = BuildTiming() @@ -769,7 +904,7 @@ class BuildTiming: NSCopying { } } -func updateFileBuildTimings(_ events: [CatapultEvent]) -> [String:BuildTiming] { +func updateFileBuildTimings(_ events: [PerfettoEvent]) -> [String:BuildTiming] { var buildTimings: [String:BuildTiming] = [:] // DONE: would be nice to compute the self times. This involves @@ -778,7 +913,7 @@ func updateFileBuildTimings(_ events: [CatapultEvent]) -> [String:BuildTiming] { // See what CBA and Perfetto do to establish this. // Would be good to establish this nesting once and store the level - // with each event.d + // with each event. // run through each file, and build a local map of name to size count for i in 0.. [String:BuildTiming] { return buildTimings } +@MainActor func findFilesForBuildTimings(files: [File], selection: String) -> [File] { let selectedFile = lookupFile(url:URL(string:selection)!) let isArchive = selectedFile.archive != nil @@ -828,7 +964,23 @@ func postBuildTimingsReport(files: [File]) -> String? 
{ if buildTimings.isEmpty { return nil } let buildStats = mergeFileBuildStats(files:files) - let buildJsonBase64 = generateBuildReport(buildTimings: buildTimings, buildStats: buildStats) + // merge the function stats + // TODO: could to more to highlight and crunch template strings + let buildFunctionTimings = BuildFunctionTimings() + for file in files { + buildFunctionTimings.combine(file.buildFunctionTimings) + } + + // Compute more consolidation by collapsing names + let buildTemplateFunctionTimings = BuildFunctionTimings() + buildTemplateFunctionTimings.combine(buildFunctionTimings, true) + + let buildJsonBase64 = generateBuildReport( + buildTimings: buildTimings, + buildFunctionTimings: buildFunctionTimings, + buildTemplateFunctionTimings: buildTemplateFunctionTimings, + buildStats: buildStats) + let buildJS = postLoadFileJS(fileContentBase64: buildJsonBase64, title: "BuildTimings") return buildJS } @@ -836,7 +988,9 @@ func postBuildTimingsReport(files: [File]) -> String? { func mergeFileBuildStats(files: [File]) -> BuildStats { let buildStats = BuildStats() for file in files { - buildStats.combine(file.buildStats) + if file.buildStats != nil { + buildStats.combine(file.buildStats!) + } } buildStats.frontendStart = 0 @@ -857,12 +1011,13 @@ func mergeFileBuildTimings(files: [File]) -> [String:BuildTiming] { for file in files { // merge and combine duplicates for buildTiming in file.buildTimings { - if buildTimings[buildTiming.key] == nil { - buildTimings[buildTiming.key] = (buildTiming.value.copy() as! BuildTiming) + let v = buildTiming.value + if let bt = buildTimings[buildTiming.key] { + bt.combine(v.duration, v.durationSub) } else { - let v = buildTiming.value - buildTimings[buildTiming.key]!.combine(v.duration, v.durationSub) + // need to copy to setup name/type + buildTimings[buildTiming.key] = (v.copy() as! BuildTiming) } } // buildTimings.merge didn't work, combine src values @@ -871,23 +1026,52 @@ func mergeFileBuildTimings(files: [File]) -> [String:BuildTiming] { return buildTimings } -func generateBuildReport(buildTimings: [String:BuildTiming], buildStats: BuildStats) -> String { +func generateBuildReport(buildTimings: [String:BuildTiming], + buildFunctionTimings: BuildFunctionTimings, + buildTemplateFunctionTimings: BuildFunctionTimings, + buildStats: BuildStats) -> String +{ // now convert those timings back into a perfetto displayable report // So just need to build up the json above into events on tracks - var events: [CatapultEvent] = [] + var events: [PerfettoEvent] = [] // Also sort or assign a sort_index to the tracks. Sort biggest to smallest. // Make the threadName for the track be the short filename. // add the thread names, only using 3 threads if true { - let names = ["ParseTime", "ParseCount", "ParseSelf", "OptimizeTime"] + let names = ["ParseTime", "ParseCount", "ParseSelf", + "OptimizeTime", + "InstFunc", "OptimizeFunc", + "InstTplFunc", "InstTplCount", "OptimizeTplFunc" + ] for i in 0.., but having trouble with lookup var threadInfos: [Int: ThreadInfo] = [:] - for i in 0.. Double { +func updateDuration(_ events: [PerfettoEvent]) -> Double { var startTime = Int.max var endTime = Int.min for i in 0.. Double { // After calling this, can compute the self time, and have the parent hierarchy to draw // events as a flamegraph. 
-func computeEventParentsAndDurSub(_ events: inout [CatapultEvent]) { +func computeEventParentsAndDurSub(_ events: inout [PerfettoEvent]) { // see CBA FindParentChildrenIndices for the adaption here // Clang Build Analyzer https://github.com/aras-p/ClangBuildAnalyzer // SPDX-License-Identifier: Unlicense // https://github.com/aras-p/ClangBuildAnalyzer/blob/main/src/Analysis.cpp // copy the events, going to replace this array with more data - //var events = catapultProfile.traceEvents! + //var events = perfettoProfile.traceEvents! var sortedIndices: [Int] = [] for i in 0.. Double { + return time + } + + func restart() { + if time > 0.0 { + time = -Timer.getTime() + } + } + + func start() { + if time > 0.0 { + time -= Timer.getTime() + } + } + + func stop() { + if time < 0.0 { + time += Timer.getTime() + } + } + + private static func getTime() -> Double { + return Double(mach_absolute_time()) * kTickToSeconds + } + + private static func updateTimebase() -> Double { + var machTimebase = mach_timebase_info(numer: 0, denom: 0) + mach_timebase_info(&machTimebase) + + // AS = 125/3, Intel = 1/1 + return 1e-9 * Double(machTimebase.numer) / Double(machTimebase.denom) + } } +// Fire this off any time the list changes and there +// are build events in it. This will update the data within, +// so that the user doesn't have to visit every file manually. func updateBuildTimingsTask(_ files: [File]) { // Can use counter for progress. Could add up size instead of just count. var counter = 0 @@ -1243,13 +1523,34 @@ func updateBuildTimingsTask(_ files: [File]) { if counter == 0 { return } + #if false - updateTimebase() + + let backgroundTaskGroup = await withTaskGroup(of: Void.self) { group in + for file in files { + if file.fileType == .Build { + _ = group.addTaskUnlessCancelled { + guard Task.isCancelled == false else { return } + + do { + try await updateBuildTimingTask(file) + } + catch { + log.error(error.localizedDescription) + } + + } + } + } + } + + #else let _ = Task(priority: .medium, operation: { - var time = -Double(mach_absolute_time()) * kTickToSeconds + let timer = Timer() for file in files { + if file.fileType == .Build { do { try updateBuildTimingTask(file) @@ -1260,12 +1561,14 @@ func updateBuildTimingsTask(_ files: [File]) { } } - time += Double(mach_absolute_time()) * kTickToSeconds - log.info("finished updating build timings in \(double:time, decimals:3)s") + timer.stop() + log.info("finished updating build timings in \(double:timer.timeElapsed(), decimals:3)s") }) -} -func updateBuildTimingTask(_ file: File) throws { + #endif +} + +func updateBuildTimingTask(_ file: File) /*async */ throws { assert(file.fileType == .Build) // only checking this, and not duration == 0 @@ -1290,21 +1593,54 @@ func updateBuildTimingTask(_ file: File) throws { } let decoder = JSONDecoder() - let catapultProfile = try decoder.decode(CatapultProfile.self, from: json) - if catapultProfile.traceEvents == nil { // an array + let perfettoProfile = try decoder.decode(PerfettoProfile.self, from: json) + if perfettoProfile.traceEvents == nil { // an array return } - var events = catapultProfile.traceEvents! + var events = perfettoProfile.traceEvents! // demangle the OptFunction name for i in 0.. 
BuildStats { +func generateStatsForTotalTrack(_ events: [PerfettoEvent]) -> BuildStats { let stats = BuildStats() // useful totals to track, many more in the files @@ -1420,6 +1778,9 @@ func generateStatsForTotalTrack(_ events: [CatapultEvent]) -> BuildStats { else if event.name == "Total CodeGen Function" { stats.totalCodeGenFunction = event.dur! } + else if event.name == "Total DebugType" { + stats.totalDebugType = event.dur! + } // backend else if event.name == "Total Backend" { @@ -1449,38 +1810,40 @@ func generateStatsForTotalTrack(_ events: [CatapultEvent]) -> BuildStats { return stats } -func convertStatsToTotalTrack(_ stats: BuildStats) -> [CatapultEvent] { +func convertStatsToTotalTrack(_ stats: BuildStats) -> [PerfettoEvent] { - var totalEvents: [CatapultEvent] = [] + var totalEvents: [PerfettoEvent] = [] - // This is really ugly, having these be a struct + // This is really ugly, change to using class? let tid = 0 - let trackEvent = CatapultEvent(tid: tid, threadName: "Build Totals") + let trackEvent = PerfettoEvent(tid: tid, threadName: "Build Totals") totalEvents.append(trackEvent) // This is a struct, so can modify copy and add - var event: CatapultEvent + var event: PerfettoEvent - func makeDurEvent(_ tid: Int, _ name: String, _ dur: Int, _ total: Int) -> CatapultEvent { + func makeDurEvent(_ tid: Int, _ name: String, _ dur: Int, _ total: Int) -> PerfettoEvent { let percent = 100.0 * Double(dur) / Double(total) - return CatapultEvent(tid, "\(name) \(double:percent, decimals:0)%", dur) + return PerfettoEvent(tid, "\(double:percent, decimals:0)% \(name)", dur) } let total = stats.totalExecuteCompiler - event = makeDurEvent(tid, "Total ExecuteCompiler", stats.totalExecuteCompiler, total) + // Removed Total from all these strings + + event = makeDurEvent(tid, "ExecuteCompiler", stats.totalExecuteCompiler, total) totalEvents.append(event) - event = makeDurEvent(tid, "Total Frontend", stats.totalFrontend, total) + event = makeDurEvent(tid, "Frontend", stats.totalFrontend, total) event.ts = stats.frontendStart totalEvents.append(event) // sub-areas of frontend - event = makeDurEvent(tid, "Total Source", stats.totalSource, total) + event = makeDurEvent(tid, "Source", stats.totalSource, total) event.ts = stats.frontendStart totalEvents.append(event) - event = makeDurEvent(tid, "Total InstantiateFunction", stats.totalInstantiateFunction, total) + event = makeDurEvent(tid, "InstantiateFunction", stats.totalInstantiateFunction, total) event.ts = stats.frontendStart + stats.totalSource totalEvents.append(event) @@ -1492,32 +1855,53 @@ func convertStatsToTotalTrack(_ stats: BuildStats) -> [CatapultEvent] { // This overlaps with some Source, and some InstantiateFunction, so it's sort of double // counted, so clamp it for now so Perfetto doesn't freak out and get the event order wrong. 
- event = makeDurEvent(tid, "Total InstantiateClass", totalInstantiateClass, total) + event = makeDurEvent(tid, "InstantiateClass", totalInstantiateClass, total) event.ts = stats.frontendStart + stats.totalSource totalEvents.append(event) - event = makeDurEvent(tid, "Total CodeGen Function", stats.totalCodeGenFunction, total) - event.ts = stats.frontendStart + stats.totalSource + stats.totalInstantiateFunction + // This total can exceed when backend start, so clamp it too + let tsCodeGenFunction = stats.frontendStart + stats.totalSource + stats.totalInstantiateFunction + + var totalCodeGenFunction = stats.totalCodeGenFunction + if tsCodeGenFunction + totalCodeGenFunction > stats.backendStart { + totalCodeGenFunction = stats.backendStart - tsCodeGenFunction + } + + event = makeDurEvent(tid, "CodeGen Function", totalCodeGenFunction, total) + event.ts = tsCodeGenFunction + totalEvents.append(event) + + + // can gen a lot of debug types, and clamp to backend + let tsDebugType = tsCodeGenFunction + totalCodeGenFunction + + var totalDebugType = stats.totalDebugType + if totalDebugType + totalDebugType > stats.backendStart { + totalDebugType = stats.backendStart - tsDebugType + } + + event = makeDurEvent(tid, "Debug", totalDebugType, total) + event.ts = tsDebugType totalEvents.append(event) // backend - event = makeDurEvent(tid, "Total Backend", stats.totalBackend, total) + event = makeDurEvent(tid, "Backend", stats.totalBackend, total) event.ts = stats.backendStart totalEvents.append(event) - event = makeDurEvent(tid, "Total Optimizer", stats.totalOptimizer, total) + event = makeDurEvent(tid, "Optimizer", stats.totalOptimizer, total) event.ts = stats.backendStart totalEvents.append(event) - // event = makeDurEvent(tid, "Total OptModule", stats.totalOptModule, total) + // event = makeDurEvent(tid, "OptModule", stats.totalOptModule, total) // event.ts = stats.backendStart + stats.totalOptimizer // totalEvents.append(event) - event = makeDurEvent(tid, "Total CodeGenPasses", stats.totalCodeGenPasses, total) + event = makeDurEvent(tid, "CodeGenPasses", stats.totalCodeGenPasses, total) event.ts = stats.backendStart + stats.totalOptimizer totalEvents.append(event) - event = makeDurEvent(tid, "Total OptFunction", stats.totalOptFunction, total) + event = makeDurEvent(tid, "OptFunction", stats.totalOptFunction, total) event.ts = stats.backendStart + stats.totalOptimizer totalEvents.append(event) @@ -1540,14 +1924,15 @@ func convertStatsToTotalTrack(_ stats: BuildStats) -> [CatapultEvent] { return totalEvents } -func loadFileJS(_ path: String) -> String? { - - let fileURL = URL(string: path)! - - // Note may need to modify directly - var file = lookupFile(url: fileURL) - - log.debug(path) +func replaceFunctionNamespaces(_ detail: inout String) { + // replace namespaces in the detail + let namespaces = ["std::", "kram::", "eastl::"] + for namespace in namespaces { + detail = detail.replacing(namespace, with:"") + } +} + +func loadFileJS(_ file: File) -> String? { do { // use this for binary data, but need to fixup some json before it's sent @@ -1589,23 +1974,23 @@ func loadFileJS(_ path: String) -> String? 
{ // walk the file and compute the duration if we don't already have it if isJson && file.duration == 0.0 { let decoder = JSONDecoder() - let catapultProfile = try decoder.decode(CatapultProfile.self, from: fileContent) + let perfettoProfile = try decoder.decode(PerfettoProfile.self, from: fileContent) - if catapultProfile.traceEvents == nil { + if perfettoProfile.traceEvents == nil { return nil } - file.duration = updateDuration(catapultProfile.traceEvents!) + file.duration = updateDuration(perfettoProfile.traceEvents!) // For now, just log the per-thread info if file.fileType == .Memory { - updateThreadInfo(catapultProfile, &file) + updateThreadInfo(perfettoProfile, file) } - // This mods the catapult profile to store parentIndex and durSub + // This mods the perfetto profile to store parentIndex and durSub // the call has build specific code right now //else if file.fileType == .Perf { - // computeEventParentsAndDurSub(&catapultProfile) + // computeEventParentsAndDurSub(&perfettoProfile) //} } } @@ -1635,28 +2020,32 @@ func loadFileJS(_ path: String) -> String? { // here having to ungzip and decode just to display the content // have already processed the build files in an async task let decoder = JSONDecoder() - var catapultProfile = try decoder.decode(CatapultProfile.self, from: json) + var perfettoProfile = try decoder.decode(PerfettoProfile.self, from: json) - if catapultProfile.traceEvents == nil { + if perfettoProfile.traceEvents == nil { return nil } // demangle the OptFunction name - for i in 0.. String? { // stupid immutable struct. Makes this code untempable // maybe can use class instead of struct? - catapultProfile.traceEvents![i].name = url.lastPathComponent + perfettoProfile.traceEvents![i].name = url.lastPathComponent } else if event.name == "InstantiateFunction" || - event.name == "InstantiateClass" || + event.name == "CodeGen Function" || event.name == "OptFunction" || + event.name == "InstantiateClass" || event.name == "ParseClass" || event.name == "DebugType" || // these take a while - event.name == "CodeGen Function" || event.name == "RunPass" { + // Note: instantiationFunction/Class are nested + // so really only want to track times on the top call. + // This is a symbol name - let detail = event.args!["detail"]!.value as! String - catapultProfile.traceEvents![i].name = detail + var detail = event.args!["detail"]!.value as! String + + // replace namespaces in the detail + replaceFunctionNamespaces(&detail) + + perfettoProfile.traceEvents![i].name = detail } // knock out the pid. There are "M" events setting the process_name // Otherwise, display will collapse pid sections since totalTrack has no pid - catapultProfile.traceEvents![i].pid = nil + perfettoProfile.traceEvents![i].pid = nil } if file.buildStats != nil { let totalEvents = convertStatsToTotalTrack(file.buildStats!) // combine these onto the end, could remove the individual tracks storing these - catapultProfile.traceEvents! += totalEvents + perfettoProfile.traceEvents! 
+= totalEvents } let encoder = JSONEncoder() - let fileContentFixed = try encoder.encode(catapultProfile) + let fileContentFixed = try encoder.encode(perfettoProfile) // gzip compress the data before sending it over guard let compressedData = fileContentFixed.gzip() else { return nil } fileContentBase64 = compressedData.base64EncodedString() } - return postLoadFileJS(fileContentBase64: fileContentBase64, title:fileURL.lastPathComponent) + return postLoadFileJS(fileContentBase64: fileContentBase64, title:file.name) } catch { log.error(error.localizedDescription) @@ -1946,17 +2342,19 @@ struct kram_profileApp: App { func openFileSelection(_ webView: WKWebView) { if let sel = selection { + let file = lookupFile(selection: sel) + // This should only reload if selection previously loaded // to a valid file, or if modstamp changed on current selection // TODO: fix this let objTimeScript: String? = nil // buildTimeRangeJson(filenameToTimeRange(sel)) - var str = loadFileJS(sel) + var str = loadFileJS(file) if str != nil { runJavascript(webView, str!) - let file = lookupFile(selection: sel) + // This means Perfetto UI loaded the fileContent, not that fileContent was loaded file.setLoadStamp() } @@ -2201,7 +2599,7 @@ A tool to help profile mem, perf, and builds. openFileSelection(myWebView) //focusedField = .webView } - .navigationTitle(generateNavigationTitle(selection)) + .navigationTitle(generateNavigationTitle(selection, fileSearcher.files)) .onOpenURL { url in openFileFromURLs(urls: [url]) //focusedField = .webView @@ -2326,6 +2724,60 @@ A tool to help profile mem, perf, and builds. } .disabled(selection == nil) + Button("Build Report CBA") { + // DONE: have to reload fileContent if new zip is + // loaded. The buildTiming data is still cached, but the fileContent + // is nulled out and must be reloaded. + + let buildFiles = findFilesForBuildTimings(files: fileSearcher.files, selection: selection!) + + if buildFiles.isEmpty { return } + + var fileDatas: [Data] = [] + var filenames: [String] = [] + + for file in buildFiles { + var fileContent = file.fileContent + + if fileContent == nil { + fileContent = loadFileContent(file) + + // skip it + if fileContent == nil { + continue + } + } + + fileDatas.append(fileContent!) + filenames.append(file.url.absoluteString) + } + // Extract the fileContent and names. This avoids CBA needing to do IO. + // But CBA is reparsing all of the json in C++ to build up its tables. + // Also demangling names again. And unlike the build report which is + // cached per file, this is doing all on main thread. + + let timer = Timer() + + // TODO: call parse from the buildTimings task. Then it's done in + // background and cached. A new file should use the same unique filename. + // This this needs to analyze only specific files passed to analyze. + let cba = CBA() + cba.parseAll(fileDatas, filenames: filenames) + let cbaReport = cba.analyzeAll() + + timer.stop() + log.info("finished updating CBA timings in \(double:timer.timeElapsed(), decimals:3)s") + + // Can't use log here, since it's not setup to chop up long + // strings by newlines yet. Print doesn't go to console + // so this string is only shown if app run from debugger. + // TODO: use popover like info button. 
+ + // print(cbaReport) looks terrible, so have to format it + print("\(cbaReport)") + } + .disabled(selection == nil) + // must call through NSWindow Button("See Below") { diff --git a/kram-thumb-win/CMakeLists.txt b/kram-thumb-win/CMakeLists.txt index 2ed0d934..7f263221 100644 --- a/kram-thumb-win/CMakeLists.txt +++ b/kram-thumb-win/CMakeLists.txt @@ -1,4 +1,4 @@ - + # dll output can be renamed for debug vs. release, but is hard to debug set(myTargetLib kram-thumb-win) @@ -17,12 +17,18 @@ set(SOURCE_FILES # Module is a DLL library add_library(${myTargetLib} MODULE ${SOURCE_FILES}) +# This doesn't work for dll based viewer +# Use the static linked libs, or the exe needs the VCRuntimeDLL installed +# This has to occur after library defined above. +# set_property(TARGET ${myTargetLib} PROPERTY +# MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + # to turn off exceptions/rtti use /GR and /EHsc replacement string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "/EHsc" "/EHs-c-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -# all warnings, AVX1, and multiprocess compiles -target_compile_options(${myTargetLib} PRIVATE /W3 /arch:AVX -mf16c /MP /GF /FC) +# all warnings, AVX2, and multiprocess compiles +target_compile_options(${myTargetLib} PRIVATE /W3 -march=haswell -mf16c -mfma /MP /GF /FC) target_compile_definitions(${myTargetLib} PRIVATE -D_ITERATOR_DEBUG_LEVEL=0 -D_HAS_EXCEPTIONS=0 -DUNICODE -D_UNICODE) diff --git a/kram-thumb-win/Dll.cpp b/kram-thumb-win/Dll.cpp index 7d13f5ac..267e3c5e 100644 --- a/kram-thumb-win/Dll.cpp +++ b/kram-thumb-win/Dll.cpp @@ -1,19 +1,20 @@ // based on QOI Thumbnail Provider for Windows Explorer // Written by iOrange in 2021 -// +// // Based on Microsoft's example // https://github.com/microsoft/windows-classic-samples/tree/main/Samples/Win7Samples/winui/shell/appshellintegration/RecipeThumbnailProvider -// +// // Also more info here: // https://docs.microsoft.com/en-us/previous-versions/windows/desktop/legacy/cc144118(v=vs.85) #include +#include // For SHChangeNotify #include #include // For IThumbnailProvider. 
-#include // For SHChangeNotify -#include + #include -#include // For std::size +#include +#include // For std::size // from KramThumbProvider.cpp extern HRESULT KramThumbProvider_CreateInstance(REFIID riid, void** ppv); @@ -27,49 +28,53 @@ extern HRESULT KramThumbProvider_CreateInstance(REFIID riid, void** ppv); #define SZ_CLSID_KramTHUMBHANDLER L"{a9a47ef5-c238-42a9-a4e6-a85558811dac}" constexpr CLSID kCLSID_KramThumbHandler = {0xa9a47ef5, 0xc238, 0x42a9, {0xa4, 0xe6, 0xa8, 0x55, 0x58, 0x81, 0x1d, 0xac}}; - -typedef HRESULT(*PFNCREATEINSTANCE)(REFIID riid, void** ppvObject); +typedef HRESULT (*PFNCREATEINSTANCE)(REFIID riid, void** ppvObject); struct CLASS_OBJECT_INIT { - const CLSID* pClsid; - PFNCREATEINSTANCE pfnCreate; + const CLSID* pClsid; + PFNCREATEINSTANCE pfnCreate; }; // add classes supported by this module here constexpr CLASS_OBJECT_INIT kClassObjectInit[] = { - { &kCLSID_KramThumbHandler, KramThumbProvider_CreateInstance } -}; + {&kCLSID_KramThumbHandler, KramThumbProvider_CreateInstance}}; - -std::atomic_long gModuleReferences(0); -HINSTANCE gModuleInstance = nullptr; +std::atomic_long gModuleReferences(0); +HINSTANCE gModuleInstance = nullptr; // Standard DLL functions -STDAPI_(BOOL) DllMain(HINSTANCE hInstance, DWORD dwReason, void*) { +STDAPI_(BOOL) +DllMain(HINSTANCE hInstance, DWORD dwReason, void*) +{ if (DLL_PROCESS_ATTACH == dwReason) { gModuleInstance = hInstance; ::DisableThreadLibraryCalls(hInstance); - } else if (DLL_PROCESS_DETACH == dwReason) { + } + else if (DLL_PROCESS_DETACH == dwReason) { gModuleInstance = nullptr; } return TRUE; } -STDAPI DllCanUnloadNow() { +STDAPI DllCanUnloadNow() +{ // Only allow the DLL to be unloaded after all outstanding references have been released return (gModuleReferences > 0) ? S_FALSE : S_OK; } -void DllAddRef() { +void DllAddRef() +{ ++gModuleReferences; } -void DllRelease() { +void DllRelease() +{ --gModuleReferences; } class CClassFactory : public IClassFactory { public: - static HRESULT CreateInstance(REFCLSID clsid, const CLASS_OBJECT_INIT* pClassObjectInits, size_t cClassObjectInits, REFIID riid, void** ppv) { + static HRESULT CreateInstance(REFCLSID clsid, const CLASS_OBJECT_INIT* pClassObjectInits, size_t cClassObjectInits, REFIID riid, void** ppv) + { *ppv = NULL; HRESULT hr = CLASS_E_CLASSNOTAVAILABLE; for (size_t i = 0; i < cClassObjectInits; ++i) { @@ -87,29 +92,34 @@ class CClassFactory : public IClassFactory { } CClassFactory(PFNCREATEINSTANCE pfnCreate) - : mReferences(1) - , mCreateFunc(pfnCreate) { + : mReferences(1), mCreateFunc(pfnCreate) + { DllAddRef(); } - virtual ~CClassFactory() { + virtual ~CClassFactory() + { DllRelease(); } // IUnknown - IFACEMETHODIMP QueryInterface(REFIID riid, void** ppv) { + IFACEMETHODIMP QueryInterface(REFIID riid, void** ppv) + { static const QITAB qit[] = { QITABENT(CClassFactory, IClassFactory), - { 0 } - }; + {0}}; return QISearch(this, qit, riid, ppv); } - IFACEMETHODIMP_(ULONG) AddRef() { + IFACEMETHODIMP_(ULONG) + AddRef() + { return ++mReferences; } - IFACEMETHODIMP_(ULONG) Release() { + IFACEMETHODIMP_(ULONG) + Release() + { const long refs = --mReferences; if (!refs) { delete this; @@ -118,38 +128,43 @@ class CClassFactory : public IClassFactory { } // IClassFactory - IFACEMETHODIMP CreateInstance(IUnknown* punkOuter, REFIID riid, void** ppv) { + IFACEMETHODIMP CreateInstance(IUnknown* punkOuter, REFIID riid, void** ppv) + { return punkOuter ? 
CLASS_E_NOAGGREGATION : mCreateFunc(riid, ppv); } - IFACEMETHODIMP LockServer(BOOL fLock) { + IFACEMETHODIMP LockServer(BOOL fLock) + { if (fLock) { DllAddRef(); - } else { + } + else { DllRelease(); } return S_OK; } private: - std::atomic_long mReferences; - PFNCREATEINSTANCE mCreateFunc; + std::atomic_long mReferences; + PFNCREATEINSTANCE mCreateFunc; }; -STDAPI DllGetClassObject(REFCLSID clsid, REFIID riid, void** ppv) { +STDAPI DllGetClassObject(REFCLSID clsid, REFIID riid, void** ppv) +{ return CClassFactory::CreateInstance(clsid, kClassObjectInit, std::size(kClassObjectInit), riid, ppv); } // A struct to hold the information required for a registry entry struct REGISTRY_ENTRY { - HKEY hkeyRoot; + HKEY hkeyRoot; PCWSTR pszKeyName; PCWSTR pszValueName; PCWSTR pszData; }; // Creates a registry key (if needed) and sets the default value of the key -HRESULT CreateRegKeyAndSetValue(const REGISTRY_ENTRY* pRegistryEntry) { +HRESULT CreateRegKeyAndSetValue(const REGISTRY_ENTRY* pRegistryEntry) +{ HKEY hKey; HRESULT hr = HRESULT_FROM_WIN32(RegCreateKeyExW(pRegistryEntry->hkeyRoot, pRegistryEntry->pszKeyName, @@ -166,28 +181,30 @@ HRESULT CreateRegKeyAndSetValue(const REGISTRY_ENTRY* pRegistryEntry) { } // Registers this COM server -STDAPI DllRegisterServer() { +STDAPI DllRegisterServer() +{ HRESULT hr; - WCHAR szModuleName[MAX_PATH] = { 0 }; + WCHAR szModuleName[MAX_PATH] = {0}; if (!GetModuleFileNameW(gModuleInstance, szModuleName, ARRAYSIZE(szModuleName))) { hr = HRESULT_FROM_WIN32(GetLastError()); - } else { + } + else { // List of registry entries we want to create const REGISTRY_ENTRY registryEntries[] = { // RootKey KeyName ValueName Data - {HKEY_CURRENT_USER, L"Software\\Classes\\CLSID\\" SZ_CLSID_KramTHUMBHANDLER, nullptr, SZ_KramTHUMBHANDLER}, - {HKEY_CURRENT_USER, L"Software\\Classes\\CLSID\\" SZ_CLSID_KramTHUMBHANDLER L"\\InProcServer32", nullptr, szModuleName}, - {HKEY_CURRENT_USER, L"Software\\Classes\\CLSID\\" SZ_CLSID_KramTHUMBHANDLER L"\\InProcServer32", L"ThreadingModel", L"Apartment"}, + {HKEY_CURRENT_USER, L"Software\\Classes\\CLSID\\" SZ_CLSID_KramTHUMBHANDLER, nullptr, SZ_KramTHUMBHANDLER}, + {HKEY_CURRENT_USER, L"Software\\Classes\\CLSID\\" SZ_CLSID_KramTHUMBHANDLER L"\\InProcServer32", nullptr, szModuleName}, + {HKEY_CURRENT_USER, L"Software\\Classes\\CLSID\\" SZ_CLSID_KramTHUMBHANDLER L"\\InProcServer32", L"ThreadingModel", L"Apartment"}, // libkram can decode any of these and create a thumbnail // The Vista GUID for the thumbnail handler Shell extension is E357FCCD-A995-4576-B01F-234630154E96. 
- {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx", L"PerceivedType", L"image"}, - {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, - {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx2", L"PerceivedType", L"image"}, - {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx2\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, - {HKEY_CURRENT_USER, L"Software\\Classes\\.dds", L"PerceivedType", L"image"}, - {HKEY_CURRENT_USER, L"Software\\Classes\\.dds\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, + {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx", L"PerceivedType", L"image"}, + {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, + {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx2", L"PerceivedType", L"image"}, + {HKEY_CURRENT_USER, L"Software\\Classes\\.ktx2\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, + {HKEY_CURRENT_USER, L"Software\\Classes\\.dds", L"PerceivedType", L"image"}, + {HKEY_CURRENT_USER, L"Software\\Classes\\.dds\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, //{HKEY_CURRENT_USER, L"Software\\Classes\\.png", L"PerceivedType", L"image"}, //{HKEY_CURRENT_USER, L"Software\\Classes\\.png\\ShellEx\\{e357fccd-a995-4576-b01f-234630154e96}", nullptr, SZ_CLSID_KramTHUMBHANDLER}, }; @@ -208,7 +225,8 @@ STDAPI DllRegisterServer() { } // Unregisters this COM server -STDAPI DllUnregisterServer() { +STDAPI DllUnregisterServer() +{ HRESULT hr = S_OK; const PCWSTR regKeys[] = { @@ -216,7 +234,7 @@ STDAPI DllUnregisterServer() { L"Software\\Classes\\.ktx", L"Software\\Classes\\.ktx2", L"Software\\Classes\\.dds", - // L"Software\\Classes\\.png", // only need this if Win png bg is bad + // L"Software\\Classes\\.png", // only need this if Win png bg is bad }; // Delete the registry entries diff --git a/kram-thumb-win/KramThumbProvider.cpp b/kram-thumb-win/KramThumbProvider.cpp index 30b31894..e49ccfd5 100644 --- a/kram-thumb-win/KramThumbProvider.cpp +++ b/kram-thumb-win/KramThumbProvider.cpp @@ -1,14 +1,15 @@ -#include "KramLib.h" - #include #include // For IThumbnailProvider. #include // For ComPtr -#include + #include +#include #include +#include "KramLib.h" + using namespace kram; -using namespace std; +using namespace std; // or STL_NAMESPACE template using ComPtr = Microsoft::WRL::ComPtr; @@ -36,10 +37,10 @@ inline void* KLOGF(uint32_t code, const char* format, ...) // Console prints this as , so what's the point of producing a localizedString ? 
// This doesn't seem to work to Console app, but maybe if logs are to terminal - // sudo log config --mode "level:debug" --subsystem com.ba.kramv + // sudo log config --mode "level:debug" --subsystem com.hialec.kramv //NSString* errorText = [NSString stringWithUTF8String:str.c_str()]; - // return [NSError errorWithDomain:@"com.ba.kramv" code:code userInfo:@{NSLocalizedDescriptionKey : errorText}]; + // return [NSError errorWithDomain:@"com.hialec.kramv" code:code userInfo:@{NSLocalizedDescriptionKey : errorText}]; return nullptr; } @@ -48,32 +49,37 @@ struct ImageToPass { KTXImageData imageData; }; -class KramThumbProvider final : public IInitializeWithStream, public IThumbnailProvider -{ +class KramThumbProvider final : public IInitializeWithStream, public IThumbnailProvider { public: KramThumbProvider() - : mReferences(1) - , mStream{} { + : mReferences(1), mStream{} + { } - virtual ~KramThumbProvider() { + virtual ~KramThumbProvider() + { } // IUnknown - IFACEMETHODIMP QueryInterface(REFIID riid, void** ppv) { + IFACEMETHODIMP QueryInterface(REFIID riid, void** ppv) + { static const QITAB qit[] = { QITABENT(KramThumbProvider, IInitializeWithStream), QITABENT(KramThumbProvider, IThumbnailProvider), - { 0 }, + {0}, }; return QISearch(this, qit, riid, ppv); } - IFACEMETHODIMP_(ULONG) AddRef() { + IFACEMETHODIMP_(ULONG) + AddRef() + { return ++mReferences; } - IFACEMETHODIMP_(ULONG) Release() { + IFACEMETHODIMP_(ULONG) + Release() + { long refs = --mReferences; if (!refs) { delete this; @@ -82,8 +88,9 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP } // IInitializeWithStream - IFACEMETHODIMP Initialize(IStream* pStream, DWORD /*grfMode*/) { - HRESULT hr = E_UNEXPECTED; // can only be inited once + IFACEMETHODIMP Initialize(IStream* pStream, DWORD /*grfMode*/) + { + HRESULT hr = E_UNEXPECTED; // can only be inited once if (!mStream) { // take a reference to the stream if we have not been inited yet hr = pStream->QueryInterface(mStream.ReleaseAndGetAddressOf()); @@ -92,8 +99,8 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP } // IThumbnailProvider - IFACEMETHODIMP GetThumbnail(UINT cx, HBITMAP* phbmp, WTS_ALPHATYPE* pdwAlpha) { - + IFACEMETHODIMP GetThumbnail(UINT cx, HBITMAP* phbmp, WTS_ALPHATYPE* pdwAlpha) + { // read from stream and create a thumbnail if (!ImageToHBITMAP(cx, phbmp)) { return E_OUTOFMEMORY; @@ -101,7 +108,7 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP // always 4 channels *pdwAlpha = WTSAT_ARGB; - + return S_OK; } @@ -122,11 +129,10 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP vector streamData; streamData.resize(streamSize); ULONG bytesRead = 0; - HRESULT hr = mStream->Read(streamData.data(), streamSize, &bytesRead); // can only read ULONG + HRESULT hr = mStream->Read(streamData.data(), streamSize, &bytesRead); // can only read ULONG if (FAILED(hr) || streamSize != bytesRead) return false; - // https://learn.microsoft.com/en-us/windows/win32/api/thumbcache/nf-thumbcache-ithumbnailprovider-getthumbnail std::shared_ptr imageToPass = std::make_shared(); @@ -143,14 +149,14 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP } // This will set decoder - auto textureType = MyMTLTextureType2D; // image.textureType + auto textureType = MyMTLTextureType2D; // image.textureType if (!validateFormatAndDecoder(textureType, image.pixelFormat, decoderType)) { KLOGF(3, "format decode only supports ktx and ktx2 
output"); return false; } - imageWidth = NAMESPACE_STL::max(1U, image.width); - imageHeight = NAMESPACE_STL::max(1U, image.height); + imageWidth = std::max(1U, image.width); + imageHeight = std::max(1U, image.height); } // This is retina factor @@ -162,7 +168,7 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP float width; float height; }; - NSSize contextSize = { (float)maxSize, (float)maxSize }; + NSSize contextSize = {(float)maxSize, (float)maxSize}; // compute w/h from aspect ratio of image float requestWidth, requestHeight; @@ -206,7 +212,7 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP //----------------- - uint32_t chunkNum = 0; // TODO: could embed chunk(s) to gen thumbnail from, cube/array? + uint32_t chunkNum = 0; // TODO: could embed chunk(s) to gen thumbnail from, cube/array? uint32_t numChunks = image.totalChunks(); vector mipData; @@ -294,19 +300,17 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP mipData.resize(mipSize); memcpy(mipData.data(), image2D.pixels().data(), mipSize); } - - //--------------------- - + // create a bitmap, and allocate memory for the pixels BITMAPINFO bmi = {}; bmi.bmiHeader.biSize = sizeof(bmi.bmiHeader); bmi.bmiHeader.biWidth = static_cast(w); - bmi.bmiHeader.biHeight = -static_cast(h); // -h to be top-down + bmi.bmiHeader.biHeight = -static_cast(h); // -h to be top-down bmi.bmiHeader.biPlanes = 1; bmi.bmiHeader.biBitCount = 32; - bmi.bmiHeader.biCompression = BI_RGB; // TODO: use BI_PNG to shrink thumbnails + bmi.bmiHeader.biCompression = BI_RGB; // TODO: use BI_PNG to shrink thumbnails Color* dstPixels = nullptr; HBITMAP hbmp = CreateDIBSection(nullptr, &bmi, DIB_RGB_COLORS, reinterpret_cast(&dstPixels), nullptr, 0); @@ -329,7 +333,7 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP // setting to 1 for premul is equivalent of blend to opaque black dstPixels[i].a = 255; - + if (!isPremul) { uint32_t alpha = srcPixels[i].a; if (alpha < 255) { @@ -345,11 +349,12 @@ class KramThumbProvider final : public IInitializeWithStream, public IThumbnailP } private: - std::atomic_long mReferences; - ComPtr mStream; // provided during initialization. + std::atomic_long mReferences; + ComPtr mStream; // provided during initialization. }; -HRESULT KramThumbProvider_CreateInstance(REFIID riid, void** ppv) { +HRESULT KramThumbProvider_CreateInstance(REFIID riid, void** ppv) +{ KramThumbProvider* provider = new (std::nothrow) KramThumbProvider(); HRESULT hr = provider ? S_OK : E_OUTOFMEMORY; if (SUCCEEDED(hr)) { @@ -357,4 +362,4 @@ HRESULT KramThumbProvider_CreateInstance(REFIID riid, void** ppv) { provider->Release(); } return hr; -} \ No newline at end of file +} diff --git a/kram-thumb-win/resource.h b/kram-thumb-win/resource.h index 4494a87d..1fb651e3 100644 --- a/kram-thumb-win/resource.h +++ b/kram-thumb-win/resource.h @@ -2,17 +2,17 @@ // Microsoft Visual C++ generated include file. 
// Used by Dll.rc // -#define VER_DEBUG 0 -#define VS_VERSION_INFO 1 -#define IDC_STATIC -1 +#define VER_DEBUG 0 +#define VS_VERSION_INFO 1 +#define IDC_STATIC -1 // Next default values for new objects -// +// #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS -#define _APS_NEXT_RESOURCE_VALUE 101 -#define _APS_NEXT_COMMAND_VALUE 40001 -#define _APS_NEXT_CONTROL_VALUE 1000 -#define _APS_NEXT_SYMED_VALUE 101 +#define _APS_NEXT_RESOURCE_VALUE 101 +#define _APS_NEXT_COMMAND_VALUE 40001 +#define _APS_NEXT_CONTROL_VALUE 1000 +#define _APS_NEXT_SYMED_VALUE 101 #endif #endif diff --git a/kram-thumb/KramThumbnailProvider.h b/kram-thumb/KramThumbnailProvider.h index 17498b80..3730f045 100644 --- a/kram-thumb/KramThumbnailProvider.h +++ b/kram-thumb/KramThumbnailProvider.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. diff --git a/kram-thumb/KramThumbnailProvider.mm b/kram-thumb/KramThumbnailProvider.mm index 25ca813a..6daf720d 100644 --- a/kram-thumb/KramThumbnailProvider.mm +++ b/kram-thumb/KramThumbnailProvider.mm @@ -1,77 +1,78 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #import "KramThumbnailProvider.h" -#include "KramLib.h" +#import // for vImage #import #import -#import // for vImage + +#include "KramLib.h" using namespace kram; -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; @implementation KramThumbnailProvider -inline NSError* KLOGF(uint32_t code, const char* format, ...) { +inline NSError* KLOGF(uint32_t code, const char* format, ...) +{ string str; - + va_list args; va_start(args, format); /* int32_t len = */ append_vsprintf(str, format, args); va_end(args); - + // log here, so it can see it in Console. But this never appears. // How are you supposed to debug failures? Resorted to passing a unique code into this call. // It wasn't originally supposed to generate an NSError //NSLog(@"%s", str.c_str()); - + // Console prints this as , so what's the point of producing a localizedString ? // This doesn't seem to work to Console app, but maybe if logs are to terminal // sudo log config --mode "level:debug" --subsystem com.hialec.kramv - + NSString* errorText = [NSString stringWithUTF8String:str.c_str()]; - return [NSError errorWithDomain:@"com.hialec.kramv" code:code userInfo:@{NSLocalizedDescriptionKey:errorText}]; + return [NSError errorWithDomain:@"com.hialec.kramv" code:code userInfo:@{NSLocalizedDescriptionKey : errorText}]; } -struct ImageToPass -{ +struct ImageToPass { KTXImage image; KTXImageData imageData; }; -- (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest *)request completionHandler:(void (^)(QLThumbnailReply * _Nullable, NSError * _Nullable))handler { +- (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest*)request completionHandler:(void (^)(QLThumbnailReply* _Nullable, NSError* _Nullable))handler +{ + // Draw the thumbnail into a context passed to your block, set up with Core Graphics's coordinate system. - // Draw the thumbnail into a context passed to your block, set up with Core Graphics's coordinate system. 
- const char* filename = [request.fileURL fileSystemRepresentation]; // DONE: could return NSError to caller if non-null NSError* error = nil; string errorText; - + // TODO: use first x-many bytes also to validate, open will do that if (!isSupportedFilename(filename)) { error = KLOGF(1, "kramv %s only supports ktx,ktx2,dds,png files\n", filename); handler(nil, error); return; } - + std::shared_ptr imageToPass = std::make_shared(); TexEncoder decoderType = kTexEncoderUnknown; uint32_t imageWidth, imageHeight; - + { KTXImage& image = imageToPass->image; KTXImageData& imageData = imageToPass->imageData; - + if (!imageData.open(filename, image)) { error = KLOGF(2, "kramv %s could not open file\n", filename); handler(nil, error); return; } - + // This will set decoder auto textureType = MyMTLTextureType2D; // image.textureType if (!validateFormatAndDecoder(textureType, image.pixelFormat, decoderType)) { @@ -79,210 +80,206 @@ - (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest *)request complet handler(nil, error); return; } - - imageWidth = NAMESPACE_STL::max(1U, image.width); - imageHeight = NAMESPACE_STL::max(1U, image.height); + + imageWidth = STL_NAMESPACE::max(1U, image.width); + imageHeight = STL_NAMESPACE::max(1U, image.height); } // This is retina factor float requestScale = request.scale; - + // One of the sides must match maximumSize, but can have // different aspect ratios below that on a given sides. NSSize contextSize = request.maximumSize; - + // compute w/h from aspect ratio of image float requestWidth, requestHeight; - + float imageAspect = imageWidth / (float)imageHeight; - if (imageAspect >= 1.0f) - { + if (imageAspect >= 1.0f) { requestWidth = contextSize.width; requestHeight = std::clamp((contextSize.width / imageAspect), 1.0, contextSize.height); } - else - { + else { requestWidth = std::clamp((contextSize.height * imageAspect), 1.0, contextSize.width); requestHeight = contextSize.height; } - + // will be further scaled by requestScale contextSize = CGSizeMake(requestWidth, requestHeight); - - handler([QLThumbnailReply replyWithContextSize:contextSize drawingBlock:^BOOL(CGContextRef _Nonnull context) - { - KTXImage& image = imageToPass->image; - - bool isPremul = image.isPremul(); - bool isSrgb = isSrgbFormat(image.pixelFormat); - - //----------------- - - // unpack a level to get the blocks - uint32_t mipNumber = 0; - uint32_t mipCount = image.mipCount(); - - uint32_t w, h, d; - for (uint32_t i = 0; i < mipCount; ++i) { - image.mipDimensions(i, w, h, d); - if (w > request.maximumSize.width || h > request.maximumSize.height) { - mipNumber++; - } - } - - // clamp to smallest - mipNumber = std::min(mipNumber, mipCount - 1); - image.mipDimensions(mipNumber, w, h, d); - - //----------------- - - uint32_t chunkNum = 0; // TODO: could embed chunk(s) to gen thumbnail from, cube/array? 
- uint32_t numChunks = image.totalChunks(); - - vector mipData; - - // now decode the blocks in that chunk to Color - if (isBlockFormat(image.pixelFormat)) { - - // then decode any blocks to rgba8u, not dealing with HDR formats yet - uint64_t mipLength = image.mipLevels[mipNumber].length; - - if (image.isSupercompressed()) { - const uint8_t* srcData = image.fileData + image.mipLevels[mipNumber].offset; - - mipData.resize(mipLength * numChunks); - uint8_t* dstData = mipData.data(); - if (!image.unpackLevel(mipNumber, srcData, dstData)) { - //KLOGF("kramv %s failed to unpack mip\n", filename); - return NO; - } - - // now extract the chunk for the thumbnail out of that level - if (numChunks > 1) { - macroUnusedVar(chunkNum); - assert(chunkNum == 0); - - // this just truncate to chunk 0 instead of copying chunkNum first - mipData.resize(mipLength); - } - } - else - { - // this just truncate to chunk 0 instead of copying chunkNum first - mipData.resize(mipLength); - - const uint8_t* srcData = image.fileData + image.mipLevels[mipNumber].offset; - - memcpy(mipData.data(), srcData, mipLength); - } - - KramDecoder decoder; - KramDecoderParams params; - params.decoder = decoderType; - - // TODO: should honor swizzle in the ktx image - // TODO: probaby need an snorm rgba format to convert the snorm versions, so they're not all red - // if sdf, will be signed format and that will stay red - - switch(image.pixelFormat) - { - // To avoid showing single channel content in red, replicate to rgb - case MyMTLPixelFormatBC4_RUnorm: - case MyMTLPixelFormatEAC_R11Unorm: - params.swizzleText = "rrr1"; - break; - - default: - break; - } - - vector dstMipData; - - // only space for one chunk for now - dstMipData.resize(h * w * sizeof(Color)); - - // want to just decode one chunk of the level that was unpacked abovve - if (!decoder.decodeBlocks(w, h, mipData.data(), (int32_t)mipData.size(), image.pixelFormat, dstMipData, params)) { - // Can't return NSError - //error = KLOGF("kramv %s failed to decode blocks\n", filename); - return NO; - } - - // copy over original encoded data - mipData = dstMipData; - } - else if (isExplicitFormat(image.pixelFormat)) { - // explicit formats like r/rg/rgb and 16f/32F need to be converted to rgba8 here - // this should currently clamp, but could do range tonemap, see Image::convertToFourChannel() - // but this needs to be slightly different. This will decompress mip again - - Image image2D; - if (!image2D.loadThumbnailFromKTX(image, mipNumber)) { - //KLOGF("kramv %s failed to convert image to 4 channels\n", filename); - return NO; - } - - // copy from Color back to uint8_t - uint32_t mipSize = h * w * sizeof(Color); - mipData.resize(mipSize); - memcpy(mipData.data(), image2D.pixels().data(), mipSize); - } - - // https://developer.apple.com/library/archive/documentation/GraphicsImaging/Conceptual/drawingwithquartz2d/dq_images/dq_images.html#//apple_ref/doc/uid/TP30001066-CH212-TPXREF101 - - uint32_t rowBytes = w * sizeof(Color); - - // use vimage in the Accelerate.framework - // https://developer.apple.com/library/archive/releasenotes/Performance/RN-vecLib/index.html#//apple_ref/doc/uid/TP40001049 - - vImage_Buffer buf = { mipData.data(), h, w, rowBytes }; - - // Declare the pixel format for the vImage_Buffer - vImage_CGImageFormat format = { - .bitsPerComponent = 8, - .bitsPerPixel = 32, - }; - - format.bitmapInfo = kCGBitmapByteOrderDefault | (CGBitmapInfo)(isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); - format.colorSpace = isSrgb ? 
CGColorSpaceCreateWithName(kCGColorSpaceSRGB) : CGColorSpaceCreateDeviceRGB(); - - // don't need to allocate, can reuse memory from mip - bool skipPixelCopy = true; - - vImage_Error err = 0; - CGImageRef cgImage = vImageCreateCGImageFromBuffer(&buf, &format, NULL, NULL, skipPixelCopy ? kvImageNoAllocate : kvImageNoFlags, &err); - if (err) { - // Can't return NSError - //error = KLOGF("kramv %s failed create cgimage\n", filename); - return NO; - } - - CGRect rect = CGRectMake(0, 0, - (uint32_t)roundf(contextSize.width * requestScale), - (uint32_t)roundf(contextSize.height * requestScale)); - - // Default is white, but that messes up all content that uses alpha - // and doesn't match the preview code or kramv background (or Preview). - CGContextSetFillColorWithColor(context, CGColorGetConstantColor(kCGColorBlack)); - CGContextFillRect(context, rect); - - // TODO: should this clear to NSColor clearColor ? - // don't want default white? - - // The image is scaled—disproportionately - - //CGContextSetBlendMode(context, kCGBlendModeCopy); - CGContextSetBlendMode(context, kCGBlendModeNormal); - - CGContextDrawImage(context, rect, cgImage); - - // This seems to cause plugin to fail - // Needed? - if (!skipPixelCopy) - CGImageRelease(cgImage); - - return YES; - }], nil); + + handler([QLThumbnailReply replyWithContextSize:contextSize + drawingBlock:^BOOL(CGContextRef _Nonnull context) { + KTXImage& image = imageToPass->image; + + bool isPremul = image.isPremul(); + bool isSrgb = isSrgbFormat(image.pixelFormat); + + //----------------- + + // unpack a level to get the blocks + uint32_t mipNumber = 0; + uint32_t mipCount = image.mipCount(); + + uint32_t w, h, d; + for (uint32_t i = 0; i < mipCount; ++i) { + image.mipDimensions(i, w, h, d); + if (w > request.maximumSize.width || h > request.maximumSize.height) { + mipNumber++; + } + } + + // clamp to smallest + mipNumber = std::min(mipNumber, mipCount - 1); + image.mipDimensions(mipNumber, w, h, d); + + //----------------- + + uint32_t chunkNum = 0; // TODO: could embed chunk(s) to gen thumbnail from, cube/array? 
+ uint32_t numChunks = image.totalChunks(); + + vector mipData; + + // now decode the blocks in that chunk to Color + if (isBlockFormat(image.pixelFormat)) { + // then decode any blocks to rgba8u, not dealing with HDR formats yet + uint64_t mipLength = image.mipLevels[mipNumber].length; + + if (image.isSupercompressed()) { + const uint8_t* srcData = image.fileData + image.mipLevels[mipNumber].offset; + + mipData.resize(mipLength * numChunks); + uint8_t* dstData = mipData.data(); + if (!image.unpackLevel(mipNumber, srcData, dstData)) { + //KLOGF("kramv %s failed to unpack mip\n", filename); + return NO; + } + + // now extract the chunk for the thumbnail out of that level + if (numChunks > 1) { + macroUnusedVar(chunkNum); + assert(chunkNum == 0); + + // this just truncate to chunk 0 instead of copying chunkNum first + mipData.resize(mipLength); + } + } + else { + // this just truncate to chunk 0 instead of copying chunkNum first + mipData.resize(mipLength); + + const uint8_t* srcData = image.fileData + image.mipLevels[mipNumber].offset; + + memcpy(mipData.data(), srcData, mipLength); + } + + KramDecoder decoder; + KramDecoderParams params; + params.decoder = decoderType; + + // TODO: should honor swizzle in the ktx image + // TODO: probaby need an snorm rgba format to convert the snorm versions, so they're not all red + // if sdf, will be signed format and that will stay red + + switch (image.pixelFormat) { + // To avoid showing single channel content in red, replicate to rgb + case MyMTLPixelFormatBC4_RUnorm: + case MyMTLPixelFormatEAC_R11Unorm: + params.swizzleText = "rrr1"; + break; + + default: + break; + } + + vector dstMipData; + + // only space for one chunk for now + dstMipData.resize(h * w * sizeof(Color)); + + // want to just decode one chunk of the level that was unpacked abovve + if (!decoder.decodeBlocks(w, h, mipData.data(), (int32_t)mipData.size(), image.pixelFormat, dstMipData, params)) { + // Can't return NSError + //error = KLOGF("kramv %s failed to decode blocks\n", filename); + return NO; + } + + // copy over original encoded data + mipData = dstMipData; + } + else if (isExplicitFormat(image.pixelFormat)) { + // explicit formats like r/rg/rgb and 16f/32F need to be converted to rgba8 here + // this should currently clamp, but could do range tonemap, see Image::convertToFourChannel() + // but this needs to be slightly different. This will decompress mip again + + Image image2D; + if (!image2D.loadThumbnailFromKTX(image, mipNumber)) { + //KLOGF("kramv %s failed to convert image to 4 channels\n", filename); + return NO; + } + + // copy from Color back to uint8_t + uint32_t mipSize = h * w * sizeof(Color); + mipData.resize(mipSize); + memcpy(mipData.data(), image2D.pixels().data(), mipSize); + } + + // https://developer.apple.com/library/archive/documentation/GraphicsImaging/Conceptual/drawingwithquartz2d/dq_images/dq_images.html#//apple_ref/doc/uid/TP30001066-CH212-TPXREF101 + + uint32_t rowBytes = w * sizeof(Color); + + // use vimage in the Accelerate.framework + // https://developer.apple.com/library/archive/releasenotes/Performance/RN-vecLib/index.html#//apple_ref/doc/uid/TP40001049 + + vImage_Buffer buf = {mipData.data(), h, w, rowBytes}; + + // Declare the pixel format for the vImage_Buffer + vImage_CGImageFormat format = { + .bitsPerComponent = 8, + .bitsPerPixel = 32, + }; + + format.bitmapInfo = kCGBitmapByteOrderDefault | (CGBitmapInfo)(isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); + format.colorSpace = isSrgb ? 
CGColorSpaceCreateWithName(kCGColorSpaceSRGB) : CGColorSpaceCreateDeviceRGB(); + + // don't need to allocate, can reuse memory from mip + bool skipPixelCopy = true; + + vImage_Error err = 0; + CGImageRef cgImage = vImageCreateCGImageFromBuffer(&buf, &format, NULL, NULL, skipPixelCopy ? kvImageNoAllocate : kvImageNoFlags, &err); + if (err) { + // Can't return NSError + //error = KLOGF("kramv %s failed create cgimage\n", filename); + return NO; + } + + CGRect rect = CGRectMake(0, 0, + (uint32_t)roundf(contextSize.width * requestScale), + (uint32_t)roundf(contextSize.height * requestScale)); + + // Default is white, but that messes up all content that uses alpha + // and doesn't match the preview code or kramv background (or Preview). + CGContextSetFillColorWithColor(context, CGColorGetConstantColor(kCGColorBlack)); + CGContextFillRect(context, rect); + + // TODO: should this clear to NSColor clearColor ? + // don't want default white? + + // The image is scaled—disproportionately + + //CGContextSetBlendMode(context, kCGBlendModeCopy); + CGContextSetBlendMode(context, kCGBlendModeNormal); + + CGContextDrawImage(context, rect, cgImage); + + // This seems to cause plugin to fail + // Needed? + if (!skipPixelCopy) + CGImageRelease(cgImage); + + return YES; + }], + nil); } @end diff --git a/kram-thumb/kram_thumb.entitlements b/kram-thumb/kram_thumb.entitlements index f2ef3ae0..18aff0ce 100644 --- a/kram-thumb/kram_thumb.entitlements +++ b/kram-thumb/kram_thumb.entitlements @@ -2,9 +2,9 @@ - com.apple.security.app-sandbox - - com.apple.security.files.user-selected.read-only - + com.apple.security.app-sandbox + + com.apple.security.files.user-selected.read-only + diff --git a/kramc/CMakeLists.txt b/kramc/CMakeLists.txt index a592301f..bfb1c9e2 100644 --- a/kramc/CMakeLists.txt +++ b/kramc/CMakeLists.txt @@ -1,57 +1,41 @@ -#cmake_minimum_required(VERSION 3.19.1 FATAL_ERROR) - -#----------------------------------------------------- - -set(BUILD_MAC FALSE) -set(BUILD_WIN FALSE) - -if (APPLE) - message("build for macOS") - set(BUILD_MAC TRUE) -elseif (WIN32) - message("build for win x64") - set(BUILD_WIN TRUE) -endif() - #----------------------------------------------------- # kram # now setup the app project, and link to libkram set(myTargetApp kram) - -# not using version in cmake anymore, this is pulled in by KramVersion.h -if (BUILD_MAC) - project(${myTargetApp} LANGUAGES C CXX OBJCXX) -elseif (BUILD_WIN) - project(${myTargetApp} LANGUAGES C CXX) -endif() - -# **** this the executable target ****, for a CLI App +project(${myTargetApp}) add_executable(${myTargetApp}) #----------------------------------------------------- +if (BUILD_LINUX) + # Enable all warnings, and also enable f16c sims op (only x64 though) + target_compile_options(${myTargetApp} PRIVATE -W -Wall + -mavx2 -mfma -mf16c + -fno-exceptions -fno-rtti + -fdata-sections -ffunction-sections + ) + + # librt is for clock_gettime + target_link_libraries(${myTargetApp} PUBLIC + libkram + rt) + +endif() + if (BUILD_MAC) # ate is a macOS/iOS only library, and it varies in encode support by OS revision - target_link_libraries(${myTargetApp} + target_link_libraries(${myTargetApp} PUBLIC ate - libkram) + libkram + ) set_target_properties(${myTargetApp} PROPERTIES - # Note: match this up with CXX version - # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" - XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" - - # avx1 - XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx" + #XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" + 
#XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" - # turn off exceptions/rtti - XCODE_ATTRIBUTE_GCC_ENABLE_CPP_EXCEPTIONS NO - XCODE_ATTRIBUTE_GCC_ENABLE_CPP_RTTI NO - # can't believe this isn't on by default in CMAKE - XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES + #XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES #------------------------- @@ -59,23 +43,46 @@ if (BUILD_MAC) XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT "dwarf-with-dsym" XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO" + #------------------------- + # app specific settings + # this drops app from 762KB to 174KB with only ATE enabled # note about needing -gfull instead of -gused here or debug info messed up: # https://gist.github.com/tkersey/39b4fe69e14b859889ffadccb009e397 - XCODE_ATTRIBUTE_DEAD_CODE_STRIPPING YES - XCODE_ATTRIBUTE_LLVM_LTO[variant=Release] "Incremental" + #XCODE_ATTRIBUTE_DEAD_CODE_STRIPPING YES + + # This is LTO + #XCODE_ATTRIBUTE_LLVM_LTO[variant=Release] "Incremental" #------------------------- # for now disable signing, and just "sign to run locally" - XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "com.ba.kram" + XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "com.hialec.kram" XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO" XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "" ) - target_compile_options(${myTargetApp} PRIVATE -W -Wall) + target_compile_options(${myTargetApp} PRIVATE -W -Wall + -mavx2 -mfma -mf16c + -fno-exceptions -fno-rtti + -fdata-sections -ffunction-sections + ) + + # only turn on in Release in case this disables incremental linking + if (CMAKE_BUILD_TYPE EQUAL "Release") + add_link_options(${myTargetApp}, "-dead_strip") + endif() +endif() + +#----------------------------------------------------- -elseif (BUILD_WIN) - target_link_libraries(${myTargetApp} libkram) +if (BUILD_WIN) + + # Use the static linked libs, or the exe needs the VCRuntimeDLL installed + set_property(TARGET ${myTargetApp} PROPERTY + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + ) + + target_link_libraries(${myTargetApp} PUBLIC libkram) # When Win rebuilds library, it doesn't relink app to correct code when you # build the app target project. 
Breakpoints stop working after any library source edit, @@ -85,26 +92,21 @@ elseif (BUILD_WIN) SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}) - # TODO: switch to add_target_definitions - - # to turn off exceptions/rtti use /GR and /EHsc replacement - string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REGEX REPLACE "/EHsc" "/EHs-c-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - # don't need force with apps, since they only access kram folder files which include KramConfig - # all warnings, AVX1, and multiprocess compiles - target_compile_options(${myTargetApp} PRIVATE /W3 /arch:AVX -f16c /MP /GF /FC) + # all warnings, AVX2, and multiprocess compiles + target_compile_options(${myTargetApp} PRIVATE /W3 + -march=haswell -mf16c -mfma + /GR- /EHs-c- + /MP /GF /FC + ) # fix STL target_compile_definitions(${myTargetApp} PRIVATE -D_ITERATOR_DEBUG_LEVEL=0 -D_HAS_EXCEPTIONS=0) - - if (CMAKE_BUILD_TYPE EQUAL "Debug") - target_compile_definitions(${myTargetLib} PRIVATE "/INCREMENTAL") - - elseif (CMAKE_BUILD_TYPE EQUAL "Release") - # only dead strip on Release builds since this disables Incremental linking, may want Profile build that doesn't use this - target_compile_definitions(${myTargetLib} PRIVATE "/OPT:REF") + + if (CMAKE_BUILD_TYPE EQUAL "Release") + # This disables Incremental linking, so only set on Release + add_link_options(${myTargetApp} PRIVATE "/OPT:REF") # other possibliities # /GL - whole program optimization diff --git a/kramc/KramMain.cpp b/kramc/KramMain.cpp index e449bf16..2bf1f999 100644 --- a/kramc/KramMain.cpp +++ b/kramc/KramMain.cpp @@ -1,13 +1,366 @@ #include "KramLib.h" +#if KRAM_MAC +#include +#endif + +#if KRAM_WIN +#include +#endif + +using namespace STL_NAMESPACE; + +// These aren't avx2 specific, but just don't want unused func warning +#if SIMD_AVX2 +#if KRAM_MAC + +inline const char* getMacOSVersion() { + static char str[256] = {}; + if (str[0] == 0) { + size_t size = sizeof(str); + if (sysctlbyname("kern.osproductversion", str, &size, NULL, 0) == 0) { + return str; + } + } + return str; +} + +inline bool isRunningUnderRosetta() { + int ret = 0; + size_t size = sizeof(ret); + if (sysctlbyname("sysctl.proc_translated", &ret, &size, NULL, 0) == -1) { + if (errno == ENOENT) { + // sysctl doesn't exist - not running under Rosetta + return false; + } + // Other error occurred + return false; + } + return ret > 0; +} + + +inline uint32_t getMacOSMajorVersion() { + // 15.4 + static uint32_t majorOSVersion = 0; + if (majorOSVersion == 0) { + sscanf(getMacOSVersion(), "%u", &majorOSVersion); + } + return majorOSVersion; +} + +#endif +#endif + +// TODO: move this into vectormath +void checkSimdSupport() +{ + // Check for AVX2, FMA, F16C support on Intel. + // Still need to set compile flags, and arm64 emulators are also spotty. + // arm64 native has everything needed. No holes to check, or legacy simd. + +#if SIMD_AVX2 +#if KRAM_MAC + // Apple added AVX2 and F16C? support to Rosetta in macOS 15 with no way + // to detect it. Really awesome, so skip the test. There are + // no supporting Intel hw devices on macOS 15 that don't have AVX2. 
+ // const char* macOSVersion = getMacOSVersion(); + // KLOGI("kram", "%s", macOSVersion); + uint32_t majorOSVersion = getMacOSMajorVersion(); + if (majorOSVersion >= 15) { + return; + } + + bool hasSimdSupport = true; + + vector cpuName; + size_t cpuNameSize = 0; + + const char* cpuNameProp = "machdep.cpu.brand_string"; + + if (sysctlbyname(cpuNameProp, nullptr, &cpuNameSize, nullptr, 0) >= 0) { + cpuName.resize(cpuNameSize); + + // Assuming this is ascii + sysctlbyname(cpuNameProp, cpuName.data(), &cpuNameSize, nullptr, 0); + } + + + // can also check AVX1.0 + // F16C (avx/avx2 imply F16C and assume Rosetta too) + + // https://csharpmulticore.blogspot.com/2014/12/how-to-check-intel-avx2-support-on-mac-os-x-haswell.html + // machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM PBE SSE3 PCLMULQDQ DTES64 MON DSCPL VMX EST TM2 SSSE3 FMA CX16 TPR PDCM SSE4.1 SSE4.2 x2APIC MOVBE POPCNT AES PCID XSAVE OSXSAVE SEGLIM64 TSCTMR AVX1.0 RDRAND F16C + // machdep.cpu.leaf7_features: SMEP ERMS RDWRFSGS TSC_THREAD_OFFSET BMI1 AVX2 BMI2 INVPCID + const char* missingFeatures[4] = { "", "", "", "" }; + uint32_t missingFeaturesCount = 0; + + const char* leaf7Features = "machdep.cpu.leaf7_features"; + + size_t leaf7FeatureSize = 0; + sysctlbyname(leaf7Features, nullptr, &leaf7FeatureSize, nullptr, 0); + + vector bufferLeaf7; + + if (leaf7FeatureSize == 0) { + hasSimdSupport = false; + } + else { + bufferLeaf7.resize(leaf7FeatureSize); + + // TODO: check failure + sysctlbyname(leaf7Features, bufferLeaf7.data(), &leaf7FeatureSize, nullptr, 0); + } + + const char* cpuFeatures = "machdep.cpu.features"; + + size_t cpuFeatureSize = 0; + sysctlbyname(cpuFeatures, nullptr, &cpuFeatureSize, nullptr, 0); + + vector bufferFeatures; + + if (!hasSimdSupport || cpuFeatureSize == 0) { + hasSimdSupport = false; + } + else { + bufferFeatures.resize(cpuFeatureSize); + + // TODO: check failure + sysctlbyname(cpuFeatures, bufferFeatures.data(), &cpuFeatureSize, nullptr, 0); + } + + const char* features = !bufferFeatures.empty() ? bufferFeatures.data() : ""; + const char* features7 = !bufferLeaf7.empty() ? bufferLeaf7.data() : ""; + + // If don't find avx2, then support is not present. + // could be running under Rosetta2 but it's supposed to add AVX2 soon. + bool hasAVX2 = strstr(features7, "AVX2") != nullptr; + + if (!hasAVX2) { + missingFeatures[missingFeaturesCount++] = "AVX2 "; + hasSimdSupport = false; + } + + // Make sure compile has enabled these on AVX2. + // Rosetta2 and Prism often don't emulate these. + // (f.e. BMI and F16C) + + bool hasAVX = strstr(features, "AVX") != nullptr; + bool hasF16C = strstr(features, "F16C") != nullptr; + bool hasFMA = strstr(features, "FMA") != nullptr; + + if (!hasAVX) { + missingFeatures[missingFeaturesCount++] = "AVX "; + hasSimdSupport = false; + } + if (!hasF16C) { + missingFeatures[missingFeaturesCount++] = "F16C "; + hasSimdSupport = false; + } + if (!hasFMA) { + missingFeatures[missingFeaturesCount++] = "FMA "; + hasSimdSupport = false; + } + + if (!hasSimdSupport) { + bool isEmulated = isRunningUnderRosetta() && (majorOSVersion < 15); + const char* emulatedHint = isEmulated ? " install macOS 15.0+" : ""; + + KLOGE("Main", "Missing simd support for %s%s%s%son %s%s", + missingFeatures[0], missingFeatures[1], missingFeatures[2], missingFeatures[3], + cpuName.data(), emulatedHint); + exit(1); + } + +#elif KRAM_WIN + bool hasSimdSupport = true; + + // Also handles Win for ARM (f.e. 
Prism is SSE4 -> AVX2 soon). + // See here for more bits (f.e. AVX512) + // https://learn.microsoft.com/en-us/cpp/intrinsics/cpuid-cpuidex?view=msvc-170 + + // f1.ecx bit 0 is sse3 + // f1.ecx bit 12 is fma + // f1.ecx bit 19 is sse4.1 + // f1.ecx bit 20 is sse4.2 + // f1.ecx bit 28 is avx + // f1.ecx bit 29 is f16c (docs are wrong about this being avx2) + + // f7.ebx bit 5 is avx2 + // f7.ebx bit 16 is avx-512f + // f7.ebx bit 26 is avx-512pf + // f7.ebx bit 27 is avx-512er + // f7.ebx bit 28 is avx-512cd + + // This returns a count of the ids from mthe docs. + struct CpuInfo { + int eax, ebx, ecx, edx; + }; + + // numIds in 0 + // vendorId (12 char string) returned in 1,3,2 + // can tell intel from amd off vendorId + CpuInfo cpuInfo = {}; + __cpuid((int*)&cpuInfo, 0); + + // This is GenuineIntel or AuthenticAMD + char vendorId[12+1] = {}; + *reinterpret_cast(vendorId + 0) = cpuInfo.ebx; + *reinterpret_cast(vendorId + 4) = cpuInfo.edx; + *reinterpret_cast(vendorId + 8) = cpuInfo.ecx; + + const char* missingFeatures[4] = { "", "", "", "" }; + uint32_t missingFeaturesCount = 0; + + int numIds = cpuInfo.eax; + if (numIds < 7) { + hasSimdSupport = false; + } + else { + // +1 since 0 is the count and vendorId + vector cpuInfoByIndex; + cpuInfoByIndex.resize(numIds+1); + + // This has sse4, avx, f16c + __cpuidex((int*)&cpuInfo, 1, 0); + cpuInfoByIndex[1] = cpuInfo; + + // This has AVX2, avx512 + __cpuidex((int*)&cpuInfo, 7, 0); + cpuInfoByIndex[7] = cpuInfo; + + bool hasAVX2 = cpuInfoByIndex[7].ebx & (1 << 5); + + bool hasFMA = cpuInfoByIndex[1].ecx & (1 << 12); + bool hasAVX = cpuInfoByIndex[1].ecx & (1 << 28); + bool hasF16C = cpuInfoByIndex[1].ecx & (1 << 29); + + if (!hasAVX2) { + missingFeatures[missingFeaturesCount++] = "AVX2 "; + hasSimdSupport = false; + } + if (!hasAVX) { + missingFeatures[missingFeaturesCount++] = "AVX "; + hasSimdSupport = false; + } + if (!hasFMA) { + missingFeatures[missingFeaturesCount++] = "FMA "; + hasSimdSupport = false; + } + if (!hasF16C) { + missingFeatures[missingFeaturesCount++] = "F16C "; + hasSimdSupport = false; + } + } + + // extended cpuid attributes + int extBase = 0x80000000; + __cpuid((int*)&cpuInfo, extBase); + numIds = cpuInfo.eax - extBase; + + char brandId[48+1] = {}; + + if (numIds >= 4) + { + vector cpuInfoByIndex; + cpuInfoByIndex.resize(numIds+1); + + // f81 + __cpuidex((int*)&cpuInfo, extBase+1, 0); + cpuInfoByIndex[1] = cpuInfo; + + // brand + __cpuidex((int*)&cpuInfo, extBase+2, 0); + cpuInfoByIndex[2] = cpuInfo; + __cpuidex((int*)&cpuInfo, extBase+3, 0); + cpuInfoByIndex[3] = cpuInfo; + __cpuidex((int*)&cpuInfo, extBase+4, 0); + cpuInfoByIndex[4] = cpuInfo; + + memcpy(brandId + 0, &cpuInfoByIndex[2], sizeof(CpuInfo)); + memcpy(brandId + 16, &cpuInfoByIndex[3], sizeof(CpuInfo)); + memcpy(brandId + 32, &cpuInfoByIndex[4], sizeof(CpuInfo)); + } + + if (!hasSimdSupport) { + KLOGE("Main", "Missing simd support for %s%s%s%son %s", + missingFeatures[0], missingFeatures[1], missingFeatures[2], missingFeatures[3], + brandId); + exit(1); + } + +#elif KRAM_LINUX // || KRAM_MAC + + // This should apply to all clang and gcc builds. So may want + // to use on all platforms. 
+ + // Common CPU features that can be checked with __builtin_cpu_supports include: + // sse, sse2, sse3, ssse3, sse4.1, sse4.2 + // avx, avx2, avx512f + // fma + // bmi, bmi2 + // popcnt + // lzcnt + // mmx + + + bool hasSimdSupport = true; + + bool hasAVX2 = __builtin_cpu_supports("avx2"); + + bool hasFMA = __builtin_cpu_supports("fma"); + bool hasAVX = __builtin_cpu_supports("avx"); + + // macOS doesn't support f16c as string? + #if KRAM_MAC + bool hasF16C = true; // a lie + #else + bool hasF16C = __builtin_cpu_supports("f16c"); + #endif + + const char* missingFeatures[4] = { "", "", "", "" }; + uint32_t missingFeaturesCount = 0; + + if (!hasAVX2) { + missingFeatures[missingFeaturesCount++] = "AVX2 "; + hasSimdSupport = false; + } + if (!hasAVX) { + missingFeatures[missingFeaturesCount++] = "AVX "; + hasSimdSupport = false; + } + if (!hasFMA) { + missingFeatures[missingFeaturesCount++] = "FMA "; + hasSimdSupport = false; + } + if (!hasF16C) { + missingFeatures[missingFeaturesCount++] = "F16C "; + hasSimdSupport = false; + } + + if (!hasSimdSupport) { + KLOGE("Main", "Missing simd support for %s%s%s%s", + missingFeatures[0], missingFeatures[1], missingFeatures[2], missingFeatures[3]); + exit(1); + } + +#endif +#endif +} + int main(int argc, char* argv[]) { - int errorCode = kram::kramAppMain(argc, argv); + // This will exit if insufficient simd support on x64. + // arm64+neon has full support of all operations. + checkSimdSupport(); + // verify that machine has simd support to run + int errorCode = kram::kramAppMain(argc, argv); + // returning -1 from main results in exit code of 255, so fix this to return 1 on failure. if (errorCode != 0) { exit(1); } - + return 0; } diff --git a/kramv/CMakeLists.txt b/kramv/CMakeLists.txt index 97d844bf..1e76eab6 100644 --- a/kramv/CMakeLists.txt +++ b/kramv/CMakeLists.txt @@ -39,11 +39,11 @@ target_link_libraries(${myTargetApp} set_target_properties(${myTargetApp} PROPERTIES # Note: match this up with CXX version # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++14" + XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" # avx1 - XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx" + XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx2" # turn off exceptions/rtti XCODE_ATTRIBUTE_GCC_ENABLE_CPP_EXCEPTIONS NO diff --git a/kramv/KramLoader.h b/kramv/KramLoader.h index 3e61ccfb..3b56d4eb 100644 --- a/kramv/KramLoader.h +++ b/kramv/KramLoader.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -16,7 +16,7 @@ namespace kram { class KTXImage; class KTXImageData; -} +} //namespace kram // This loads KTX/2 and PNG data. Moving towards KTX/2 files only, with a PNG // to KTX/2 conversion. 
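Circling back to the checkSimdSupport() addition in KramMain.cpp above: the long comment block there lists the cpuid bits and the __builtin_cpu_supports feature strings the check relies on. As a reference, the same runtime gate can be condensed into a small self-contained form. This is only an illustrative sketch of the clang/gcc builtin approach used in the Linux branch of that patch, not the repo's code; the feature strings are the standard x86 ones, and "f16c" is left out because, as the patch itself notes, not every toolchain accepts it as a string.

#include <cstdio>

static bool hasRequiredSimd()
{
#if defined(__x86_64__) || defined(__i386__)
    // Populate the CPU feature table before querying it (harmless if already done).
    __builtin_cpu_init();
    // These strings are the standard clang/gcc names for the x86 features
    // the avx2 build of kram assumes.
    return __builtin_cpu_supports("avx2") &&
           __builtin_cpu_supports("avx") &&
           __builtin_cpu_supports("fma");
#else
    // arm64/neon builds already have everything needed, so no gate there.
    return true;
#endif
}

int main(int argc, char* argv[])
{
    (void)argc; (void)argv;
    if (!hasRequiredSimd()) {
        fprintf(stderr, "missing AVX2/AVX/FMA support on this cpu\n");
        return 1; // return 1 rather than -1, matching the exit-code fix in the patch
    }
    // ... the real app would call kram::kramAppMain(argc, argv) here ...
    return 0;
}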
@@ -38,7 +38,7 @@ class KTXImageData; - (nullable id)loadTextureFromImage:(const kram::KTXImage &)image originalFormat: (nullable MTLPixelFormat *)originalFormat - name:(nonnull const char*)name; + name:(nonnull const char *)name; // load into KTXImage and KTXImageData, can use with loadTextureFromImage - (BOOL)loadImageFromURL:(nonnull NSURL *)url @@ -69,8 +69,8 @@ class KTXImageData; //#include namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // provide access to lowercase strings string toLower(const string &text); -} +} //namespace kram diff --git a/kramv/KramLoader.mm b/kramv/KramLoader.mm index 8e227337..b9b33426 100644 --- a/kramv/KramLoader.mm +++ b/kramv/KramLoader.mm @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -11,8 +11,8 @@ #include "KramLib.h" using namespace kram; -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; using mymutex = std::recursive_mutex; using mylock = std::unique_lock; @@ -45,12 +45,12 @@ @implementation KramLoader { // only one of these for now id _buffer; - uint8_t* _data; + uint8_t *_data; uint32_t _bufferOffset; vector _blits; - NSMutableArray>* _blitTextures; - NSMutableArray>* _mipgenTextures; + NSMutableArray> *_blitTextures; + NSMutableArray> *_mipgenTextures; } - (instancetype)init @@ -72,15 +72,13 @@ - (instancetype)init originalFormat:originalFormat]; } - - // this means format isnt supported on platform, but can be decoded to rgba to // display bool isDecodeImageNeeded(MyMTLPixelFormat pixelFormat, MyMTLTextureType type) { bool needsDecode = false; -#if USE_SSE +#if SIMD_SSE if (isETCFormat(pixelFormat)) { needsDecode = true; } @@ -99,7 +97,9 @@ bool decodeImage(const KTXImage &image, KTXImage &imageDecoded) { KramDecoderParams decoderParams; KramDecoder decoder; -#if USE_SSE + + // macOS Intel only had BC support, and already have macOS arm64 build +#if SIMD_SSE if (isETCFormat(image.pixelFormat)) { if (!decoder.decode(image, imageDecoded, decoderParams)) { return NO; @@ -118,7 +118,7 @@ bool decodeImage(const KTXImage &image, KTXImage &imageDecoded) } #endif else { - KASSERT(false); // don't call this routine if decode not needed + KASSERT(false); // don't call this routine if decode not needed } // TODO: decode BC format on iOS when not supported, but viewer only on macOS @@ -178,27 +178,23 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) KTXImage image; if (imageDataLength > 3 && - imageData[0] == 0xff && imageData[1] == 0xd8 && imageData[2] == 0xff ) - { + imageData[0] == 0xff && imageData[1] == 0xd8 && imageData[2] == 0xff) { KLOGE("kramv", "loader does not support jpg files"); return nil; } - + // if png, then need to load from KTXImageData which uses loadpng // \x89, P, N, G if (imageDataLength > 4 && - imageData[0] == 137 && imageData[1] == 'P' && imageData[2] == 'N' && imageData[3] == 'G') - { + imageData[0] == 137 && imageData[1] == 'P' && imageData[2] == 'N' && imageData[3] == 'G') { KTXImageData imageDataReader; if (!imageDataReader.open(imageData, imageDataLength, image)) { return nil; } - + return [self loadTextureFromImage:image originalFormat:originalFormat name:""]; } - else - { - + else { // isInfoOnly = true keeps compressed mips on KTX2 and aliases original mip // data but have decode etc2/astc path 
below that uncompressed mips and the // rgb conversion path below as well in the viewer. games would want to @@ -217,7 +213,7 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) - (nullable id)loadTextureFromImage:(const KTXImage &)image originalFormat: (nullable MTLPixelFormat *)originalFormat - name:(const char*)name + name:(const char *)name { #if SUPPORT_RGB if (isInternalRGBFormat(image.pixelFormat)) { @@ -234,7 +230,7 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) dstImageInfoArgs.textureType = image.textureType; dstImageInfoArgs.pixelFormat = remapInternalRGBFormat(image.pixelFormat); dstImageInfoArgs.doMipmaps = - image.mipCount() > 1; // ignore 0 + image.mipCount() > 1; // ignore 0 dstImageInfoArgs.textureEncoder = kTexEncoderExplicit; // set chunk count, so it's explicit @@ -255,7 +251,7 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) if (originalFormat != nullptr) { *originalFormat = (MTLPixelFormat)rbgaImage2 - .pixelFormat; // TODO: should this return rgbaImage.pixelFormat ? + .pixelFormat; // TODO: should this return rgbaImage.pixelFormat ? } return [self blitTextureFromImage:rbgaImage2 name:name]; @@ -274,18 +270,17 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) return [self blitTextureFromImage:imageDecoded name:name]; } - else - { + else { // fast load path directly from mmap'ed data, decompress direct to staging return [self blitTextureFromImage:image name:name]; } } - (BOOL)loadImageFromURL:(nonnull NSURL *)url - image:(KTXImage&)image - imageData:(KTXImageData&)imageData + image:(KTXImage &)image + imageData:(KTXImageData &)imageData { - const char* path = url.absoluteURL.path.UTF8String; + const char *path = url.absoluteURL.path.UTF8String; if (!imageData.open(path, image)) { return NO; } @@ -293,9 +288,9 @@ - (BOOL)loadImageFromURL:(nonnull NSURL *)url return YES; } -- (nullable id)loadTextureFromURL:(nonnull NSURL*)url +- (nullable id)loadTextureFromURL:(nonnull NSURL *)url originalFormat: - (nullable MTLPixelFormat*)originalFormat + (nullable MTLPixelFormat *)originalFormat { KTXImage image; KTXImageData imageData; @@ -310,7 +305,7 @@ - (BOOL)loadImageFromURL:(nonnull NSURL *)url - (nullable id)createTexture:(const KTXImage &)image isPrivate:(bool)isPrivate { - MTLTextureDescriptor* textureDescriptor = [[MTLTextureDescriptor alloc] init]; + MTLTextureDescriptor *textureDescriptor = [[MTLTextureDescriptor alloc] init]; // Indicate that each pixel has a blue, green, red, and alpha channel, where // each channel is an 8-bit unsigned normalized value (i.e. 0 maps to 0.0 and @@ -328,10 +323,10 @@ - (BOOL)loadImageFromURL:(nonnull NSURL *)url // This is inefficient to set, but needed for viewwer. // Only set if texture type is toggleable. // only need this if changing components, type, etc. -// { -// textureDescriptor.usage |= MTLTextureUsagePixelFormatView; -// } - + // { + // textureDescriptor.usage |= MTLTextureUsagePixelFormatView; + // } + // ignoring 0 (auto mip), but might need to support for explicit formats // must have hw filtering support for format, and 32f filtering only first // appeared on A14/M1 and only get box filtering in API-level filters. 
But @@ -362,7 +357,7 @@ - (void)createStagingBufffer:(uint64_t)dataSize // enough to upload 4k x 4k @ 4 bytes no mips, careful with array and cube // that get too big - // allocate system memory for bufffer, can memcopy to this + // allocate system memory for buffer, can memcpy to this posix_memalign((void **)&_data, getpagesize(), dataSize); // allocate memory for circular staging buffer, only need to memcpy to this @@ -383,13 +378,13 @@ - (void)uploadTexturesIfNeeded:(id)blitEncoder commandBuffer:(id)commandBuffer { mylock lock(gTextureLock); - + if (!_blits.empty()) { // now upload from staging MTLBuffer to private MTLTexture - for (const auto& blit : _blits) { + for (const auto &blit : _blits) { MTLRegion region = { - {0, 0, 0}, // MTLOrigin - {(NSUInteger)blit.w, (NSUInteger)blit.h, 1} // MTLSize + {0, 0, 0}, // MTLOrigin + {(NSUInteger)blit.w, (NSUInteger)blit.h, 1} // MTLSize }; uint32_t chunkNum = blit.chunkNum; @@ -428,8 +423,8 @@ - (void)uploadTexturesIfNeeded:(id)blitEncoder self->_bufferOffset = 0; }]; } - - // mipgen after possible initial blit above + + // mipgen possible after initial blit above if (_mipgenTextures.count > 0) { for (id texture in _mipgenTextures) { // autogen mips will include srgb conversions, so toggling srgb on/off @@ -437,7 +432,7 @@ - (void)uploadTexturesIfNeeded:(id)blitEncoder [blitEncoder generateMipmapsForTexture:texture]; } - // reset the arra + // reset the array [_mipgenTextures removeAllObjects]; } } @@ -445,11 +440,11 @@ - (void)uploadTexturesIfNeeded:(id)blitEncoder - (void)releaseAllPendingTextures { mylock lock(gTextureLock); - + _bufferOffset = 0; - + _blits.clear(); - + [_mipgenTextures removeAllObjects]; [_blitTextures removeAllObjects]; } @@ -463,14 +458,14 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) // (f.e. ktx), and another path for private that uses a blitEncoder and must // have block aligned data (f.e. ktxa, ktx2). Could repack ktx data into ktxa // before writing to temporary file, or when copying NSData into MTLBuffer. 
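The blit path below packs every mip of every chunk into one shared staging MTLBuffer and bumps a cursor to an aligned offset before each upload is recorded. The body of kram's alignOffset() is not part of this diff, so the following is only a minimal sketch of that kind of rounding, assuming a power-of-two alignment; the name alignOffsetSketch is illustrative, not from the patch.

#include <cstdint>

// Round offset up to the next multiple of alignment.
// Assumes alignment is a power of two (e.g. 16 for block-compressed mip data).
inline uint64_t alignOffsetSketch(uint64_t offset, uint64_t alignment)
{
    return (offset + alignment - 1) & ~(alignment - 1);
}

With that shape of helper, each mip's bufferOffset can be advanced past the previous copy and still satisfy the blit encoder's alignment requirements.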
-- (nullable id)blitTextureFromImage:(const KTXImage &)image name:(const char*)name +- (nullable id)blitTextureFromImage:(const KTXImage &)image name:(const char *)name { mylock lock(gTextureLock); - + if (_buffer == nil) { // Was set to 128, but models like FlightHelmet.gltf exceeded that buffer static const size_t kStagingBufferSize = 256 * 1024 * 1024; - + // this is enough to upload 4k x 4x @ RGBA8u with mips, 8k x 8k compressed // with mips @96MB [self createStagingBufffer:kStagingBufferSize]; @@ -486,10 +481,10 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) id texture = [self createTexture:image isPrivate:true]; if (!texture) return nil; - + // set a label so can identify in captures texture.label = [NSString stringWithUTF8String:name]; - + // this is index where texture will be added uint32_t textureIndex = (uint32_t)_blitTextures.count; @@ -497,7 +492,7 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) // upload mip levels // TODO: about aligning to 4k for base + length - // http://metalkit.org/2017/05/26/working-with-memory-in-metal-part-2.html + // http://metalkit.org/working-with-memory-in-metal/ uint32_t w = image.width; uint32_t h = image.height; @@ -515,8 +510,8 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) size_t blockSize = image.blockSize(); vector bufferOffsets; - uint8_t* bufferData = (uint8_t*)_buffer.contents; - const uint8_t* mipData = (const uint8_t*)image.fileData; + uint8_t *bufferData = (uint8_t *)_buffer.contents; + const uint8_t *mipData = (const uint8_t *)image.fileData; bufferOffsets.resize(image.mipLevels.size()); uint32_t numChunks = image.totalChunks(); @@ -534,8 +529,7 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) KLOGE("kramv", "Ran out of buffer space to upload images"); return nil; } - - + // this may have to decompress the level data if (!image.unpackLevel(i, mipData + mipLevel.offset, bufferData + bufferOffset)) { @@ -568,7 +562,7 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) uint32_t bytesPerRow = 0; // 1D/1DArray textures set bytesPerRow to 0 - if ( // image.textureType != MyMTLTextureType1D && + if ( // image.textureType != MyMTLTextureType1D && image.textureType != MyMTLTextureType1DArray) { // for compressed, bytesPerRow needs to be multiple of block size // so divide by the number of blocks making up the height @@ -616,7 +610,7 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) mipLevelNumber, mipStorageSize, mipOffset, textureIndex, bytesPerRow, - is3D // could derive from textureIndex lookup + is3D // could derive from textureIndex lookup }); } } diff --git a/kramv/KramRenderer.h b/kramv/KramRenderer.h index de20df75..adc72336 100644 --- a/kramv/KramRenderer.h +++ b/kramv/KramRenderer.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -8,7 +8,7 @@ #import #include "KramLib.h" -#import "KramShaders.h" // for TextureChannels +#import "KramShaders.h" // for TextureChannels // Turn on GLTF loading support for 3d models. This relies on Warren Moore's first GLTFKit // which only handles import and synchronous loading. @@ -28,19 +28,17 @@ //@import GLTFMTL; #endif - namespace kram { class ShowSettings; class Data; class KTXImage; -} +} //namespace kram // Need renderer to be able to call back up to view to update hud. 
@protocol MyMTKViewUpdates - (void)updateEyedropperText; @end - // Our platform independent renderer class. Implements the MTKViewDelegate // protocol which // allows it to accept per-frame update and drawable resize callbacks. @@ -53,7 +51,7 @@ class KTXImage; - (nonnull instancetype)initWithMetalKitView:(nonnull MTKView *)view settings: (nonnull kram::ShowSettings *)settings - data:(nonnull kram::Data *)data; + data:(nonnull kram::Data *)data; - (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString timestamp:(double)timestamp @@ -66,18 +64,17 @@ class KTXImage; - (BOOL)hotloadShaders:(nonnull const char *)filename; - // unload textures and gltf model textures - (void)releaseAllPendingTextures; // load a gltf model -- (BOOL)loadModel:(nonnull const char*)url; +- (BOOL)loadModel:(nonnull const char *)url; // unload gltf model - (void)unloadModel; // called from view and renderer in render loop -- (void)updateAnimationState:(nonnull MTKView*)view; +- (void)updateAnimationState:(nonnull MTKView *)view; // So caller can respond to completed callback - (void)setEyedropperDelegate:(nullable id)delegate; @@ -86,13 +83,12 @@ class KTXImage; - (void)setFramePacingEnabled:(bool)enable; // can play animations in gltf models -@property (nonatomic) BOOL playAnimations; +@property(nonatomic) BOOL playAnimations; // can toggle on/off srgb if that is psosible -@property (nonatomic) BOOL isToggleView; +@property(nonatomic) BOOL isToggleView; // true if a toggle is present -@property (nonatomic) BOOL hasToggleView; +@property(nonatomic) BOOL hasToggleView; @end - diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 50895c0a..c5985f94 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -16,6 +16,7 @@ // c interface to signposts similar to dtrace on macOS/iOS #include + #include // for recursive_mutex using mymutex = std::recursive_mutex; @@ -23,8 +24,7 @@ os_log_t gLogKramv = os_log_create("com.hialec.kramv", ""); -class Signpost -{ +class Signpost { public: Signpost(const char* name) : _name(name), _ended(false) @@ -34,12 +34,12 @@ else _ended = true; } - + ~Signpost() { stop(); } - + void stop() { if (!_ended) { @@ -47,13 +47,12 @@ void stop() _ended = true; } } - + private: const char* _name; bool _ended; }; - #if USE_GLTF // TODO: make part of renderer @@ -64,12 +63,12 @@ void stop() @interface KramGLTFTextureLoader : NSObject - (instancetype)initWithLoader:(KramLoader*)loader; -- (id _Nullable)newTextureWithContentsOfURL:(NSURL *)url options:(NSDictionary * _Nullable)options error:(NSError **)error; -- (id _Nullable)newTextureWithData:(NSData *)data options:(NSDictionary * _Nullable)options error:(NSError **)error; +- (id _Nullable)newTextureWithContentsOfURL:(NSURL*)url options:(NSDictionary* _Nullable)options error:(NSError**)error; +- (id _Nullable)newTextureWithData:(NSData*)data options:(NSDictionary* _Nullable)options error:(NSError**)error; @end @interface KramGLTFTextureLoader () -@property (nonatomic, strong) KramLoader* loader; +@property(nonatomic, strong) KramLoader* loader; @end @implementation KramGLTFTextureLoader @@ -83,13 +82,13 @@ - (instancetype)initWithLoader:(KramLoader*)loader } // TODO: this ignores options and error. Default png loading may need to request srgb. 
-- (id _Nullable)newTextureWithContentsOfURL:(NSURL *)url options:(NSDictionary * _Nullable)options error:(NSError * __autoreleasing *)error +- (id _Nullable)newTextureWithContentsOfURL:(NSURL*)url options:(NSDictionary* _Nullable)options error:(NSError* __autoreleasing*)error { return [_loader loadTextureFromURL:url originalFormat:nil]; } // TODO: this ignores options and error. Default png loading may need to request srgb. -- (id _Nullable)newTextureWithData:(NSData *)data options:(NSDictionary * _Nullable)options error:(NSError * __autoreleasing *)error +- (id _Nullable)newTextureWithData:(NSData*)data options:(NSDictionary* _Nullable)options error:(NSError* __autoreleasing*)error { return [_loader loadTextureFromData:data originalFormat:nil]; } @@ -98,11 +97,10 @@ - (instancetype)initWithLoader:(KramLoader*)loader #endif - static const NSUInteger MaxBuffersInFlight = 3; using namespace kram; -using namespace simd; +using namespace SIMD_NAMESPACE; // Capture what we need to build the renderPieplines, without needing view struct ViewFramebufferData { @@ -126,7 +124,7 @@ @implementation Renderer { id _pipelineStateVolume; id _pipelineStateDrawLines; - + id _pipelineState1DArrayCS; id _pipelineStateImageCS; id _pipelineStateImageArrayCS; @@ -137,7 +135,6 @@ @implementation Renderer { id _depthStateFull; id _depthStateNone; - MTLVertexDescriptor* _mtlVertexDescriptor; // TODO: Array< id > _textures; @@ -145,9 +142,9 @@ @implementation Renderer { id _colorMapView; id _normalMap; id _diffMap; - + id _lastDrawableTexture; - + // border is a better edge sample, but at edges it filters in the transparent // color around the border which is undesirable. It would be better if the hw // did clamp to edge until uv outside 0 to 1. This results in having to inset @@ -168,12 +165,11 @@ @implementation Renderer { uint8_t _uniformBufferIndex; - // float _rotation; KramLoader* _loader; MTKMesh* _mesh; - MDLVertexDescriptor *_mdlVertexDescriptor; + MDLVertexDescriptor* _mdlVertexDescriptor; MTKMesh* _meshRect; MTKMesh* _meshBox; @@ -181,7 +177,7 @@ @implementation Renderer { MTKMesh* _meshSphereMirrored; // MTKMesh* _meshCylinder; MTKMesh* _meshCapsule; - MTKMeshBufferAllocator *_metalAllocator; + MTKMeshBufferAllocator* _metalAllocator; id _shaderLibrary; NSURL* _metallibFileURL; @@ -190,22 +186,22 @@ @implementation Renderer { ShowSettings* _showSettings; Data* _data; - + #if USE_GLTF KramGLTFTextureLoader* _textureLoader; id _bufferAllocator; GLTFMTLRenderer* _gltfRenderer; GLTFAsset* _asset; // only 1 for now double _animationTime; - + id _environmentTexture; bool _environmentNeedsUpdate; - + NSURLSession* _urlSession; #endif __weak id _delegateHud; - + bool _useFramePacing; double _avgGpuTime; } @@ -214,9 +210,9 @@ @implementation Renderer { @synthesize isToggleView; @synthesize hasToggleView; -- (nonnull instancetype)initWithMetalKitView:(nonnull MTKView *)view - settings:(nonnull ShowSettings *)settings - data:(nonnull Data*)data +- (nonnull instancetype)initWithMetalKitView:(nonnull MTKView*)view + settings:(nonnull ShowSettings*)settings + data:(nonnull Data*)data { self = [super init]; if (self) { @@ -224,6 +220,11 @@ - (nonnull instancetype)initWithMetalKitView:(nonnull MTKView *)view _data = data; _device = view.device; + // 11gb on a 16gb machine. + // Represents the max size of a render encoder. 
+ double kInvOneMB = 1.0 / (1024.0 * 1024.0); + KLOGI("Rendererr", "%0.3f mb", _device.recommendedMaxWorkingSetSize * kInvOneMB ); + _loader = [KramLoader new]; _loader.device = _device; @@ -232,25 +233,24 @@ - (nonnull instancetype)initWithMetalKitView:(nonnull MTKView *)view _inFlightSemaphore = dispatch_semaphore_create(MaxBuffersInFlight); [self _loadMetalWithView:view]; [self _loadAssets]; - + #if USE_GLTF _bufferAllocator = [[GLTFMTLBufferAllocator alloc] initWithDevice:_device]; _gltfRenderer = [[GLTFMTLRenderer alloc] initWithDevice:_device]; - + // This aliases the existing kram loader, can handle png, ktx, ktx2 _textureLoader = [[KramGLTFTextureLoader alloc] initWithLoader:_loader]; _gltfRenderer.textureLoader = _textureLoader; - + // load the environment from a cube map for now // runs this after _shaderLibrary established above - _gltfRenderer.lightingEnvironment = [[GLTFMTLLightingEnvironment alloc] initWithLibrary: _shaderLibrary]; - + _gltfRenderer.lightingEnvironment = [[GLTFMTLLightingEnvironment alloc] initWithLibrary:_shaderLibrary]; + //NSURL* environmentURL = [[NSBundle mainBundle] URLForResource:@"piazza_san_marco" withExtension:@"ktx"]; NSURL* environmentURL = [[NSBundle mainBundle] URLForResource:@"tropical_beach" withExtension:@"ktx"]; _environmentTexture = [_loader loadTextureFromURL:environmentURL originalFormat:nil]; _environmentNeedsUpdate = true; #endif - } return self; @@ -258,7 +258,7 @@ - (nonnull instancetype)initWithMetalKitView:(nonnull MTKView *)view - (void)_createSamplers { - MTLSamplerDescriptor *samplerDescriptor = [MTLSamplerDescriptor new]; + MTLSamplerDescriptor* samplerDescriptor = [MTLSamplerDescriptor new]; samplerDescriptor.minFilter = MTLSamplerMinMagFilterNearest; samplerDescriptor.magFilter = MTLSamplerMinMagFilterNearest; samplerDescriptor.mipFilter = MTLSamplerMipFilterNearest; @@ -294,7 +294,7 @@ - (void)_createSamplers samplerDescriptor.minFilter = MTLSamplerMinMagFilterLinear; samplerDescriptor.magFilter = MTLSamplerMinMagFilterLinear; samplerDescriptor.mipFilter = MTLSamplerMipFilterLinear; - samplerDescriptor.maxAnisotropy = 4; // 1,2,4,8,16 are choices + samplerDescriptor.maxAnisotropy = 4; // 1,2,4,8,16 are choices samplerDescriptor.sAddressMode = MTLSamplerAddressModeClampToBorderColor; samplerDescriptor.tAddressMode = MTLSamplerAddressModeClampToBorderColor; @@ -332,19 +332,19 @@ - (void)_createVertexDescriptor BufferIndexMeshPosition; _mtlVertexDescriptor.attributes[VertexAttributeTexcoord].format = - MTLVertexFormatFloat2; // TODO: compress + MTLVertexFormatFloat2; // TODO: compress _mtlVertexDescriptor.attributes[VertexAttributeTexcoord].offset = 0; _mtlVertexDescriptor.attributes[VertexAttributeTexcoord].bufferIndex = BufferIndexMeshUV0; _mtlVertexDescriptor.attributes[VertexAttributeNormal].format = - MTLVertexFormatFloat3; // TODO: compress + MTLVertexFormatFloat3; // TODO: compress _mtlVertexDescriptor.attributes[VertexAttributeNormal].offset = 0; _mtlVertexDescriptor.attributes[VertexAttributeNormal].bufferIndex = BufferIndexMeshNormal; _mtlVertexDescriptor.attributes[VertexAttributeTangent].format = - MTLVertexFormatFloat4; // TODO: compress + MTLVertexFormatFloat4; // TODO: compress _mtlVertexDescriptor.attributes[VertexAttributeTangent].offset = 0; _mtlVertexDescriptor.attributes[VertexAttributeTangent].bufferIndex = BufferIndexMeshTangent; @@ -373,7 +373,7 @@ - (void)_createVertexDescriptor MDLVertexAttributeTangent; } -- (void)_loadMetalWithView:(nonnull MTKView *)view +- (void)_loadMetalWithView:(nonnull 
MTKView*)view { /// Load Metal state objects and initialize renderer dependent view properties @@ -382,19 +382,18 @@ - (void)_loadMetalWithView:(nonnull MTKView *)view // true is good for non-srgb -> rgba16f CGColorSpaceRef viewColorSpace; MTLPixelFormat format = MTLPixelFormatRGBA16Float; - + // This doesn't look like Figma or Photoshop for a rgb,a = 255,0 to 255,1 gradient across a 256px wide rect. The shader is saturating // the color to 0,1. So can get away with SRGB color space for now. // This also lines up with Preview. //viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceGenericRGBLinear); - - + //CAMetalLayer* metalLayer = (CAMetalLayer*)[view layer]; - + // was using 16f so could sample hdr images from it // and also so hdr data went out to the display uint32_t colorSpaceChoice = 1; - switch(colorSpaceChoice) { + switch (colorSpaceChoice) { default: case 0: // This is best so far @@ -402,88 +401,86 @@ - (void)_loadMetalWithView:(nonnull MTKView *)view viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceSRGB); //viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceLinearSRGB); break; - + case 1: { // Display P3 is a standard made by Apple that covers the same colour space as DCI-P3, but uses the more neutral D65 as a white point instead of the green white of the DCI standard. // Ideally feed 16-bit color to P3. - + // This also works // 25% larger than srgb format = MTLPixelFormatRGBA16Float; - + // This is industry format // viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceDCIP3); - + // This isn't edr // viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceDisplayP3); - + // Use this because it exists from 10.14.3+ viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceExtendedLinearDisplayP3); - + // don't set this yet. // metalLayer.wantsExtendedDynamicRangeContent = YES; - + // https://developer.apple.com/videos/play/wwdc2021/10161/ - + /* Can detect if on HDR display or not user can mod the brightness, or move to another monitor, need to listen for notification when this changes. - + NSScreen* screen = NSScreen.mainScreen; - + // This reports 1 CGFloat val1 = screen.maximumExtendedDynamicRangeColorComponentValue; - + // This is 16 CGFloat maxPot = screen.maximumPotentialExtendedDynamicRangeColorComponentValue; - + // This is 0 CGFloat maxRef = screen.maximumReferenceExtendedDynamicRangeColorComponentValue; */ - + // M1 monitor - - + break; } case 2: // This doesn't match wnen srgb is turned off on TestColorGradient format = MTLPixelFormatRGBA8Unorm_sRGB; viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceSRGB); - + // this looks totally wrong //viewColorSpace = CGColorSpaceCreateWithName(kCGColorLinearSpaceSRGB); break; - - /* - case 3: { - // There is an exrMetadata field on NSView to set as well. - // https://developer.apple.com/documentation/metal/hdr_content/using_color_spaces_to_display_hdr_content?language=objc - - // Rec2020 color primaries, with PQ Transfer function. 
- // Would have to get into Rec2020 colors to set this, also go from 10bit - format = MTLPixelFormatBGR10A2Unorm; - viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceITUR_2100_PQ); - - metalLayer.wantsExtendedDynamicRangeContent = YES; - - // https://developer.apple.com/documentation/metal/hdr_content/using_system_tone_mapping_on_video_content?language=objc - // must do os version check on this - // 1.0 is 100 nits of output - CAEDRMetadata* edrMetaData = [CAEDRMetadata HDR10MetadataWithMinLuminance: 0.005 maxLuminance: 1000 opticalOutputScale: 100]; - metalLayer.EDRMetadata = edrMetaData; - - break; - } - */ + + /* + case 3: { + // There is an exrMetadata field on NSView to set as well. + // https://developer.apple.com/documentation/metal/hdr_content/using_color_spaces_to_display_hdr_content?language=objc + + // Rec2020 color primaries, with PQ Transfer function. + // Would have to get into Rec2020 colors to set this, also go from 10bit + format = MTLPixelFormatBGR10A2Unorm; + viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceITUR_2100_PQ); + + metalLayer.wantsExtendedDynamicRangeContent = YES; + + // https://developer.apple.com/documentation/metal/hdr_content/using_system_tone_mapping_on_video_content?language=objc + // must do os version check on this + // 1.0 is 100 nits of output + CAEDRMetadata* edrMetaData = [CAEDRMetadata HDR10MetadataWithMinLuminance: 0.005 maxLuminance: 1000 opticalOutputScale: 100]; + metalLayer.EDRMetadata = edrMetaData; + + break; + } + */ } - - + view.colorPixelFormat = format; view.colorspace = viewColorSpace; - + CGColorSpaceRelease(viewColorSpace); - + view.depthStencilPixelFormat = MTLPixelFormatDepth32Float_Stencil8; view.sampleCount = 1; @@ -501,17 +498,13 @@ - (void)_loadMetalWithView:(nonnull MTKView *)view //----------------------- - MTLDepthStencilDescriptor *depthStateDesc = + MTLDepthStencilDescriptor* depthStateDesc = [[MTLDepthStencilDescriptor alloc] init]; - depthStateDesc.depthCompareFunction = _showSettings->isReverseZ - ? MTLCompareFunctionGreaterEqual - : MTLCompareFunctionLessEqual; + depthStateDesc.depthCompareFunction = MTLCompareFunctionGreaterEqual; depthStateDesc.depthWriteEnabled = YES; _depthStateFull = [_device newDepthStencilStateWithDescriptor:depthStateDesc]; - depthStateDesc.depthCompareFunction = _showSettings->isReverseZ - ? 
MTLCompareFunctionGreaterEqual - : MTLCompareFunctionLessEqual; + depthStateDesc.depthCompareFunction = MTLCompareFunctionGreaterEqual; depthStateDesc.depthWriteEnabled = NO; _depthStateNone = [_device newDepthStencilStateWithDescriptor:depthStateDesc]; @@ -534,7 +527,7 @@ - (void)_loadMetalWithView:(nonnull MTKView *)view [self _createSampleRender]; } -- (BOOL)hotloadShaders:(const char *)filename +- (BOOL)hotloadShaders:(const char*)filename { _metallibFileURL = [NSURL fileURLWithPath:[NSString stringWithUTF8String:filename]]; @@ -553,7 +546,7 @@ - (BOOL)hotloadShaders:(const char *)filename _metallibFileDate = fileDate; // Now dynamically load the metallib - NSData *dataNS = [NSData dataWithContentsOfURL:_metallibFileURL + NSData* dataNS = [NSData dataWithContentsOfURL:_metallibFileURL options:NSDataReadingMappedIfSafe error:&err]; if (dataNS == nil) { @@ -579,7 +572,7 @@ - (BOOL)hotloadShaders:(const char *)filename return YES; } -- (id)_createComputePipeline:(const char *)name +- (id)_createComputePipeline:(const char*)name { NSString* nameNS = [NSString stringWithUTF8String:name]; NSError* error = nil; @@ -614,8 +607,8 @@ - (void)_createComputePipelines [self _createComputePipeline:"SampleImage1DArrayCS"]; } -- (id)_createRenderPipeline:(const char *)vs - fs:(const char *)fs +- (id)_createRenderPipeline:(const char*)vs + fs:(const char*)fs { NSString* vsNameNS = [NSString stringWithUTF8String:vs]; NSString* fsNameNS = [NSString stringWithUTF8String:fs]; @@ -633,7 +626,7 @@ - (void)_createComputePipelines // Note: blending is disabled on color target, all blending done in shader // since have checkerboard and other stuff to blend against. - + // TODO: could drop these for images, but want a 3D preview of content // or might make these memoryless. 
pipelineStateDescriptor.depthAttachmentPixelFormat = @@ -684,10 +677,9 @@ - (void)_createRenderPipelines fs:"DrawCubeArrayPS"]; _pipelineStateVolume = [self _createRenderPipeline:"DrawVolumeVS" fs:"DrawVolumePS"]; - + _pipelineStateDrawLines = [self _createRenderPipeline:"DrawLinesVS" fs:"DrawLinesPS"]; - } - (void)_createSampleRender @@ -721,9 +713,9 @@ - (void)_createSampleRender } } -- (MTKMesh *)_createMeshAsset:(const char *)name - mdlMesh:(MDLMesh *)mdlMesh - doFlipUV:(bool)doFlipUV +- (MTKMesh*)_createMeshAsset:(const char*)name + mdlMesh:(MDLMesh*)mdlMesh + doFlipUV:(bool)doFlipUV { NSError* error = nil; @@ -737,10 +729,10 @@ - (MTKMesh *)_createMeshAsset:(const char *)name id uvs = mdlMesh.vertexBuffers[BufferIndexMeshUV0]; MDLMeshBufferMap* uvsMap = [uvs map]; - packed_float2* uvData = (packed_float2 *)uvsMap.bytes; + packed_float2* uvData = (packed_float2*)uvsMap.bytes; for (uint32_t i = 0; i < mdlMesh.vertexCount; ++i) { - auto &uv = uvData[i]; + auto& uv = uvData[i]; uv.x = 1.0f - uv.x; } @@ -758,13 +750,13 @@ - (MTKMesh *)_createMeshAsset:(const char *)name if (doFlipBitangent) { id uvs = mdlMesh.vertexBuffers[BufferIndexMeshTangent]; MDLMeshBufferMap* uvsMap = [uvs map]; - packed_float4* uvData = (packed_float4 *)uvsMap.bytes; + packed_float4* uvData = (packed_float4*)uvsMap.bytes; for (uint32_t i = 0; i < mdlMesh.vertexCount; ++i) { - // if (uvData[i].w != -1.0f && uvData[i].w != 1.0f) { - // int bp = 0; - // bp = bp; - // } + // if (uvData[i].w != -1.0f && uvData[i].w != 1.0f) { + // int bp = 0; + // bp = bp; + // } uvData[i].w = -uvData[i].w; } @@ -818,12 +810,11 @@ - (MTKMesh *)_createMeshAsset:(const char *)name float x, y, z; }; - - (void)releaseAllPendingTextures { @autoreleasepool { [_loader releaseAllPendingTextures]; - + // also release the model and cached textures in the renderer [self unloadModel]; } @@ -832,15 +823,15 @@ - (void)releaseAllPendingTextures - (void)updateAnimationState:(nonnull MTKView*)view { bool animateDisplay = self.playAnimations; - + // animate the uvPreview until it reaches endPoint, no scrubber yet _showSettings->updateUVPreviewState(); - + if (_showSettings->uvPreviewFrames > 0) { _showSettings->uvPreviewFrames--; animateDisplay = true; } - + view.enableSetNeedsDisplay = !animateDisplay; view.paused = !animateDisplay; } @@ -851,15 +842,15 @@ - (void)setEyedropperDelegate:(nullable id)delegate _delegateHud = delegate; } -- (void)updateModelSettings:(const string &)fullFilename +- (void)updateModelSettings:(const string&)fullFilename { _showSettings->isModel = true; _showSettings->numChannels = 0; // hides rgba - + // don't want any scale on view, or as little as possible _showSettings->imageBoundsX = 1; _showSettings->imageBoundsY = 1; - + BOOL isNewFile = YES; [self resetSomeImageSettings:isNewFile]; } @@ -871,15 +862,13 @@ - (BOOL)loadModel:(nonnull const char*)filename #if USE_GLTF - - - // TODO: move to async version of this, many of these load slow - // but is there a way to cancel the load. Or else move to cgltf which is faster. - // see GLTFKit2. + // TODO: move to async version of this, many of these load slow + // but is there a way to cancel the load. Or else move to cgltf which is faster. + // see GLTFKit2. 
#define DO_ASYNC 0 #if DO_ASYNC - // [GLTFAsset loadAssetWithURL:url bufferAllocator:_bufferAllocator delegate:self]; + // [GLTFAsset loadAssetWithURL:url bufferAllocator:_bufferAllocator delegate:self]; #else @autoreleasepool { GLTFAsset* newAsset = [[GLTFAsset alloc] initWithURL:fileURL bufferAllocator:_bufferAllocator]; @@ -909,15 +898,13 @@ - (void)unloadModel #endif } - - - (void)_createMeshRect:(float)aspectRatioXToY { // This is a box that's smashed down to a thin 2d z plane, can get very close to it // due to the thinness of the volume without nearZ intersect - + /// Load assets into metal objects - MDLMesh *mdlMesh; + MDLMesh* mdlMesh; mdlMesh = [MDLMesh newBoxWithDimensions:(vector_float3){aspectRatioXToY, 1, 0.001} segments:(vector_uint3){1, 1, 1} @@ -927,7 +914,7 @@ - (void)_createMeshRect:(float)aspectRatioXToY // for some reason normals are all n = 1,0,0 which doesn't make sense on a box // for the side that is being viewed. - + // only one of these for now, but really should store per image _meshRect = [self _createMeshAsset:"MeshRect" mdlMesh:mdlMesh doFlipUV:false]; } @@ -972,19 +959,19 @@ - (void)_loadAssets id posBuffer = mdlMesh.vertexBuffers[BufferIndexMeshPosition]; MDLMeshBufferMap* posMap = [posBuffer map]; - packed_float3* posData = (packed_float3 *)posMap.bytes; + packed_float3* posData = (packed_float3*)posMap.bytes; id normalBuffer = mdlMesh.vertexBuffers[BufferIndexMeshNormal]; MDLMeshBufferMap* normalsMap = [normalBuffer map]; - packed_float3* normalData = (packed_float3 *)normalsMap.bytes; + packed_float3* normalData = (packed_float3*)normalsMap.bytes; // vertexCount reports 306, but vertex 289+ are garbage - uint32_t numVertices = 289; // mdlMesh.vertexCount + uint32_t numVertices = 289; // mdlMesh.vertexCount for (uint32_t i = 0; i < numVertices; ++i) { { - auto &pos = posData[i]; + auto& pos = posData[i]; // dumb rotate about Y-axis auto copy = pos; @@ -994,7 +981,7 @@ - (void)_loadAssets } { - auto &normal = normalData[i]; + auto& normal = normalData[i]; auto copy = normal; normal.x = copy.x * cosSin.x - copy.z * cosSin.y; normal.z = copy.x * cosSin.y + copy.z * cosSin.x; @@ -1003,7 +990,7 @@ - (void)_loadAssets // Hack - knock out all bogus tris from ModelIO that lead to garbage tris for (uint32_t i = numVertices; i < mdlMesh.vertexCount; ++i) { - auto &pos = posData[i]; + auto& pos = posData[i]; pos.x = NAN; } } @@ -1030,26 +1017,26 @@ - (void)_loadAssets id uvsBuffer = mdlMesh.vertexBuffers[BufferIndexMeshUV0]; MDLMeshBufferMap* uvsMap = [uvsBuffer map]; - packed_float2* uvData = (packed_float2 *)uvsMap.bytes; + packed_float2* uvData = (packed_float2*)uvsMap.bytes; // this is all aos id posBuffer = mdlMesh.vertexBuffers[BufferIndexMeshPosition]; MDLMeshBufferMap* posMap = [posBuffer map]; - packed_float3 *posData = (packed_float3 *)posMap.bytes; + packed_float3* posData = (packed_float3*)posMap.bytes; id normalsBuffe = mdlMesh.vertexBuffers[BufferIndexMeshNormal]; MDLMeshBufferMap* normalsMap = [normalsBuffe map]; - packed_float3* normalData = (packed_float3 *)normalsMap.bytes; + packed_float3* normalData = (packed_float3*)normalsMap.bytes; // vertexCount reports 306, but vertex 289+ are garbage - uint32_t numVertices = 289; // mdlMesh.vertexCount + uint32_t numVertices = 289; // mdlMesh.vertexCount for (uint32_t i = 0; i < numVertices; ++i) { { - auto &pos = posData[i]; + auto& pos = posData[i]; // dumb rotate about Y-axis auto copy = pos; @@ -1058,18 +1045,18 @@ - (void)_loadAssets } { - auto &normal = normalData[i]; + auto& normal = normalData[i]; 
auto copy = normal; normal.x = copy.x * cosSin.x - copy.z * cosSin.y; normal.z = copy.x * cosSin.y + copy.z * cosSin.x; } - auto &uv = uvData[i]; + auto& uv = uvData[i]; - // if (uv.x < 0.0 || uv.x > 1.0) { - // int bp = 0; - // bp = bp; - // } + // if (uv.x < 0.0 || uv.x > 1.0) { + // int bp = 0; + // bp = bp; + // } // this makes it counterclockwise 0 to 1 float x = uv.x; @@ -1093,7 +1080,7 @@ - (void)_loadAssets // Hack - knock out all bogus tris from ModelIO that lead to garbage tris for (uint32_t i = numVertices; i < mdlMesh.vertexCount; ++i) { - auto &pos = posData[i]; + auto& pos = posData[i]; pos.x = NAN; } @@ -1120,7 +1107,7 @@ - (void)_loadAssets // doFlipUV:true]; mdlMesh = [MDLMesh newCapsuleWithHeight:1.0 - radii:(vector_float2){1.0f/3.0f, 1.0f/3.0f} // circle + radii:(vector_float2){1.0f / 3.0f, 1.0f / 3.0f} // circle // vertical cap subtracted from height radialSegments:16 verticalSegments:1 @@ -1138,7 +1125,8 @@ - (void)_loadAssets } // this aliases the existing string, so can't chop extension -inline const char* toFilenameShort(const char* filename) { +inline const char* toFilenameShort(const char* filename) +{ const char* filenameShort = strrchr(filename, '/'); if (filenameShort == nullptr) { filenameShort = filename; @@ -1149,22 +1137,21 @@ - (void)_loadAssets return filenameShort; } - -- (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString +- (BOOL)loadTextureFromImage:(nonnull const char*)fullFilenameString timestamp:(double)timestamp - image:(kram::KTXImage &)image - imageNormal:(nullable kram::KTXImage *)imageNormal - imageDiff:(nullable kram::KTXImage *)imageDiff + image:(kram::KTXImage&)image + imageNormal:(nullable kram::KTXImage*)imageNormal + imageDiff:(nullable kram::KTXImage*)imageDiff isArchive:(BOOL)isArchive { // image can be decoded to rgba8u if platform can't display format natively // but still want to identify blockSize from original format string fullFilename = fullFilenameString; const char* filenameShort = toFilenameShort(fullFilename.c_str()); - + bool isTextureNew = _showSettings->isFileNew(fullFilename.c_str()); bool isTextureChanged = _showSettings->isFileChanged(fullFilename.c_str(), timestamp); - + if (isTextureChanged) { // synchronously cpu upload from ktx file to buffer, with eventual gpu blit // from buffer to returned texture. TODO: If buffer is full, then something @@ -1178,13 +1165,13 @@ - (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString if (!texture) { return NO; } - + bool isPNG = isPNGFilename(fullFilename.c_str()); - + // to be able to turn on/off srgb, need to set a view id textureView; MyMTLPixelFormat textureFormat = (MyMTLPixelFormat)image.pixelFormat; - + // TODO: may only want to offer on png files, where format is MyMTLPixelFormat viewFormat = textureFormat; if (isPNG) // && isSrgbFormat(textureFormat)) @@ -1195,7 +1182,7 @@ - (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString else { // This may fail. 
textureView = [texture newTextureViewWithPixelFormat:(MTLPixelFormat)viewFormat]; - + textureView.label = [texture.label stringByAppendingString:@"View"]; } @@ -1215,13 +1202,13 @@ - (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString if (imageDiff) { // Note: this name may not be the same name diffTexture = [_loader loadTextureFromImage:*imageDiff - originalFormat:nil - name:filenameShort]; + originalFormat:nil + name:filenameShort]; if (!diffTexture) { return NO; } } - + // if archive contained png, then it's been converted to ktx // so the info below may not reflect original data // Would need original png data to look at header @@ -1250,13 +1237,13 @@ - (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString _colorMapView = textureView; _normalMap = normalTexture; _diffMap = diffTexture; - + self.hasToggleView = _colorMapView != nil; } // this is the actual format, may have been decoded MyMTLPixelFormat format = (MyMTLPixelFormat)_colorMap.pixelFormat; - _data->updateImageSettings(fullFilename, image, format); + _data->updateImageSettings(fullFilename, image, format); } [self resetSomeImageSettings:isTextureNew]; @@ -1264,7 +1251,7 @@ - (BOOL)loadTextureFromImage:(nonnull const char *)fullFilenameString return YES; } -- (BOOL)loadTexture:(nonnull NSURL *)url +- (BOOL)loadTexture:(nonnull NSURL*)url { string fullFilename = url.path.UTF8String; @@ -1277,10 +1264,10 @@ - (BOOL)loadTexture:(nonnull NSURL *)url // DONE: tie this to url and modstamp differences double timestamp = fileDate.timeIntervalSince1970; - + bool isTextureNew = _showSettings->isFileNew(fullFilename.c_str()); bool isTextureChanged = _showSettings->isFileChanged(fullFilename.c_str(), timestamp); - + // image can be decoded to rgba8u if platform can't display format natively // but still want to identify blockSize from original format if (isTextureChanged) { @@ -1293,7 +1280,7 @@ - (BOOL)loadTexture:(nonnull NSURL *)url } const char* filenameShort = toFilenameShort(fullFilename.c_str()); - + MTLPixelFormat originalFormatMTL = MTLPixelFormatInvalid; id texture = [_loader loadTextureFromImage:image originalFormat:&originalFormatMTL @@ -1303,11 +1290,11 @@ - (BOOL)loadTexture:(nonnull NSURL *)url } bool isPNG = isPNGFilename(fullFilename.c_str()); - + // to be able to turn on/off srgb, need to set a view id textureView; MyMTLPixelFormat textureFormat = (MyMTLPixelFormat)image.pixelFormat; - + // DONE: may only want to offer on png files, where format is MyMTLPixelFormat viewFormat = textureFormat; if (isPNG) // && isSrgbFormat(textureFormat)) @@ -1318,10 +1305,10 @@ - (BOOL)loadTexture:(nonnull NSURL *)url else { // This may fail. textureView = [texture newTextureViewWithPixelFormat:(MTLPixelFormat)viewFormat]; - + textureView.label = [texture.label stringByAppendingString:@"View"]; } - + // This doesn't look for or load corresponding normal map, but should // this is not the png data, but info on converted png to ktx level @@ -1347,13 +1334,13 @@ - (BOOL)loadTexture:(nonnull NSURL *)url // TODO: should archive work with diff? 
id diffTexture = nil; _showSettings->hasDiffTexture = diffTexture != nil; - + @autoreleasepool { _colorMap = texture; _colorMapView = textureView; _normalMap = nil; _diffMap = nil; - + self.hasToggleView = _colorMapView != nil; } @@ -1366,44 +1353,41 @@ - (BOOL)loadTexture:(nonnull NSURL *)url return YES; } - - (void)resetSomeImageSettings:(BOOL)isNewFile { _data->resetSomeImageSettings(isNewFile); - + // the rect is ar:1 for images float aspectRatioXtoY = _showSettings->imageAspectRatio(); [self _createMeshRect:aspectRatioXtoY]; } - - - (void)_updateGameState { /// Update any game state before encoding rendering commands to our drawable - - Uniforms &uniforms = - *(Uniforms *)_dynamicUniformBuffer[_uniformBufferIndex].contents; - + + Uniforms& uniforms = + *(Uniforms*)_dynamicUniformBuffer[_uniformBufferIndex].contents; + uniforms.isNormal = _showSettings->texContentType == TexContentTypeNormal; uniforms.doShaderPremul = _showSettings->doShaderPremul; uniforms.isSrgbInput = _showSettings->isSRGBShown && isSrgbFormat(_showSettings->originalFormat); uniforms.isSigned = _showSettings->isSigned; uniforms.isSwizzleAGToRG = _showSettings->isSwizzleAGToRG; - + uniforms.isSDF = _showSettings->texContentType == TexContentTypeSDF; uniforms.numChannels = _showSettings->numChannels; uniforms.lightingMode = (ShaderLightingMode)_showSettings->lightingMode; - + MyMTLTextureType textureType = MyMTLTextureType2D; MyMTLPixelFormat textureFormat = MyMTLPixelFormatInvalid; if (_colorMap) { textureType = (MyMTLTextureType)_colorMap.textureType; textureFormat = (MyMTLPixelFormat)_colorMap.pixelFormat; } - + uniforms.isCheckerboardShown = _showSettings->isCheckerboardShown; - + // addressing mode bool isCube = (textureType == MyMTLTextureTypeCube || textureType == MyMTLTextureTypeCubeArray); @@ -1411,27 +1395,27 @@ - (void)_updateGameState bool doEdge = !doWrap; bool doZero = !doEdge; uniforms.isWrap = doWrap ? _showSettings->isWrap : false; - + uniforms.isPreview = _showSettings->isPreview; uniforms.isDiff = _showSettings->isDiff; - + uniforms.isNormalMapPreview = false; if (uniforms.isPreview) { uniforms.isNormalMapPreview = uniforms.isNormal || (_normalMap != nil); - + if (_normalMap != nil) { uniforms.isNormalMapSigned = - isSignedFormat((MyMTLPixelFormat)_normalMap.pixelFormat); - uniforms.isNormalMapSwizzleAGToRG = false; // TODO: need a prop for this + isSignedFormat((MyMTLPixelFormat)_normalMap.pixelFormat); + uniforms.isNormalMapSwizzleAGToRG = false; // TODO: need a prop for this } } - + // a few things to fix before enabling this uniforms.useTangent = _showSettings->useTangent; - + uniforms.gridX = 0; uniforms.gridY = 0; - + if (_showSettings->isPixelGridShown) { uniforms.gridX = 1; uniforms.gridY = 1; @@ -1446,19 +1430,19 @@ - (void)_updateGameState uniforms.gridX = _showSettings->gridSizeX; uniforms.gridY = _showSettings->gridSizeY; } - + // no debug mode when preview kicks on, make it possible to toggle back and // forth more easily uniforms.debugMode = (ShaderDebugMode)_showSettings->debugMode; uniforms.shapeChannel = (ShaderShapeChannel)_showSettings->shapeChannel; uniforms.channels = (ShaderTextureChannels)_showSettings->channels; - + // turn these off in preview mode, but they may be useful? 
if (_showSettings->isPreview) { uniforms.debugMode = ShaderDebugMode::ShDebugModeNone; uniforms.shapeChannel = ShaderShapeChannel::ShShapeChannelNone; } - + // crude shape experiment _showSettings->is3DView = true; switch (_showSettings->meshNumber) { @@ -1481,22 +1465,22 @@ - (void)_updateGameState break; } uniforms.is3DView = _showSettings->is3DView; - + // on small textures can really see missing pixel (3 instead of 4 pixels) // so only do this on the sphere/capsule which wrap-around uv space uniforms.isInsetByHalfPixel = false; if (_showSettings->meshNumber >= 2 && doZero) { uniforms.isInsetByHalfPixel = true; } - + _data->updateTransforms(); - + // this is an animated effect, that overlays the shape uv wires over the image uniforms.isUVPreview = _showSettings->uvPreview > 0.0; uniforms.uvPreview = _showSettings->uvPreview; - + uniforms.uvToShapeRatio = 1.0f; - switch(_showSettings->meshNumber) { + switch (_showSettings->meshNumber) { case 0: if (_showSettings->imageBoundsY) uniforms.uvToShapeRatio = _showSettings->imageBoundsX / (float)_showSettings->imageBoundsY; @@ -1510,15 +1494,15 @@ - (void)_updateGameState } uniforms.projectionViewMatrix = _data->_projectionViewMatrix; uniforms.cameraPosition = _data->_cameraPosition; - + // This is per object uniforms.modelMatrix = _data->_modelMatrix; - uniforms.modelMatrixInvScale2 = _data->_modelMatrixInvScale2; + // uniforms.modelMatrixInvScale2 = _data->_modelMatrixInvScale2; //_rotation += .01; } -- (void)_setUniformsLevel:(UniformsLevel &)uniforms mipLOD:(int32_t)mipLOD +- (void)_setUniformsLevel:(UniformsLevel&)uniforms mipLOD:(int32_t)mipLOD { uniforms.mipLOD = mipLOD; @@ -1562,14 +1546,14 @@ - (void)_setUniformsLevel:(UniformsLevel &)uniforms mipLOD:(int32_t)mipLOD } } -- (void)drawInMTKView:(nonnull MTKView *)view +- (void)drawInMTKView:(nonnull MTKView*)view { @autoreleasepool { // Per frame updates here // update per frame state [self updateAnimationState:view]; - + // TODO: move this out, needs to get called off mouseMove, but don't want to // call drawMain [self drawSample]; @@ -1578,29 +1562,29 @@ - (void)drawInMTKView:(nonnull MTKView *)view Signpost postWait("waitOnSemaphore"); dispatch_semaphore_wait(_inFlightSemaphore, DISPATCH_TIME_FOREVER); postWait.stop(); - + _uniformBufferIndex = (_uniformBufferIndex + 1) % MaxBuffersInFlight; id commandBuffer = [_commandQueue commandBuffer]; commandBuffer.label = @"MyCommand"; __block dispatch_semaphore_t block_sema = _inFlightSemaphore; - - #if USE_GLTF - GLTFMTLRenderer* gltfRenderer = _gltfRenderer; - [commandBuffer addCompletedHandler:^(id /* buffer */) { - [gltfRenderer signalFrameCompletion]; - - // increment count - dispatch_semaphore_signal(block_sema); - }]; - - #else - [commandBuffer addCompletedHandler:^(id /* buffer */) { - // increment count - dispatch_semaphore_signal(block_sema); - }]; - #endif + +#if USE_GLTF + GLTFMTLRenderer* gltfRenderer = _gltfRenderer; + [commandBuffer addCompletedHandler:^(id /* buffer */) { + [gltfRenderer signalFrameCompletion]; + + // increment count + dispatch_semaphore_signal(block_sema); + }]; + +#else + [commandBuffer addCompletedHandler:^(id /* buffer */) { + // increment count + dispatch_semaphore_signal(block_sema); + }]; +#endif [self _updateGameState]; @@ -1622,14 +1606,14 @@ - (void)drawInMTKView:(nonnull MTKView *)view [self drawMain:commandBuffer view:view]; postDraw.stop(); - + // hold onto this for sampling from it via eyedropper id drawable = view.currentDrawable; _lastDrawableTexture = drawable.texture; // These are equivalent 
// [commandBuffer presentDrawable:view.currentDrawable]; - + typeof(self) __weak weakSelf = self; [commandBuffer addScheduledHandler:^(id cmdBuf) { if (cmdBuf.error) return; @@ -1643,28 +1627,31 @@ - (void)drawInMTKView:(nonnull MTKView *)view double gpuTime = cmdBuf.GPUEndTime - cmdBuf.GPUStartTime; [weakSelf _updateFramePacing:gpuTime]; }]; - + [commandBuffer commit]; } } - -- (void)_present:(id)drawable { + +- (void)_present:(id)drawable +{ if (_useFramePacing) [drawable presentAfterMinimumDuration:_avgGpuTime]; else [drawable present]; } -- (void)_updateFramePacing:(double)gpuTime { +- (void)_updateFramePacing:(double)gpuTime +{ if (_useFramePacing) { _avgGpuTime = lerp(_avgGpuTime, gpuTime, 0.25); } } -- (void)setFramePacingEnabled:(bool)enable { +- (void)setFramePacingEnabled:(bool)enable +{ if (_useFramePacing != enable) { _useFramePacing = enable; - + // this will get adjusted by updateFramePacing _avgGpuTime = 1.0 / 60.0; } @@ -1672,32 +1659,32 @@ - (void)setFramePacingEnabled:(bool)enable { #if USE_GLTF -static GLTFBoundingSphere GLTFBoundingSphereFromBox2(const GLTFBoundingBox b) { +static GLTFBoundingSphere GLTFBoundingSphereFromBox2(const GLTFBoundingBox b) +{ GLTFBoundingSphere s; float3 center = 0.5f * (b.minPoint + b.maxPoint); - float r = simd::distance(b.maxPoint, center); - + float r = distance(b.maxPoint, center); + s.center = center; s.radius = r; return s; } #endif - - (void)drawMain:(id)commandBuffer - view:(nonnull MTKView *)view + view:(nonnull MTKView*)view { // Delay getting the currentRenderPassDescriptor until absolutely needed. This // avoids // holding onto the drawable and blocking the display pipeline any longer // than necessary MTLRenderPassDescriptor* renderPassDescriptor = nil; - + // This retrieval can take 20ms+ when gpu is busy Signpost post("nextDrawable"); renderPassDescriptor = view.currentRenderPassDescriptor; post.stop(); - + if (renderPassDescriptor == nil) { return; } @@ -1706,8 +1693,7 @@ - (void)drawMain:(id)commandBuffer #if USE_GLTF && _asset == nil #endif - ) - { + ) { // this will clear target id renderEncoder = [commandBuffer renderCommandEncoderWithDescriptor:renderPassDescriptor]; @@ -1723,9 +1709,8 @@ - (void)drawMain:(id)commandBuffer #if USE_GLTF { mylock lock(gModelLock); - + if (_asset) { - // TODO: needs to be done in the render loop, since it must run compute // This runs compute to generate radiance/irradiance in mip levels // Also an equirect version for a 2d image @@ -1734,14 +1719,13 @@ - (void)drawMain:(id)commandBuffer [_gltfRenderer.lightingEnvironment generateFromCubeTexture:_environmentTexture commandBuffer:commandBuffer]; else [_gltfRenderer.lightingEnvironment generateFromEquirectTexture:_environmentTexture commandBuffer:commandBuffer]; - + _environmentNeedsUpdate = false; } } } #endif - // Final pass rendering code here id renderEncoder = [commandBuffer renderCommandEncoderWithDescriptor:renderPassDescriptor]; @@ -1759,18 +1743,18 @@ - (void)drawMain:(id)commandBuffer [renderEncoder setDepthStencilState:_depthStateFull]; bool drawShape = true; - - #if USE_GLTF + +#if USE_GLTF { mylock lock(gModelLock); if (_asset) { drawShape = false; - + // update animations if (self.playAnimations) { - _animationTime += 1.0/60.0; - + _animationTime += 1.0 / 60.0; + NSTimeInterval maxAnimDuration = 0; for (GLTFAnimation* animation in _asset.animations) { for (GLTFAnimationChannel* channel in animation.channels) { @@ -1779,50 +1763,58 @@ - (void)drawMain:(id)commandBuffer } } } - + NSTimeInterval animTime = fmod(_animationTime, 
maxAnimDuration); - + for (GLTFAnimation* animation in _asset.animations) { [animation runAtTime:animTime]; } } - + // regularization scales the model to 1 unit dimension, may animate out of this box // just a scale to diameter 1, and translate back from center and viewer z GLTFBoundingSphere bounds = GLTFBoundingSphereFromBox2(_asset.defaultScene.approximateBounds); float invScale = (bounds.radius > 0) ? (0.5 / (bounds.radius)) : 1.0; - float4x4 centerScale = float4x4(float4m(invScale,invScale,invScale,1)); + float4x4 centerScale = float4x4(float4m(invScale, invScale, invScale, 1)); + +#if USE_SIMDLIB + float4x4 centerTranslation = float4x4::identity(); +#else float4x4 centerTranslation = matrix_identity_float4x4; +#endif centerTranslation.columns[3] = vector4(-bounds.center, 1.0f); float4x4 regularizationMatrix = centerScale * centerTranslation; - + // incorporate the rotation now - Uniforms &uniforms = - *(Uniforms *)_dynamicUniformBuffer[_uniformBufferIndex].contents; - + Uniforms& uniforms = + *(Uniforms*)_dynamicUniformBuffer[_uniformBufferIndex].contents; + regularizationMatrix = regularizationMatrix * uniforms.modelMatrix; - + // TODO: be able to pass regularization to affect root of modelMatrix tree, // do not modify viewMatrix here since that messes with world space. - + // set the view and projection matrix - _gltfRenderer.viewMatrix = _data->_viewMatrix * regularizationMatrix; - _gltfRenderer.projectionMatrix = _data->_projectionMatrix; - - RenderScope drawModelScope( renderEncoder, "DrawModel" ); + float4x4 m = _data->_viewMatrix * regularizationMatrix; + + // TODO: offer conversions to simd/simd.h + _gltfRenderer.viewMatrix = reinterpret_cast(m); + _gltfRenderer.projectionMatrix = reinterpret_cast(_data->_projectionMatrix); + + RenderScope drawModelScope(renderEncoder, "DrawModel"); [_gltfRenderer renderScene:_asset.defaultScene commandBuffer:commandBuffer commandEncoder:renderEncoder]; } } - #endif - +#endif + if (drawShape) { - RenderScope drawShapeScope( renderEncoder, "DrawShape" ); - + RenderScope drawShapeScope(renderEncoder, "DrawShape"); + // set the mesh shape for (NSUInteger bufferIndex = 0; bufferIndex < _mesh.vertexBuffers.count; bufferIndex++) { - MTKMeshBuffer *vertexBuffer = _mesh.vertexBuffers[bufferIndex]; - if ((NSNull *)vertexBuffer != [NSNull null]) { + MTKMeshBuffer* vertexBuffer = _mesh.vertexBuffers[bufferIndex]; + if ((NSNull*)vertexBuffer != [NSNull null]) { [renderEncoder setVertexBuffer:vertexBuffer.buffer offset:vertexBuffer.offset atIndex:bufferIndex]; @@ -1896,7 +1888,7 @@ - (void)drawMain:(id)commandBuffer id tex = _colorMap; if (self.isToggleView && _colorMap && _colorMapView) tex = _colorMapView; - + // set the texture up [renderEncoder setFragmentTexture:tex atIndex:TextureIndexColor]; @@ -1904,7 +1896,7 @@ - (void)drawMain:(id)commandBuffer if (_normalMap && _showSettings->isPreview) { [renderEncoder setFragmentTexture:_normalMap atIndex:TextureIndexNormal]; } - + if (_diffMap && _showSettings->isDiff) { [renderEncoder setFragmentTexture:_diffMap atIndex:TextureIndexDiff]; } @@ -1912,7 +1904,7 @@ - (void)drawMain:(id)commandBuffer UniformsLevel uniformsLevel; uniformsLevel.drawOffset = float2m(0.0f); uniformsLevel.passNumber = kPassDefault; - + if (_showSettings->isPreview) { // upload this on each face drawn, since want to be able to draw all // mips/levels at once @@ -1946,14 +1938,14 @@ - (void)drawMain:(id)commandBuffer // by the zoom int32_t gap = _showSettings - ->showAllPixelGap; // * _showSettings->viewContentScaleFactor; + 
->showAllPixelGap; // * _showSettings->viewContentScaleFactor; for (int32_t mip = 0; mip < _showSettings->mipCount; ++mip) { // upload this on each face drawn, since want to be able to draw all // mips/levels at once - + [self _setUniformsLevel:uniformsLevel mipLOD:mip]; - + if (mip == 0) { uniformsLevel.drawOffset.y = 0.0f; } @@ -1961,13 +1953,13 @@ - (void)drawMain:(id)commandBuffer // all mips draw at top mip size currently uniformsLevel.drawOffset.y -= h + gap; } - + // this its ktxImage.totalChunks() int32_t numLevels = _showSettings->totalChunks(); - + for (int32_t level = 0; level < numLevels; ++level) { - RenderScope drawLevelScope( renderEncoder, "DrawLevel" ); - + RenderScope drawLevelScope(renderEncoder, "DrawLevel"); + if (isCube) { uniformsLevel.face = level % 6; uniformsLevel.arrayOrSlice = level / 6; @@ -1975,7 +1967,7 @@ - (void)drawMain:(id)commandBuffer else { uniformsLevel.arrayOrSlice = level; } - + // advance x across faces/slices/array elements, 1d array and 2d thin // array are weird though. if (level == 0) { @@ -1984,25 +1976,25 @@ - (void)drawMain:(id)commandBuffer else { uniformsLevel.drawOffset.x += w + gap; } - + [renderEncoder setVertexBytes:&uniformsLevel length:sizeof(uniformsLevel) atIndex:BufferIndexUniformsLevel]; - + [renderEncoder setFragmentBytes:&uniformsLevel length:sizeof(uniformsLevel) atIndex:BufferIndexUniformsLevel]; - + // force lod, and don't mip [renderEncoder setFragmentSamplerState:sampler lodMinClamp:mip lodMaxClamp:mip + 1 atIndex:SamplerIndexColor]; - + // TODO: since this isn't a preview, have mode to display all faces // and mips on on screen faces and arrays and slices go across in a // row, and mips are displayed down from each of those in a column - + for (MTKSubmesh* submesh in _mesh.submeshes) { [renderEncoder drawIndexedPrimitives:submesh.primitiveType indexCount:submesh.indexCount @@ -2012,11 +2004,11 @@ - (void)drawMain:(id)commandBuffer } } } - + for (int32_t mip = 0; mip < _showSettings->mipCount; ++mip) { // upload this on each face drawn, since want to be able to draw all // mips/levels at once - + [self _setUniformsLevel:uniformsLevel mipLOD:mip]; if (mip == 0) { @@ -2038,7 +2030,7 @@ - (void)drawMain:(id)commandBuffer else { uniformsLevel.arrayOrSlice = level; } - + // advance x across faces/slices/array elements, 1d array and 2d thin // array are weird though. 
if (level == 0) { @@ -2047,21 +2039,21 @@ - (void)drawMain:(id)commandBuffer else { uniformsLevel.drawOffset.x += w + gap; } - + [renderEncoder setVertexBytes:&uniformsLevel length:sizeof(uniformsLevel) atIndex:BufferIndexUniformsLevel]; - -// [renderEncoder setFragmentBytes:&uniformsLevel -// length:sizeof(uniformsLevel) -// atIndex:BufferIndexUniformsLevel]; - + + // [renderEncoder setFragmentBytes:&uniformsLevel + // length:sizeof(uniformsLevel) + // atIndex:BufferIndexUniformsLevel]; + // force lod, and don't mip -// [renderEncoder setFragmentSamplerState:sampler -// lodMinClamp:mip -// lodMaxClamp:mip + 1 -// atIndex:SamplerIndexColor]; -// + // [renderEncoder setFragmentSamplerState:sampler + // lodMinClamp:mip + // lodMaxClamp:mip + 1 + // atIndex:SamplerIndexColor]; + // [self drawAtlas:renderEncoder]; } } @@ -2098,23 +2090,23 @@ - (void)drawMain:(id)commandBuffer indexBuffer:submesh.indexBuffer.buffer indexBufferOffset:submesh.indexBuffer.offset]; } - + // Draw uv wire overlay if (_showSettings->is3DView && _showSettings->uvPreview > 0.0) { // need to force color in shader or it's still sampling texture // also need to add z offset - - RenderScope drawUVPreviewScope( renderEncoder, "DrawUVPreview" ); - + + RenderScope drawUVPreviewScope(renderEncoder, "DrawUVPreview"); + [renderEncoder setTriangleFillMode:MTLTriangleFillModeLines]; - + // only applies to tris, not points/lines, pushes depth away (towards 0), after clip // affects reads/tests and writes. Could also add in vertex shader. // depthBias * 2^(exp(max abs(z) in primitive) - r) + slopeScale * maxSlope - [renderEncoder setDepthBias:0.015 slopeScale:3.0 clamp: 0.02]; - + [renderEncoder setDepthBias:0.015 slopeScale:3.0 clamp:0.02]; + uniformsLevel.passNumber = kPassUVPreview; - + [renderEncoder setVertexBytes:&uniformsLevel length:sizeof(uniformsLevel) atIndex:BufferIndexUniformsLevel]; @@ -2130,15 +2122,14 @@ - (void)drawMain:(id)commandBuffer indexBuffer:submesh.indexBuffer.buffer indexBufferOffset:submesh.indexBuffer.offset]; } - + uniformsLevel.passNumber = kPassDefault; - + // restore state, even though this isn't a true state shadow [renderEncoder setDepthBias:0.0 slopeScale:0.0 clamp:0.0]; [renderEncoder setTriangleFillMode:MTLTriangleFillModeFill]; - } - + [self drawAtlas:renderEncoder]; } } @@ -2150,16 +2141,15 @@ - (void)drawMain:(id)commandBuffer // TODO: environment map preview should be done as fsq } -class RenderScope -{ +class RenderScope { public: RenderScope(id encoder_, const char* name) : encoder(encoder_) { id enc = (id)encoder; - [enc pushDebugGroup: [NSString stringWithUTF8String: name]]; + [enc pushDebugGroup:[NSString stringWithUTF8String:name]]; } - + void close() { if (encoder) { @@ -2168,51 +2158,52 @@ void close() encoder = nil; } } - + ~RenderScope() { close(); } + private: id encoder; }; -- (void)drawAtlas:(nonnull id)renderEncoder { +- (void)drawAtlas:(nonnull id)renderEncoder +{ // draw last since this changes pipeline state if (_showSettings->is3DView && _showSettings->atlas.empty()) return; - + //if (!_showSettings->drawAtlas) // return; - - RenderScope drawAtlasScope( renderEncoder, "DrawAtlas" ); - + + RenderScope drawAtlasScope(renderEncoder, "DrawAtlas"); + [renderEncoder setTriangleFillMode:MTLTriangleFillModeLines]; - [renderEncoder setDepthBias:5.0 slopeScale:0.0 clamp: 0.0]; + [renderEncoder setDepthBias:5.0 slopeScale:0.0 clamp:0.0]; [renderEncoder setCullMode:MTLCullModeNone]; - + [renderEncoder setRenderPipelineState:_pipelineStateDrawLines]; - + // TODO: draw line strip with 
prim reset // need atlas data in push constants or in vb - + // TOOO: also need to hover name or show names on canvas - -// [renderEncoder setVertexBytes:&uniformsLevel -// length:sizeof(uniformsLevel) -// atIndex:BufferIndexUniformsLevel]; + + // [renderEncoder setVertexBytes:&uniformsLevel + // length:sizeof(uniformsLevel) + // atIndex:BufferIndexUniformsLevel]; UniformsDebug uniformsDebug; - - for (const Atlas& atlas: _showSettings->atlas) { + + for (const Atlas& atlas : _showSettings->atlas) { // not accounting for slice uniformsDebug.rect = float4m(atlas.x, atlas.y, atlas.w, atlas.h); - - + [renderEncoder setVertexBytes:&uniformsDebug length:sizeof(uniformsDebug) atIndex:BufferIndexUniformsDebug]; - + // this will draw diagonal for (MTKSubmesh* submesh in _mesh.submeshes) { [renderEncoder drawIndexedPrimitives:submesh.primitiveType @@ -2222,7 +2213,7 @@ - (void)drawAtlas:(nonnull id)renderEncoder { indexBufferOffset:submesh.indexBuffer.offset]; } } - + // restore state, even though this isn't a true state shadow [renderEncoder setCullMode:MTLCullModeBack]; [renderEncoder setDepthBias:0.0 slopeScale:0.0 clamp:0.0]; @@ -2249,7 +2240,7 @@ - (void)drawSample // this reads directly from compressed texture via a compute shader int32_t textureLookupX = _showSettings->textureLookupX; int32_t textureLookupY = _showSettings->textureLookupY; - + bool isDrawableBlit = _showSettings->isEyedropperFromDrawable(); // TODO: only don't blit for plane + no debug or shape @@ -2315,14 +2306,15 @@ - (void)drawSample // copy from texture back to CPU, might be easier using MTLBuffer.contents MTLRegion region = { - {0, 0, 0}, // MTLOrigin - {1, 1, 1} // MTLSize + {0, 0, 0}, // MTLOrigin + {1, 1, 1} // MTLSize }; if (isDrawableBlit) { half4 data16f; [texture getBytes:&data16f bytesPerRow:8 fromRegion:region mipmapLevel:0]; - data = toFloat4(data16f); + + data = float4m(data16f); } else { [texture getBytes:&data bytesPerRow:16 fromRegion:region mipmapLevel:0]; @@ -2335,13 +2327,13 @@ - (void)drawSample self->_showSettings->textureResult = data; self->_showSettings->textureResultX = textureLookupX; self->_showSettings->textureResultY = textureLookupY; - + [self->_delegateHud updateEyedropperText]; }); - + // TODO: This completed handler runs long after the hud has updated // so need to invalidate the hud. So the pixel location is out of date. - + // printf("Color %f %f %f %f\n", data.x, data.y, data.z, data.w); }]; @@ -2360,8 +2352,8 @@ - (void)drawSamples:(id)commandBuffer renderEncoder.label = @"SampleCompute"; - RenderScope drawShapeScope( renderEncoder, "DrawShape" ); - + RenderScope drawShapeScope(renderEncoder, "DrawShape"); + UniformsCS uniforms; uniforms.uv.x = lookupX; uniforms.uv.y = lookupY; @@ -2404,7 +2396,7 @@ - (void)drawSamples:(id)commandBuffer id tex = _colorMap; if (self.isToggleView && _colorMap && _colorMapView) tex = _colorMapView; - + // input and output texture [renderEncoder setTexture:tex atIndex:TextureIndexColor]; @@ -2422,7 +2414,7 @@ - (void)drawSamples:(id)commandBuffer [renderEncoder endEncoding]; } -- (void)mtkView:(nonnull MTKView *)view drawableSizeWillChange:(CGSize)size +- (void)mtkView:(nonnull MTKView*)view drawableSizeWillChange:(CGSize)size { // Don't crashing trying to readback from the cached drawable during a resize. 
_lastDrawableTexture = nil; @@ -2440,49 +2432,46 @@ - (void)mtkView:(nonnull MTKView *)view drawableSizeWillChange:(CGSize)size _showSettings->viewContentScaleFactor = framebufferScale; _data->updateProjTransform(); - + #if USE_GLTF _gltfRenderer.drawableSize = size; _gltfRenderer.colorPixelFormat = view.colorPixelFormat; _gltfRenderer.depthStencilPixelFormat = view.depthStencilPixelFormat; #endif - + _data->updateProjTransform(); } #if USE_GLTF // @protocol GLTFAssetLoadingDelegate -- (void)assetWithURL:(NSURL *)assetURL requiresContentsOfURL:(NSURL *)url completionHandler:(void (^)(NSData *_Nullable, NSError *_Nullable))completionHandler +- (void)assetWithURL:(NSURL*)assetURL requiresContentsOfURL:(NSURL*)url completionHandler:(void (^)(NSData* _Nullable, NSError* _Nullable))completionHandler { // This can handle remote assets - NSURLSessionDataTask *task = [_urlSession dataTaskWithURL:url - completionHandler:^(NSData *data, NSURLResponse *response, NSError *error) - { - completionHandler(data, error); - }]; - + NSURLSessionDataTask* task = [_urlSession dataTaskWithURL:url + completionHandler:^(NSData* data, NSURLResponse* response, NSError* error) { + completionHandler(data, error); + }]; + [task resume]; } -- (void)assetWithURL:(NSURL *)assetURL didFinishLoading:(GLTFAsset *)asset +- (void)assetWithURL:(NSURL*)assetURL didFinishLoading:(GLTFAsset*)asset { mylock lock(gModelLock); - + _asset = asset; - + _animationTime = 0.0; - + string fullFilename = assetURL.path.UTF8String; [self updateModelSettings:fullFilename]; } -- (void)assetWithURL:(NSURL *)assetURL didFailToLoadWithError:(NSError *)error; +- (void)assetWithURL:(NSURL*)assetURL didFailToLoadWithError:(NSError*)error; { // TODO: display this error to the user KLOGE("Renderer", "Asset load failed with error: %s", [[error localizedDescription] UTF8String]); } #endif - - @end diff --git a/kramv/KramViewerBase.cpp b/kramv/KramViewerBase.cpp index 02b23eb3..74df253e 100644 --- a/kramv/KramViewerBase.cpp +++ b/kramv/KramViewerBase.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -13,8 +13,8 @@ #endif namespace kram { -using namespace simd; -using namespace NAMESPACE_STL; +using namespace SIMD_NAMESPACE; +using namespace STL_NAMESPACE; #define ArrayCount(x) (sizeof(x) / sizeof(x[0])) @@ -26,14 +26,14 @@ using namespace NAMESPACE_STL; // Writing out to rgba32 for sampling, but unorm formats like ASTC and RGBA8 // are still off and need to use the following. 
-float toSnorm8(float c) { return (255.0f / 127.0f) * c - (128.0f / 127.0f); } +float toSnorm8(float c) { return (255.0f / 127.0f) * c - (128.0f / 127.0f); } float2 toSnorm8(float2 c) { return (255.0f / 127.0f) * c - (128.0f / 127.0f); } float3 toSnorm8(float3 c) { return (255.0f / 127.0f) * c - (128.0f / 127.0f); } float4 toSnorm8(float4 c) { return (255.0f / 127.0f) * c - (128.0f / 127.0f); } -float4 toSnorm(float4 c) { return 2.0f * c - 1.0f; } +float4 toSnorm(float4 c) { return 2.0f * c - 1.0f; } -inline float4 toPremul(const float4 &c) +inline float4 toPremul(const float4& c) { // premul with a float4 cpremul = c; @@ -48,9 +48,9 @@ inline bool almost_equal_elements(float3 v, float tol) return (fabs(v.x - v.y) < tol) && (fabs(v.x - v.z) < tol); } -inline const float3x3& toFloat3x3(const float4x4 &m) { return (const float3x3 &)m; } +inline const float3x3& toFloat3x3(const float4x4& m) { return (const float3x3&)m; } -float4 inverseScaleSquared(const float4x4 &m) +float4 inverseScaleSquared(const float4x4& m) { float3 scaleSquared = float3m(length_squared(m.columns[0].xyz), length_squared(m.columns[1].xyz), @@ -63,7 +63,7 @@ float4 inverseScaleSquared(const float4x4 &m) // don't divide by 0 float3 invScaleSquared = - recip(simd::max(float3m(0.0001 * 0.0001), scaleSquared)); + recip(SIMD_NAMESPACE::max(float3m(0.0001 * 0.0001), scaleSquared)); // identify determinant here for flipping orientation // all shapes with negative determinant need orientation flipped for @@ -79,7 +79,7 @@ static string filenameNoExtension(const char* filename) if (dotPosStr == nullptr) return filename; auto dotPos = dotPosStr - filename; - + // now chop off the extension string filenameNoExt = filename; return filenameNoExt.substr(0, dotPos); @@ -88,47 +88,48 @@ static string filenameNoExtension(const char* filename) static void findPossibleNormalMapFromAlbedoFilename(const char* filename, vector& normalFilenames) { normalFilenames.clear(); - + string filenameShort = filename; - + const char* ext = strrchr(filename, '.'); const char* dotPosStr = strrchr(filenameShort.c_str(), '.'); if (dotPosStr == nullptr) return; - + auto dotPos = dotPosStr - filenameShort.c_str(); - + // now chop off the extension filenameShort = filenameShort.substr(0, dotPos); - const char* searches[] = { "-a", "-d", "_Color", "_baseColor" }; - + const char* searches[] = {"-a", "-d", "_Color", "_baseColor"}; + for (uint32_t i = 0; i < ArrayCount(searches); ++i) { const char* search = searches[i]; if (endsWith(filenameShort, search)) { - filenameShort = filenameShort.substr(0, filenameShort.length()-strlen(search)); + filenameShort = filenameShort.substr(0, filenameShort.length() - strlen(search)); break; } } - - const char* suffixes[] = { "-n", "_normal", "_Normal" }; - + + const char* suffixes[] = {"-n", "_normal", "_Normal"}; + string normalFilename; for (uint32_t i = 0; i < ArrayCount(suffixes); ++i) { const char* suffix = suffixes[i]; - + // may need to try various names, and see if any exist normalFilename = filenameShort; normalFilename += suffix; normalFilename += ext; - + normalFilenames.push_back(normalFilename); } } // this aliases the existing string, so can't chop extension -inline const char* toFilenameShort(const char* filename) { +inline const char* toFilenameShort(const char* filename) +{ const char* filenameShort = strrchr(filename, '/'); if (filenameShort == nullptr) { filenameShort = filename; @@ -141,8 +142,8 @@ inline const char* toFilenameShort(const char* filename) { static const vector supportedModelExt = { #if 
USE_GLTF - ".gltf", - ".glb", + ".gltf", + ".glb", #endif #if USE_USD ".gltf", @@ -150,24 +151,27 @@ static const vector supportedModelExt = { #endif }; - -bool isSupportedModelFilename(const char* filename) { - for (const char* ext: supportedModelExt) { +bool isSupportedModelFilename(const char* filename) +{ + for (const char* ext : supportedModelExt) { if (endsWithExtension(filename, ext)) { return true; } } return false; } -bool isSupportedArchiveFilename(const char* filename) { +bool isSupportedArchiveFilename(const char* filename) +{ return endsWithExtension(filename, ".zip"); } -bool isSupportedJsonFilename(const char* filename) { +bool isSupportedJsonFilename(const char* filename) +{ return endsWith(filename, "-atlas.json"); } -bool isDirectory(const char* filename) { +bool isDirectory(const char* filename) +{ FileHelper fileHelper; return fileHelper.isDirectory(filename); } @@ -185,9 +189,9 @@ File::File(const char* name_, int32_t urlIndex_) { } -const char *ShowSettings::meshNumberName(uint32_t meshNumber_) const +const char* ShowSettings::meshNumberName(uint32_t meshNumber_) const { - const char *text = ""; + const char* text = ""; switch (meshNumber_) { case 0: @@ -212,9 +216,9 @@ const char *ShowSettings::meshNumberName(uint32_t meshNumber_) const return text; } -const char *ShowSettings::meshNumberText() const +const char* ShowSettings::meshNumberText() const { - const char *text = ""; + const char* text = ""; switch (meshNumber) { case 0: @@ -239,9 +243,9 @@ const char *ShowSettings::meshNumberText() const return text; } -const char *ShowSettings::shapeChannelText() const +const char* ShowSettings::shapeChannelText() const { - const char *text = ""; + const char* text = ""; switch (shapeChannel) { case ShapeChannelNone: @@ -265,7 +269,7 @@ const char *ShowSettings::shapeChannelText() const case ShapeChannelFaceNormal: text = "Show Faces"; break; - // case ShapeChannelBumpNormal: text = "Show Bumps"; break; + // case ShapeChannelBumpNormal: text = "Show Bumps"; break; case ShapeChannelMipLevel: text = "Show Mip Levels"; break; @@ -276,9 +280,9 @@ const char *ShowSettings::shapeChannelText() const return text; } -const char *ShowSettings::debugModeText() const +const char* ShowSettings::debugModeText() const { - const char *text = ""; + const char* text = ""; switch (debugMode) { case DebugModeNone: @@ -314,9 +318,9 @@ const char *ShowSettings::debugModeText() const return text; } -const char *ShowSettings::lightingModeText() const +const char* ShowSettings::lightingModeText() const { - const char *text = ""; + const char* text = ""; switch (lightingMode) { case LightingModeDiffuse: @@ -430,7 +434,7 @@ void ShowSettings::advanceDebugMode(bool decrement) bool isNormal = texContentType == TexContentTypeNormal; bool isSDF = texContentType == TexContentTypeSDF; - + // for normals show directions if (debugMode == DebugModePosX && !(isNormal || isSDF)) { advanceDebugMode(decrement); @@ -456,27 +460,26 @@ void ShowSettings::updateUVPreviewState() if (uvPreview < 1.0) uvPreview += uvPreviewStep; } - else - { + else { if (uvPreview > 0.0) uvPreview -= uvPreviewStep; } - uvPreview = saturate(uvPreview); + uvPreview = std::clamp(uvPreview, 0.0f, 1.0f); } } else { // This hides the uvView even when switchig back to 3d shape //uvPreview = 0.0; } - + // stop the frame update if (uvPreview == 0.0f || uvPreview == 1.0f) { uvPreviewFrames = 0; } } -void printChannels(string &tmp, const string &label, float4 c, +void printChannels(string& tmp, const string& label, float4 c, int32_t numChannels, 
bool isFloat, bool isSigned) { if (isFloat || isSigned) { @@ -519,7 +522,7 @@ void printChannels(string &tmp, const string &label, float4 c, } string ShowSettings::windowTitleString(const char* filename) const -{ +{ // set title to filename, chop this to just file+ext, not directory const char* filenameShort = strrchr(filename, '/'); if (filenameShort == nullptr) { @@ -528,9 +531,9 @@ string ShowSettings::windowTitleString(const char* filename) const else { filenameShort += 1; } - + string title = "kramv - "; - + if (isModel) { title += formatTypeName(originalFormat); title += " - "; @@ -540,141 +543,43 @@ string ShowSettings::windowTitleString(const char* filename) const // was using subtitle, but that's macOS 11.0 feature. title += formatTypeName(originalFormat); title += " - "; - + // identify what we think the content type is const char* typeText = ""; - switch(texContentType) { - case TexContentTypeAlbedo: typeText = "a"; break; - case TexContentTypeNormal: typeText = "n"; break; - case TexContentTypeAO: typeText = "ao"; break; - case TexContentTypeMetallicRoughness: typeText = "mr"; break; - case TexContentTypeSDF: typeText = "sdf"; break; - case TexContentTypeHeight: typeText = "h"; break; - case TexContentTypeUnknown: typeText = ""; break; + switch (texContentType) { + case TexContentTypeAlbedo: + typeText = "a"; + break; + case TexContentTypeNormal: + typeText = "n"; + break; + case TexContentTypeAO: + typeText = "ao"; + break; + case TexContentTypeMetallicRoughness: + typeText = "mr"; + break; + case TexContentTypeSDF: + typeText = "sdf"; + break; + case TexContentTypeHeight: + typeText = "h"; + break; + case TexContentTypeUnknown: + typeText = ""; + break; } title += typeText; // add some info about the texture to avoid needing to go to info // srgb src would be useful too. if (texContentType == TexContentTypeAlbedo && isPremul) { title += ",p"; - } title += " - "; title += filenameShort; } - - return title; -} -float4x4 matrix4x4_translation(float tx, float ty, float tz) -{ - float4x4 m = {(float4){1, 0, 0, 0}, - (float4){0, 1, 0, 0}, - (float4){0, 0, 1, 0}, - (float4){tx, ty, tz, 1}}; - return m; -} - -float4x4 matrix4x4_rotation(float radians, vector_float3 axis) -{ - axis = vector_normalize(axis); - float ct = cosf(radians); - float st = sinf(radians); - float ci = 1 - ct; - float x = axis.x, y = axis.y, z = axis.z; - - float4x4 m = { - (float4){ ct + x * x * ci, y * x * ci + z * st, z * x * ci - y * st, 0}, - (float4){ x * y * ci - z * st, ct + y * y * ci, z * y * ci + x * st, 0}, - (float4){ x * z * ci + y * st, y * z * ci - x * st, ct + z * z * ci, 0}, - (float4){ 0, 0, 0, 1} - }; - return m; -} - -float4x4 perspective_rhs(float fovyRadians, float aspectXtoY, float nearZ, float farZ, bool isReverseZ) -{ - // form tangents - float tanY = tanf(fovyRadians * 0.5f); - float tanX = tanY * aspectXtoY; - - // currently symmetric - // all postive values from center - float4 tangents = { tanY, tanY, tanX, tanX }; - tangents *= nearZ; - - float t = tangents.x; - float b = -tangents.y; - float r = tangents.z; - float l = -tangents.w; - - float dx = (r - l); - float dy = (t - b); - - float xs = 2.0f * nearZ / dx; - float ys = 2.0f * nearZ / dy; - - // 0.5x? 
- float xoff = (r + l) / dx; - float yoff = (t + b) / dy; - - float m22; - float m23; - - if (isReverseZ) { - // zs drops out since zs = inf / -inf = 1, 1-1 = 0 - // z' = near / -z - - m22 = 0; - m23 = nearZ; - } - else { - float zs = farZ / (nearZ - farZ); - - m22 = zs; - m23 = zs * nearZ; - } - - float4x4 m = { - (float4){ xs, 0, 0, 0 }, - (float4){ 0, ys, 0, 0 }, - (float4){ xoff, yoff, m22, -1 }, - (float4){ 0, 0, m23, 0 } - }; - - return m; -} - -float4x4 orthographic_rhs(float width, float height, float nearZ, float farZ, - bool isReverseZ) -{ - // float aspectRatio = width / height; - float xs = 2.0f / width; - float ys = 2.0f / height; - - float xoff = 0.0f; // -0.5f * width; - float yoff = 0.0f; // -0.5f * height; - - float dz = -(farZ - nearZ); - float zs = 1.0f / dz; - - float m22 = zs; - float m23 = zs * nearZ; - - // revZ, can't use infiniteZ with ortho view - if (isReverseZ) { - m22 = -m22; - m23 = 1.0f - m23; - } - - float4x4 m = { - (float4){xs, 0, 0, 0}, - (float4){0, ys, 0, 0}, - (float4){0, 0, m22, 0}, - (float4){xoff, yoff, m23, 1} - }; - return m; - + return title; } //-------------------------------- @@ -683,8 +588,16 @@ float4x4 orthographic_rhs(float width, float height, float nearZ, float farZ, Data::Data() { +#if USE_SIMDLIB && KRAM_DEBUG && 0 + vecf vfmt; + + // want to see the simd config + KLOGI("SIMDK", "%s", vfmt.simd_configs().c_str()); + KLOGI("SIMDK", "%s", vfmt.simd_alignments().c_str()); +#endif + _showSettings = new ShowSettings(); - + _textSlots.resize(kTextSlotCount); } Data::~Data() @@ -692,7 +605,8 @@ Data::~Data() delete _showSettings; } -void Data::clearAtlas() { +void Data::clearAtlas() +{ _showSettings->atlas.clear(); _showSettings->lastAtlas = nullptr; } @@ -702,50 +616,50 @@ void Data::clearAtlas() { bool Data::loadAtlasFile(const char* filename) { using namespace simdjson; - + clearAtlas(); - + Timer timer; - + // can just mmap the json MmapHelper mmap; if (!mmap.open(filename)) { KLOGE("kramv", "Failed to open %s", filename); return false; } - + ondemand::parser parser; - + padded_string json((const char*)mmap.data(), mmap.dataLength()); auto atlasProps = parser.iterate(json); - + // can we get at memory use numbers to do the parse? KLOGI("kramv", "parsed %.0f KB of json in %.3fms", (double)mmap.dataLength() / 1024.0, timer.timeElapsedMillis()); - + // Can use hover or a show all on these entries and names. // Draw names on screen using system text in the upper left corner if 1 // if showing all, then show names across each mip level. May want to // snap to pixels on each mip level so can see overlap. 
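The removed perspective_rhs above keeps an infinite-far reverse-Z branch (m22 = 0, m23 = nearZ, so z' = near / -z). A quick standalone check, assuming the same right-handed view space looking down -Z, confirms depth lands at 1.0 on the near plane and falls toward 0 at infinity:

#include <cstdio>

int main()
{
    const float nearZ = 0.1f;
    const float viewZs[] = {-0.1f, -1.0f, -100.0f, -1000000.0f};

    for (float viewZ : viewZs) {
        float clipZ = 0.0f * viewZ + nearZ; // m22 * z + m23 * w, with w = 1
        float clipW = -viewZ;               // the -1 in the third column
        printf("viewZ %g -> depth %g\n", viewZ, clipZ / clipW);
    }
    return 0;
}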
- + { std::vector values; //string_view atlasName = atlasProps["name"].get_string().value_unsafe(); - + uint64_t width = atlasProps["width"].get_uint64().value_unsafe(); uint64_t height = atlasProps["height"].get_uint64().value_unsafe(); - + uint64_t slice = atlasProps["slice"].get_uint64().value_unsafe(); - + float uPad = 0.0f; float vPad = 0.0f; - + if (atlasProps["paduv"].get_array().error() != NO_SUCH_FIELD) { values.clear(); for (auto value : atlasProps["paduv"]) values.push_back(value.get_double().value_unsafe()); - + uPad = values[0]; vPad = values[1]; } @@ -753,29 +667,27 @@ bool Data::loadAtlasFile(const char* filename) values.clear(); for (auto value : atlasProps["padpx"]) values.push_back(value.get_double().value_unsafe()); - + uPad = values[0]; vPad = values[1]; - + uPad /= width; vPad /= height; } - - for (auto regionProps: atlasProps["regions"]) - { + + for (auto regionProps : atlasProps["regions"]) { string_view name = regionProps["name"].get_string().value_unsafe(); - + float x = 0.0f; float y = 0.0f; float w = 0.0f; float h = 0.0f; - - if (regionProps["ruv"].get_array().error() != NO_SUCH_FIELD) - { + + if (regionProps["ruv"].get_array().error() != NO_SUCH_FIELD) { values.clear(); for (auto value : regionProps["ruv"]) values.push_back(value.get_double().value_unsafe()); - + // Note: could convert pixel and mip0 size to uv. // normalized uv make these easier to draw across all mips x = values[0]; @@ -783,37 +695,36 @@ bool Data::loadAtlasFile(const char* filename) w = values[2]; h = values[3]; } - else if (regionProps["rpx"].get_array().error() != NO_SUCH_FIELD) - { + else if (regionProps["rpx"].get_array().error() != NO_SUCH_FIELD) { values.clear(); for (auto value : regionProps["rpx"]) values.push_back(value.get_double().value_unsafe()); - + x = values[0]; y = values[1]; w = values[2]; h = values[3]; - + // normalize to uv using the width/height x /= width; y /= height; w /= width; h /= height; } - + const char* verticalProp = "f"; // regionProps["rot"]; bool isVertical = verticalProp && verticalProp[0] == 't'; - - Atlas atlas = {(string)name, x,y, w,h, uPad,vPad, isVertical, (uint32_t)slice}; + + Atlas atlas = {(string)name, x, y, w, h, uPad, vPad, isVertical, (uint32_t)slice}; _showSettings->atlas.emplace_back(std::move(atlas)); } } - + // TODO: also need to be able to bring in vector shapes // maybe from svg or files written out from figma or photoshop. // Can triangulate those, and use xatlas to pack those. // Also xatlas can flatten out a 3d model into a chart. - + return true; } @@ -822,16 +733,16 @@ bool Data::loadAtlasFile(const char* filename) bool Data::loadAtlasFile(const char* filename) { using namespace json11; - + clearAtlas(); - + // can just mmap the json MmapHelper mmap; if (!mmap.open(filename)) { KLOGE("kramv", "Failed to open %s", filename); return false; } - + Timer timer; JsonReader jsonReader; const Json* root = jsonReader.read((const char*)mmap.data(), mmap.dataLength()); @@ -841,37 +752,36 @@ bool Data::loadAtlasFile(const char* filename) return false; } timer.stop(); - + KLOGI("kramv", "parsed %.0f KB of json using %.0f KB of memory in %.3fms", (double)mmap.dataLength() / 1024.0, (double)jsonReader.memoryUse() / 1024.0, timer.timeElapsedMillis()); - + const Json& atlasProps = (*root)[(uint32_t)0]; - + // Can use hover or a show all on these entries and names. // Draw names on screen using system text in the upper left corner if 1 // if showing all, then show names across each mip level. 
May want to // snap to pixels on each mip level so can see overlap. - - + { std::vector values; // string_view atlasName = atlasProps["name"].get_string().value_unsafe(); - + int width = atlasProps["width"].int_value(); int height = atlasProps["height"].int_value(); - + int slice = atlasProps["slice"].int_value(); - + float uPad = 0.0f; float vPad = 0.0f; - + if (atlasProps["paduv"].is_array()) { values.clear(); for (const auto& value : atlasProps["paduv"]) values.push_back(value.number_value()); - + uPad = values[0]; vPad = values[1]; } @@ -879,30 +789,28 @@ bool Data::loadAtlasFile(const char* filename) values.clear(); for (const auto& value : atlasProps["padpx"]) values.push_back(value.number_value()); - + uPad = values[0]; vPad = values[1]; - + uPad /= width; vPad /= height; } - + string decodedName; - for (auto regionProps: atlasProps["regions"]) - { + for (auto regionProps : atlasProps["regions"]) { const char* name = regionProps["name"].string_value(decodedName); - + float x = 0.0f; float y = 0.0f; float w = 0.0f; float h = 0.0f; - - if (regionProps["ruv"].is_array()) - { + + if (regionProps["ruv"].is_array()) { values.clear(); for (auto value : regionProps["ruv"]) values.push_back(value.number_value()); - + // Note: could convert pixel and mip0 size to uv. // normalized uv make these easier to draw across all mips x = values[0]; @@ -910,60 +818,59 @@ bool Data::loadAtlasFile(const char* filename) w = values[2]; h = values[3]; } - else if (regionProps["rpx"].is_array()) - { + else if (regionProps["rpx"].is_array()) { values.clear(); for (auto value : regionProps["rpx"]) values.push_back(value.number_value()); - + x = values[0]; y = values[1]; w = values[2]; h = values[3]; - + // normalize to uv using the width/height x /= width; y /= height; w /= width; h /= height; } - + const char* verticalProp = "f"; // regionProps["rot"]; bool isVertical = verticalProp && verticalProp[0] == 't'; - - Atlas atlas = {name, x,y, w,h, uPad,vPad, isVertical, (uint32_t)slice}; + + Atlas atlas = {name, x, y, w, h, uPad, vPad, isVertical, (uint32_t)slice}; _showSettings->atlas.emplace_back(std::move(atlas)); } } - + // TODO: also need to be able to bring in vector shapes // maybe from svg or files written out from figma or photoshop. // Can triangulate those, and use xatlas to pack those. // Also xatlas can flatten out a 3d model into a chart. - + return true; } #endif // opens archive -bool Data::openArchive(const char * zipFilename, int32_t urlIndex) +bool Data::openArchive(const char* zipFilename, int32_t urlIndex) { // grow the array, ptrs so that existing mmaps aren't destroyed if (urlIndex >= _containers.size()) { _containers.resize(urlIndex + 1, nullptr); } - + if (_containers[urlIndex] == nullptr) _containers[urlIndex] = new FileContainer; - + FileContainer& container = *_containers[urlIndex]; MmapHelper& zipMmap = container.zipMmap; ZipHelper& zip = container.zip; - + // close any previous zip zipMmap.close(); - + // open the mmap again if (!zipMmap.open(zipFilename)) { return false; @@ -979,115 +886,119 @@ bool Data::listFilesInArchive(int32_t urlIndex) { FileContainer& container = *_containers[urlIndex]; ZipHelper& zip = container.zip; - + // filter out unsupported extensions vector extensions = { - ".ktx", ".ktx2", ".png", ".dds" // textures + ".ktx", ".ktx2", ".png", // textures + ".dds", ".DDS" // allow caps for dds #if USE_GLTF - // TODO: can't support these until have a loader from memory block - // GLTFAsset requires a URL. 
- //, ".glb", ".gltf" // models + // TODO: can't support these until have a loader from memory block + // GLTFAsset requires a URL. + //, ".glb", ".gltf" // models #endif #if USE_USD - , ".usd", ".usda", ".usb" + , + ".usd", ".usda", ".usb" #endif }; - + container.zip.filterExtensions(extensions); - + // don't switch to empty archive if (zip.zipEntrys().empty()) { return false; } - - for (const auto& entry: zip.zipEntrys()) { + + for (const auto& entry : zip.zipEntrys()) { _files.emplace_back(File(entry.filename, urlIndex)); } - + return true; } // TODO: can simplify by storing counterpart id when file list is created -bool Data::hasCounterpart(bool increment) { +bool Data::hasCounterpart(bool increment) +{ if (_files.size() <= 1) { return false; } - + const File& file = _files[_fileIndex]; string currentFilename = filenameNoExtension(file.nameShort.c_str()); - + uint32_t nextFileIndex = _fileIndex; - + size_t numEntries = _files.size(); if (increment) nextFileIndex++; else - nextFileIndex += numEntries - 1; // back 1 - + nextFileIndex += numEntries - 1; // back 1 + nextFileIndex = nextFileIndex % numEntries; - + const File& nextFile = _files[nextFileIndex]; string nextFilename = filenameNoExtension(nextFile.nameShort.c_str()); - + // if short name matches (no ext) then it's a counterpart if (currentFilename != nextFilename) - return false; - + return false; + return true; } -bool Data::advanceCounterpart(bool increment) { - +bool Data::advanceCounterpart(bool increment) +{ if (_files.size() <= 1) { return false; } - + // see if file has counterparts const File& file = _files[_fileIndex]; string currentFilename = filenameNoExtension(file.nameShort.c_str()); - + // TODO: this should cycle through only the counterparts uint32_t nextFileIndex = _fileIndex; - + size_t numEntries = _files.size(); if (increment) nextFileIndex++; else - nextFileIndex += numEntries - 1; // back 1 + nextFileIndex += numEntries - 1; // back 1 nextFileIndex = nextFileIndex % numEntries; - + const File& nextFile = _files[nextFileIndex]; string nextFilename = filenameNoExtension(nextFile.nameShort.c_str()); - + if (currentFilename != nextFilename) return false; - + _fileIndex = nextFileIndex; - + return _delegate.loadFile(true); } -bool Data::advanceFile(bool increment) { +bool Data::advanceFile(bool increment) +{ if (_files.empty()) { return false; } - + size_t numEntries = _files.size(); if (increment) _fileIndex++; else - _fileIndex += numEntries - 1; // back 1 + _fileIndex += numEntries - 1; // back 1 _fileIndex = _fileIndex % numEntries; - + return _delegate.loadFile(true); } bool Data::findFilename(const string& filename) { bool isFound = false; - + // linear search for (const auto& search : _files) { if (search.name == filename) { @@ -1101,7 +1012,7 @@ bool Data::findFilename(const string& filename) bool Data::findFilenameShort(const string& filename) { bool isFound = false; - + // linear search for (const auto& search : _files) { if (search.nameShort == filename) { @@ -1133,20 +1044,20 @@ const Atlas* Data::findAtlasAtUV(float2 pt) { if (_showSettings->atlas.empty()) return nullptr; if (_showSettings->imageBoundsX == 0) return nullptr; - + const Atlas* atlas = nullptr; - + // Note: rects are in uv - + // This might need to become an atlas array index instead of ptr const Atlas* lastAtlas = _showSettings->lastAtlas; - + if (lastAtlas) { if (isPtInRect(pt, lastAtlas->rect())) { atlas = lastAtlas; } } - + if (!atlas) { // linear search for (const auto& search : _showSettings->atlas) { @@ -1155,102 +1066,105 @@ 
const Atlas* Data::findAtlasAtUV(float2 pt) break; } } - + _showSettings->lastAtlas = atlas; } - + return atlas; } - bool Data::isArchive() const { //NSArray* urls_ = (NSArray*)_delegate._urls; //NSURL* url = urls_[_files[_fileIndex].urlIndex]; //const char* filename = url.fileSystemRepresentation; - + string filename = _urls[_files[_fileIndex].urlIndex]; return isSupportedArchiveFilename(filename.c_str()); } - +void Data::setPerfDirectory(const char* directory) +{ + Perf* perf = Perf::instance(); + perf->setPerfDirectory(directory); +} bool Data::loadFile() { if (isArchive()) { return loadFileFromArchive(); } - + // now lookup the filename and data at that entry const File& file = _files[_fileIndex]; const char* filename = file.name.c_str(); - + string fullFilename = filename; auto timestamp = FileHelper::modificationTimestamp(filename); - + bool isTextureChanged = _showSettings->isFileChanged(filename, timestamp); if (!isTextureChanged) { return true; } - + #if USE_GLTF || USE_USD bool isModel = isSupportedModelFilename(filename); if (isModel) { bool success = _delegate.loadModelFile(filename); - + if (success) { // store the filename _showSettings->lastFilename = filename; _showSettings->lastTimestamp = timestamp; } - + return success; } #endif - + // have already filtered filenames out, so this should never get hit if (!isSupportedFilename(filename)) { return false; } - + // Note: better to extract from filename instead of root of folder dropped // or just keep displaying full path of filename. - + _archiveName.clear(); - + vector possibleNormalFilenames; string normalFilename; bool hasNormal = false; - + TexContentType texContentType = findContentTypeFromFilename(filename); if (texContentType == TexContentTypeAlbedo) { findPossibleNormalMapFromAlbedoFilename(filename, possibleNormalFilenames); - - for (const auto& name: possibleNormalFilenames) { + + for (const auto& name : possibleNormalFilenames) { hasNormal = findFilename(name); - + if (hasNormal) { normalFilename = name; break; } } } - + // see if there is an atlas file too, and load the rectangles for preview // note sidecar atlas files are a pain to view with a sandbox, may want to // splice into ktx/ktx2 files, but no good metadata for png/dds. _showSettings->atlas.clear(); - + string atlasFilename = filenameNoExtension(filename); bool hasAtlas = false; - - // replace -a, -d, with -atlas.json + + // replace -a, -d, with -atlas.jsonc const char* dashPosStr = strrchr(atlasFilename.c_str(), '-'); if (dashPosStr != nullptr) { atlasFilename = atlasFilename.substr(0, dashPosStr - atlasFilename.c_str()); } atlasFilename += "-atlas.json"; - if ( findFilename(atlasFilename.c_str())) { + if (findFilename(atlasFilename.c_str())) { if (loadAtlasFile(atlasFilename.c_str())) { hasAtlas = true; } @@ -1259,20 +1173,20 @@ bool Data::loadFile() clearAtlas(); atlasFilename.clear(); } - + // If it's a compressed file, then set a diff target if a corresponding png // is found. Eventually see if a src dds/ktx/ktx2 exists. Want to stop // using png as source images. Note png don't have custom mips, unless // flattened to one image. So have to fabricate mips here. KTXImage // can already load up striped png into slices, etc. 
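The sidecar-atlas lookup above chops the "-a"/"-d" style suffix plus the extension and appends "-atlas.json". A condensed restatement of that derivation, as an illustrative sketch rather than the kram source (the real code also clears the name again when the file isn't found):

#include <cstdio>
#include <string>

std::string atlasNameFor(const char* filename)
{
    std::string name = filename;

    // drop the extension, if any
    size_t dot = name.rfind('.');
    if (dot != std::string::npos)
        name.resize(dot);

    // drop a trailing "-a", "-d", ... style suffix
    size_t dash = name.rfind('-');
    if (dash != std::string::npos)
        name.resize(dash);

    return name + "-atlas.json";
}

int main()
{
    // prints "textures/brick-atlas.json"
    printf("%s\n", atlasNameFor("textures/brick-a.ktx2").c_str());
    return 0;
}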
- + bool hasDiff = false; string diffFilename; - + if (!isPNGFilename(filename)) { diffFilename = filenameNoExtension(filename); diffFilename += ".png"; - + diffFilename = toFilenameShort(diffFilename.c_str()); if (diffFilename != filename) { const File* diffFile = findFileShort(diffFilename.c_str()); @@ -1281,44 +1195,41 @@ bool Data::loadFile() hasDiff = true; } } - + if (!hasDiff) diffFilename.clear(); } - + //------------------------------- - + KTXImage image; KTXImageData imageDataKTX; - + KTXImage imageNormal; KTXImageData imageNormalDataKTX; - + KTXImage imageDiff; KTXImageData imageDiffDataKTX; - + // this requires decode and conversion to RGBA8u if (!imageDataKTX.open(fullFilename.c_str(), image)) { return false; } - + // load up the diff, but would prefer to defer this if (hasDiff && !imageDiffDataKTX.open(diffFilename.c_str(), imageDiff)) { hasDiff = false; - + // TODO: could also compare dimensions to see if same - + if (imageDiff.textureType == image.textureType && - (imageDiff.textureType == MyMTLTextureType2D) ) - { - + (imageDiff.textureType == MyMTLTextureType2D)) { } - else - { + else { hasDiff = false; } } - + if (hasNormal && imageNormalDataKTX.open(normalFilename.c_str(), imageNormal)) { // shaders only pull from albedo + normal on these texture types @@ -1331,22 +1242,21 @@ bool Data::loadFile() hasNormal = false; } } - + //--------------------------------- - + if (!_delegate.loadTextureFromImage(fullFilename.c_str(), (double)timestamp, - image, - hasNormal ? &imageNormal : nullptr, - hasDiff ? &imageDiff : nullptr, - false)) - { + image, + hasNormal ? &imageNormal : nullptr, + hasDiff ? &imageDiff : nullptr, + false)) { return false; } - + // store the filename _showSettings->lastFilename = filename; _showSettings->lastTimestamp = timestamp; - + return true; } @@ -1356,7 +1266,7 @@ bool Data::loadFileFromArchive() const File& file = _files[_fileIndex]; FileContainer& container = *_containers[file.urlIndex]; ZipHelper& zip = container.zip; - + const char* filename = file.name.c_str(); const auto* entry = zip.zipEntry(filename); string fullFilename = entry->filename; @@ -1366,18 +1276,28 @@ bool Data::loadFileFromArchive() if (!isTextureChanged) { return true; } - -// TODO: don't have a version which loads gltf model from memory block -// bool isModel = isSupportedModelFilename(filename); -// if (isModel) -// return [self loadModelFile:filename]; - + + // TODO: don't have a version which loads gltf model from memory block + // bool isModel = isSupportedModelFilename(filename); + // if (isModel) + // return [self loadModelFile:filename]; + //-------- - + if (!isSupportedFilename(filename)) { return false; } + + // TODO: right now -atlas.json even if already loaded loose + // won't apply to archive textures. Would the -atlas.json file + // need to be in the same archive? + bool hasAtlas = false; + if (!hasAtlas) { + clearAtlas(); + } + KPERFT("loadFileFromArchive"); + const uint8_t* imageData = nullptr; uint64_t imageDataLength = 0; @@ -1386,58 +1306,65 @@ bool Data::loadFileFromArchive() // zip that compressed png files. So then the raw ptr/size // needs deflated. 
bool isFileUncompressed = entry->compressedSize == entry->uncompressedSize; - + vector bufferForImage; - + if (isFileUncompressed) { + KPERFT("ZipExtractRaw"); + // search for main file - can be albedo or normal if (!zip.extractRaw(filename, &imageData, imageDataLength)) { return false; } - } else { + KPERFT("ZipExtract"); + // need to decompress first if (!zip.extract(filename, bufferForImage)) { return false; } - + imageData = bufferForImage.data(); imageDataLength = bufferForImage.size(); } - + vector bufferForNormal; - + const uint8_t* imageNormalData = nullptr; uint64_t imageNormalDataLength = 0; - + string normalFilename; bool hasNormal = false; vector normalFilenames; - + TexContentType texContentType = findContentTypeFromFilename(filename); if (texContentType == TexContentTypeAlbedo) { findPossibleNormalMapFromAlbedoFilename(filename, normalFilenames); - - for (const auto& name: normalFilenames) { + + for (const auto& name : normalFilenames) { const auto* normalEntry = zip.zipEntry(name.c_str()); - + hasNormal = normalEntry != nullptr; if (hasNormal) { normalFilename = name; - - bool isNormalUncompressed = normalEntry->compressedSize == entry->uncompressedSize; - + + bool isNormalUncompressed = normalEntry->compressedSize == normalEntry->uncompressedSize; + if (isNormalUncompressed) { + KPERFT("ZipExtractRawNormal"); + zip.extractRaw(name.c_str(), &imageNormalData, imageNormalDataLength); } else { + KPERFT("ZipExtractNormal"); + // need to decompress first if (!zip.extract(filename, bufferForNormal)) { return false; } - + imageNormalData = bufferForNormal.data(); imageNormalDataLength = bufferForNormal.size(); } @@ -1457,131 +1384,134 @@ bool Data::loadFileFromArchive() KTXImageData imageNormalDataKTX; // TODO: do imageDiff here? - + + KPERFT_START(1, "KTXOpen"); + if (!imageDataKTX.open(imageData, imageDataLength, image)) { return false; } - if (hasNormal && imageNormalDataKTX.open( - imageNormalData, imageNormalDataLength, imageNormal)) { - // shaders only pull from albedo + normal on these texture types - if (imageNormal.textureType == image.textureType && - (imageNormal.textureType == MyMTLTextureType2D || - imageNormal.textureType == MyMTLTextureType2DArray)) { - // hasNormal = true; - } - else { - hasNormal = false; + KPERFT_STOP(1); + + if (hasNormal) { + KPERFT("KTXOpenNormal"); + + if (imageNormalDataKTX.open( + imageNormalData, imageNormalDataLength, imageNormal)) { + // shaders only pull from albedo + normal on these texture types + if (imageNormal.textureType == image.textureType && + (imageNormal.textureType == MyMTLTextureType2D || + imageNormal.textureType == MyMTLTextureType2DArray)) { + // hasNormal = true; + } + else { + hasNormal = false; + } } } //--------------------------------- - + + KPERFT_START(3, "KTXLoad"); + if (!_delegate.loadTextureFromImage(fullFilename.c_str(), (double)timestamp, image, hasNormal ? &imageNormal : nullptr, nullptr, true)) { return false; } + KPERFT_STOP(3); + //--------------------------------- - - // NSArray* urls_ = (NSArray*)_delegate._urls; + string archiveURL = _urls[file.urlIndex]; _archiveName = toFilenameShort(archiveURL.c_str()); - + return true; } - - - void Data::loadFilesFromUrls(vector& urls, bool skipSubdirs) { // Using a member for archives, so limited to one archive in a drop // but that's probably okay for now. Add a separate array of open // archives if want > 1. 
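The stored-vs-deflated branch above boils down to: a zip entry whose compressed and uncompressed sizes match was stored as-is and can be referenced in place from the archive mmap, while a deflated entry has to be expanded into a scratch buffer first. A sketch of that decision with hypothetical stand-in types (the real calls are ZipHelper::extractRaw and ZipHelper::extract shown above):

#include <cstdint>
#include <vector>

struct Entry { uint64_t compressedSize, uncompressedSize; };

void getImageBytes(const Entry& entry,
                   const uint8_t* rawEntryData, // points into the mmapped zip
                   std::vector<uint8_t>& scratch,
                   const uint8_t*& outData, uint64_t& outLength)
{
    if (entry.compressedSize == entry.uncompressedSize) {
        // stored: no copy, no allocation
        outData = rawEntryData;
        outLength = entry.uncompressedSize;
        return;
    }

    // deflated: inflate into scratch first (placeholder for the real inflate)
    scratch.resize(entry.uncompressedSize);
    outData = scratch.data();
    outLength = scratch.size();
}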
- + // copy the existing files list string existingFilename; if (_fileIndex < (int32_t)_files.size()) existingFilename = _files[_fileIndex].name; - + // Fill this out again _files.clear(); - + // clear pointers - for (FileContainer* container: _containers) + for (FileContainer* container : _containers) delete container; _containers.clear(); - + // this will flatten the list int32_t urlIndex = 0; - + vector urlsExtracted; - - for (const auto& url: urls) { + + for (const auto& url : urls) { // These will flatten out to a list of files const char* filename = url.c_str(); - + if (isSupportedArchiveFilename(filename) && openArchive(filename, urlIndex) && - listFilesInArchive(urlIndex)) - { + listFilesInArchive(urlIndex)) { urlsExtracted.push_back(filename); urlIndex++; } else if (isDirectory(filename)) { - // this first loads only models, then textures if only those listFilesInFolder(url, urlIndex, skipSubdirs); - + // could skip if nothing added urlsExtracted.push_back(url); urlIndex++; - + // handle archives within folder vector archiveFiles; listArchivesInFolder(url, archiveFiles, skipSubdirs); - - for (const File& archiveFile: archiveFiles) { + + for (const File& archiveFile : archiveFiles) { const char* archiveFilename = archiveFile.name.c_str(); if (openArchive(archiveFilename, urlIndex) && listFilesInArchive(urlIndex)) { - //NSURL* urlArchive = [NSURL fileURLWithPath:[NSString stringWithUTF8String:archiveFilename]]; //[urlsExtracted addObject:urlArchive]; urlsExtracted.push_back(archiveFilename); urlIndex++; } - } } else if (isSupportedFilename(filename) #if USE_GLTF || isSupportedModelFilename(filename) #endif - ) { + ) { _files.emplace_back(File(filename, urlIndex)); - + //[urlsExtracted addObject:url]; urlsExtracted.push_back(filename); urlIndex++; } else if (isSupportedJsonFilename(filename)) { _files.emplace_back(File(filename, urlIndex)); - + //[urlsExtracted addObject:url]; urlsExtracted.push_back(filename); urlIndex++; } - } - + // sort them by short filename #if USE_EASTL - NAMESPACE_STL::quick_sort(_files.begin(), _files.end()); + STL_NAMESPACE::quick_sort(_files.begin(), _files.end()); #else - std::sort(_files.begin(), _files.end()); + STL_NAMESPACE::sort(_files.begin(), _files.end()); #endif - + // preserve filename before load, and restore that index, by finding // that name in refreshed folder list _fileIndex = 0; @@ -1593,7 +1523,7 @@ void Data::loadFilesFromUrls(vector& urls, bool skipSubdirs) } } } - + // preserve old file selection _urls = urlsExtracted; } @@ -1606,7 +1536,7 @@ void Data::showEyedropperData(const float2& uv) float4 c = _showSettings->textureResult; int32_t x = _showSettings->textureResultX; int32_t y = _showSettings->textureResultY; - + // DONE: use these to format the text MyMTLPixelFormat format = _showSettings->originalFormat; bool isSrgb = isSrgbFormat(format); @@ -1630,18 +1560,18 @@ void Data::showEyedropperData(const float2& uv) // interpret based on shapeChannel, debugMode, etc switch (_showSettings->shapeChannel) { case ShapeChannelDepth: - isSigned = false; // using fract on uv + isSigned = false; // using fract on uv isValue = true; isFloat = true; numChannels = 1; break; case ShapeChannelUV0: - isSigned = false; // using fract on uv + isSigned = false; // using fract on uv isValue = true; isFloat = true; - numChannels = 2; // TODO: fix for 3d uvw + numChannels = 2; // TODO: fix for 3d uvw break; case ShapeChannelFaceNormal: @@ -1670,7 +1600,7 @@ void Data::showEyedropperData(const float2& uv) } // TODO: indicate px, mip, etc (f.e. 
showAll) - + // debug mode // preview vs. not @@ -1679,7 +1609,6 @@ void Data::showEyedropperData(const float2& uv) // this will be out of sync with gpu eval, so may want to only display px // from returned lookup this will always be a linear color - // show uv, so can relate to gpu coordinates stored in geometry and find // atlas areas append_sprintf(text, "uv:%0.3f %0.3f\n", @@ -1820,12 +1749,12 @@ void Data::showEyedropperData(const float2& uv) // TODO: Stuff these on clipboard with a click, or use cmd+C? } -void Data::setEyedropperText(const char * text) +void Data::setEyedropperText(const char* text) { setTextSlot(kTextSlotEyedropper, text); } -void Data::setAtlasText(const char * text) +void Data::setAtlasText(const char* text) { setTextSlot(kTextSlotAtlas, text); } @@ -1836,18 +1765,16 @@ string Data::textFromSlots(bool isFileListHidden) const string text = _textSlots[kTextSlotHud]; if (!text.empty() && text.back() != '\n') text += "\n"; - + // don't show eyedropper text with table up, it's many lines and overlaps - if (!isFileListHidden) - { + if (!isFileListHidden) { text += _textSlots[kTextSlotEyedropper]; if (!text.empty() && text.back() != '\n') text += "\n"; - + text += _textSlots[kTextSlotAtlas]; } - - + return text; } @@ -1871,19 +1798,19 @@ void Data::updateUIAfterLoad() bool isMipHidden = _showSettings->mipCount <= 1; bool isJumpToNextHidden = _files.size() <= 1; - + bool isJumpToCounterpartHidden = true; bool isJumpToPrevCounterpartHidden = true; - - if ( _files.size() > 1) { + + if (_files.size() > 1) { isJumpToCounterpartHidden = !hasCounterpart(true); - isJumpToPrevCounterpartHidden = !hasCounterpart(false); + isJumpToPrevCounterpartHidden = !hasCounterpart(false); } - + bool isRedHidden = _showSettings->numChannels == 0; // models don't show rgba bool isGreenHidden = _showSettings->numChannels <= 1; bool isBlueHidden = _showSettings->numChannels <= 2 && - _showSettings->texContentType != TexContentTypeNormal; // reconstruct z = b on normals + _showSettings->texContentType != TexContentTypeNormal; // reconstruct z = b on normals // TODO: also need a hasAlpha for pixels, since many compressed formats like // ASTC always have 4 channels but internally store R,RG01,... etc. 
Can get @@ -1901,7 +1828,7 @@ void Data::updateUIAfterLoad() bool isSignedHidden = !isSignedFormat(_showSettings->originalFormat); bool isPlayHidden = !_showSettings->isModel; // only for models - + bool isDiffHidden = false; // only for images if (!_showSettings->isModel && _showSettings->hasDiffTexture) { isDiffHidden = false; @@ -1911,28 +1838,28 @@ void Data::updateUIAfterLoad() _actionFace->setHidden(isFaceSliceHidden); _actionMip->setHidden(isMipHidden); _actionShowAll->setHidden(isShowAllHidden); - + _actionDiff->setHidden(isDiffHidden); _actionItem->setHidden(isJumpToNextHidden); _actionPrevItem->setHidden(isJumpToNextHidden); - + _actionCounterpart->setHidden(isJumpToCounterpartHidden); _actionPrevCounterpart->setHidden(isJumpToPrevCounterpartHidden); - + _actionR->setHidden(isRedHidden); _actionG->setHidden(isGreenHidden); _actionB->setHidden(isBlueHidden); _actionA->setHidden(isAlphaHidden); - + _actionPremul->setHidden(isPremulHidden); _actionSigned->setHidden(isSignedHidden); _actionChecker->setHidden(isCheckerboardHidden); - + // only allow srgb to be disabled, not toggle on if off at load MyMTLPixelFormat format = _showSettings->originalFormat; bool isSrgb = isSrgbFormat(format); - _actionSrgb->setHidden(!isSrgb); - + _actionSrgb->setHidden(!isSrgb); + // also need to call after each toggle updateUIControlState(); } @@ -1942,7 +1869,7 @@ void Data::updateUIControlState() // there is also mixed state, but not using that auto On = true; auto Off = false; - + #define toState(x) (x) ? On : Off auto showAllState = toState(_showSettings->isShowingAllLevelsAndMips); @@ -1954,8 +1881,8 @@ void Data::updateUIControlState() auto wrapState = toState(_showSettings->isWrap); auto debugState = toState(_showSettings->debugMode != DebugModeNone); auto hudState = toState(_showSettings->isHudShown); - - TextureChannels &channels = _showSettings->channels; + + TextureChannels& channels = _showSettings->channels; auto redState = toState(channels == TextureChannels::ModeR001); auto greenState = toState(channels == TextureChannels::Mode0G01); @@ -1978,35 +1905,36 @@ void Data::updateUIControlState() auto verticalState = toState(_showSettings->isVerticalUI); auto uiState = toState(_showSettings->isHideUI); auto diffState = toState(_showSettings->isDiff && _showSettings->hasDiffTexture); - + auto srgbState = toState(_showSettings->isSRGBShown); - + auto perfState = toState(_showSettings->isPerf); + _actionVertical->setHighlight(verticalState); - + // TODO: pass boolean, and change in the call _actionPlay->setHighlight(playState); _actionHelp->setHighlight(Off); _actionInfo->setHighlight(Off); _actionHud->setHighlight(hudState); - + _actionArray->setHighlight(arrayState); _actionFace->setHighlight(faceState); _actionMip->setHighlight(mipState); - + // these never show check state _actionItem->setHighlight(Off); _actionPrevItem->setHighlight(Off); - + _actionCounterpart->setHighlight(Off); _actionPrevCounterpart->setHighlight(Off); - + _actionHideUI->setHighlight(uiState); // note below button always off, menu has state - + _actionR->setHighlight(redState); _actionG->setHighlight(greenState); _actionB->setHighlight(blueState); _actionA->setHighlight(alphaState); - + _actionShowAll->setHighlight(showAllState); _actionPreview->setHighlight(previewState); _actionDiff->setHighlight(diffState); @@ -2017,12 +1945,13 @@ void Data::updateUIControlState() _actionGrid->setHighlight(gridState); _actionDebug->setHighlight(debugState); _actionTangent->setHighlight(tangentState); - + 
_actionPremul->setHighlight(premulState); _actionSigned->setHighlight(signedState); _actionChecker->setHighlight(checkerboardState); - + _actionSrgb->setHighlight(srgbState); + _actionPerf->setHighlight(perfState); } // TODO: convert to C++ actions, and then call into Base holding all this @@ -2031,42 +1960,42 @@ void Data::updateUIControlState() const Action* Data::actionFromMenu(kram_id menuItem) const { const Action* action = nullptr; - - for (const auto& search: _actions) { + + for (const auto& search : _actions) { if (search.menuItem == menuItem) { action = &search; break; } } - + return action; } const Action* Data::actionFromButton(kram_id button) const { const Action* action = nullptr; - - for (const auto& search: _actions) { + + for (const auto& search : _actions) { if (search.button == button) { action = &search; break; } } - + return action; } const Action* Data::actionFromKey(uint32_t keyCode) const { const Action* action = nullptr; - - for (const auto& search: _actions) { + + for (const auto& search : _actions) { if (search.keyCode == keyCode) { action = &search; break; } } - + return action; } @@ -2095,6 +2024,33 @@ void Data::setLoadedText(string& text) } } +void Data::setFailedText(const string& filename, string& text) +{ + text = "Failed "; + + // This doesn't advance with failure + //string filename = _showSettings->lastFilename; + + text += toFilenameShort(filename.c_str()); + + // archives and file systems have folders, split that off + string folderName; + const char* slashPos = strrchr(filename.c_str(), '/'); + if (slashPos != nullptr) { + folderName = filename.substr(0, slashPos - filename.c_str()); + } + + if (!folderName.empty()) { + text += " in folder "; + text += folderName; + } + + if (!_archiveName.empty()) { + text += " from archive "; + text += _archiveName; + } +} + void Data::initActions() { // Don't reorder without also matching actionPtrs below @@ -2109,14 +2065,14 @@ void Data::initActions() Action("D", "Debug", Key::D), Action("G", "Grid", Key::G), Action("B", "Checkerboard", Key::B), - + Action("", "", Key::A), // sep Action("P", "Preview", Key::P), Action("W", "Wrap", Key::W), Action("8", "Premul", Key::Num8), Action("7", "Signed", Key::Num7), - + Action("", "", Key::A), // sep Action("A", "Show All", Key::A), @@ -2124,12 +2080,13 @@ void Data::initActions() Action("F", "Face", Key::F), Action("Y", "Array", Key::Y), Action("9", "Srgb", Key::Num9), - + Action("5", "Perf", Key::Num5), // really a debug action + Action("↑", "Prev Item", Key::UpArrow), Action("↓", "Next Item", Key::DownArrow), Action("←", "Prev Counterpart", Key::LeftArrow), Action("→", "Next Counterpart", Key::RightArrow), - + Action("R", "Reload", Key::R), Action("0", "Fit", Key::Num0), @@ -2150,7 +2107,7 @@ void Data::initActions() Action("3", "Blue", Key::Num3), Action("4", "Alpha", Key::Num4), }; - + // These have to be in same order as above. May want to go back to search for text above. 
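For reference, the message the new setFailedText assembles, recreated as a standalone sketch; the file and archive names here are made up:

#include <cstdio>
#include <string>

std::string failedText(const std::string& filename, const std::string& archiveName)
{
    std::string text = "Failed ";

    size_t slash = filename.rfind('/');
    text += (slash == std::string::npos) ? filename : filename.substr(slash + 1);

    if (slash != std::string::npos) {
        text += " in folder ";
        text += filename.substr(0, slash);
    }

    if (!archiveName.empty()) {
        text += " from archive ";
        text += archiveName;
    }
    return text;
}

int main()
{
    // prints "Failed wall-a.ktx2 in folder textures from archive assets.zip"
    printf("%s\n", failedText("textures/wall-a.ktx2", "assets.zip").c_str());
    return 0;
}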
Action** actionPtrs[] = { &_actionHelp, @@ -2158,38 +2115,39 @@ void Data::initActions() &_actionHud, &_actionHideUI, &_actionVertical, - + &_actionDiff, &_actionDebug, &_actionGrid, &_actionChecker, - + &_actionPreview, &_actionWrap, &_actionPremul, &_actionSigned, - + &_actionShowAll, &_actionMip, &_actionFace, &_actionArray, &_actionSrgb, - + &_actionPerf, + &_actionPrevItem, &_actionItem, &_actionPrevCounterpart, &_actionCounterpart, - + &_actionReload, &_actionFit, - + &_actionPlay, &_actionShapeUVPreview, &_actionShapeMesh, &_actionShapeChannel, &_actionLighting, &_actionTangent, - + &_actionR, &_actionG, &_actionB, @@ -2197,7 +2155,7 @@ void Data::initActions() }; uint32_t numActions = ArrayCount(actions); - + // copy all of them to a vector, and then assign the action ptrs for (int32_t i = 0; i < numActions; ++i) { Action& action = actions[i]; @@ -2209,10 +2167,10 @@ void Data::initActions() for (int32_t i = 0; i < _actions.size(); ++i) { // skip separators Action& action = _actions[i]; - const char* icon = action.icon; // single char + const char* icon = action.icon; // single char bool isSeparator = icon[0] == 0; if (isSeparator) continue; - + *(actionPtrs[counter++]) = &_actions[i]; } KASSERT(counter == ArrayCount(actionPtrs)); @@ -2249,7 +2207,7 @@ void Data::updateEyedropper() _showSettings->lastCursorY == _showSettings->cursorY) { return; } - + if (_showSettings->isEyedropperFromDrawable()) { _showSettings->lastCursorX = _showSettings->cursorX; _showSettings->lastCursorY = _showSettings->cursorY; @@ -2262,8 +2220,8 @@ void Data::updateEyedropper() // don't wait on renderer to update this matrix float4x4 projectionViewModelMatrix = computeImageTransform(_showSettings->panX, - _showSettings->panY, - _showSettings->zoom); + _showSettings->panY, + _showSettings->zoom); // convert to clip space, or else need to apply additional viewport transform float halfX = _showSettings->viewSizeX * 0.5f; @@ -2276,37 +2234,37 @@ void Data::updateEyedropper() halfY /= (float)_showSettings->viewContentScaleFactor; float4 cursor = float4m(_showSettings->cursorX, _showSettings->cursorY, 0.0f, 1.0f); - + float4x4 pixelToClipTfm = - { - (float4){ halfX, 0, 0, 0 }, - (float4){ 0, -halfY, 0, 0 }, - (float4){ 0, 0, 1, 0 }, - (float4){ halfX, halfY, 0, 1 }, - }; + { + (float4){halfX, 0, 0, 0}, + (float4){0, -halfY, 0, 0}, + (float4){0, 0, 1, 0}, + (float4){halfX, halfY, 0, 1}, + }; pixelToClipTfm = inverse(pixelToClipTfm); - + cursor = pixelToClipTfm * cursor; - + //float4 clipPoint; //clipPoint.x = (point.x - halfX) / halfX; //clipPoint.y = -(point.y - halfY) / halfY; // convert point in window to point in texture float4x4 mInv = inverse(projectionViewModelMatrix); - + float4 pixel = mInv * float4m(cursor.x, cursor.y, 1.0f, 1.0f); pixel.xyz /= pixel.w; // in case perspective used float ar = _showSettings->imageAspectRatio(); - + // that's in model space (+/0.5f * ar, +/0.5f), so convert to texture space pixel.x = (pixel.x / ar + 0.5f); pixel.y = (-pixel.y + 0.5f); //pixel.x *= 0.999f; //pixel.y *= 0.999f; - + float2 uv = pixel.xy; // pixels are 0 based @@ -2328,33 +2286,33 @@ void Data::updateEyedropper() bool outsideImageBounds = pixel.x < 0.0f || pixel.x >= (float)_showSettings->imageBoundsX || pixel.y < 0.0f || pixel.y >= (float)_showSettings->imageBoundsY; - + // only display pixel if over image if (outsideImageBounds) { sprintf(text, "canvas: %d %d\n", (int32_t)pixel.x, (int32_t)pixel.y); - setEyedropperText(text.c_str()); // ick + setEyedropperText(text.c_str()); // ick 
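The eyedropper mapping above runs window pixel -> clip space -> (inverse projectionViewModel) -> image plane -> texture uv. A condensed sketch of the two ends of that chain, omitting the inverse-matrix step in the middle; vec2 here is a local stand-in, not kram's float2:

#include <cstdio>

struct vec2 { float x, y; };

// window pixel to clip space, matching the inverted pixelToClipTfm above
vec2 windowToClip(vec2 cursor, float viewW, float viewH)
{
    float halfX = viewW * 0.5f;
    float halfY = viewH * 0.5f;
    return { (cursor.x - halfX) / halfX,    // -1..1, left to right
             -(cursor.y - halfY) / halfY }; // -1..1, flipped so +y is up
}

// image-plane model space (+/-0.5*aspect, +/-0.5) to texture uv (v grows down)
vec2 modelToUV(vec2 p, float imageAspectRatio)
{
    return { p.x / imageAspectRatio + 0.5f,
             -p.y + 0.5f };
}

int main()
{
    vec2 clip = windowToClip({100.0f, 50.0f}, 800.0f, 600.0f);
    vec2 uv = modelToUV({0.25f, -0.25f}, 2.0f);
    printf("clip %g %g, uv %g %g\n", clip.x, clip.y, uv.x, uv.y);
    return 0;
}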
_showSettings->outsideImageBounds = true; } else { // Note: fromView: nil returns isFlipped coordinate, fromView:self flips it // back. - + int32_t newX = (int32_t)pixel.x; int32_t newY = (int32_t)pixel.y; - + if (_showSettings->outsideImageBounds || (_showSettings->textureLookupX != newX || _showSettings->textureLookupY != newY)) { // Note: this only samples from the original texture via compute shaders // so preview mode pixel colors are not conveyed. But can see underlying // data driving preview. - + _showSettings->outsideImageBounds = false; - + // %.0f rounds the value, but want truncation _showSettings->textureLookupX = newX; _showSettings->textureLookupY = newY; - + // show block num int mipLOD = _showSettings->mipNumber; @@ -2373,27 +2331,26 @@ void Data::updateEyedropper() // Has to be set in other call, not here _showSettings->textureLookupMipX = mipX; _showSettings->textureLookupMipY = mipY; - + // showEyedropperData(uv); } } } - bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionState& actionState) { // Some data depends on the texture data (isSigned, isNormal, ..) bool isChanged = false; bool isStateChanged = false; - + // TODO: fix isChanged to only be set when value changes // f.e. clamped values don't need to re-render string text; - + if (action == _actionVertical) { _showSettings->isVerticalUI = !_showSettings->isVerticalUI; text = _showSettings->isVerticalUI ? "Vert UI" : "Horiz UI"; - + // just to update toggle state to Off isStateChanged = true; } @@ -2402,18 +2359,18 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt if (_noImageLoaded) { return true; } - + _showSettings->isHideUI = !_showSettings->isHideUI; text = _showSettings->isHideUI ? "Hide UI" : "Show UI"; - + // just to update toggle state to Off isStateChanged = true; } - + else if (action == _actionR) { if (!action->isHidden) { TextureChannels& channels = _showSettings->channels; - + if (channels == TextureChannels::ModeR001) { channels = TextureChannels::ModeRGBA; text = "Mask RGBA"; @@ -2424,12 +2381,11 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt } isChanged = true; } - } else if (action == _actionG) { if (!action->isHidden) { TextureChannels& channels = _showSettings->channels; - + if (channels == TextureChannels::Mode0G01) { channels = TextureChannels::ModeRGBA; text = "Mask RGBA"; @@ -2444,7 +2400,7 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt else if (action == _actionB) { if (!action->isHidden) { TextureChannels& channels = _showSettings->channels; - + if (channels == TextureChannels::Mode00B1) { channels = TextureChannels::ModeRGBA; text = "Mask RGBA"; @@ -2453,14 +2409,14 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt channels = TextureChannels::Mode00B1; text = "Mask 00B1"; } - + isChanged = true; } } else if (action == _actionA) { if (!action->isHidden) { TextureChannels& channels = _showSettings->channels; - + if (channels == TextureChannels::ModeAAA1) { channels = TextureChannels::ModeRGBA; text = "Mask RGBA"; @@ -2469,36 +2425,57 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt channels = TextureChannels::ModeAAA1; text = "Mask AAA1"; } - + isChanged = true; } - + } + else if (action == _actionPerf) { + Perf* perf = Perf::instance(); + + bool isCompressed = true; + if ((!_showSettings->isPerf) && perf->start("kramv", isCompressed)) { + _showSettings->isPerf = true; + } + else { + 
_showSettings->isPerf = false; + + if (perf->isRunning()) { + perf->stop(); + + // TODO: Only open in non-sandboxed builds, it calls system("open file") + // and this will have quarantine flag set if app not in app store + // or notarized, signed, sandboxed for distribution outside of app store + perf->openPerftrace(); + } + } + + text = "Perf "; + text += _showSettings->isPerf ? "On" : "Off"; + isChanged = true; } else if (action == _actionPlay) { if (!action->isHidden) { - - _showSettings->isPlayAnimations = ! _showSettings->isPlayAnimations; - + _showSettings->isPlayAnimations = !_showSettings->isPlayAnimations; + //Renderer* renderer = (Renderer*)self.delegate; //renderer.playAnimations = !renderer.playAnimations; - + text = _showSettings->isPlayAnimations ? "Play" : "Pause"; isChanged = true; } } else if (action == _actionShapeUVPreview) { - // toggle state _showSettings->isUVPreview = !_showSettings->isUVPreview; text = _showSettings->isUVPreview ? "Show UVPreview" : "Hide UvPreview"; isChanged = true; - + _showSettings->uvPreviewFrames = 10; } - + else if (action == _actionShapeChannel) { _showSettings->advanceShapeChannel(isShiftKeyDown); - + text = _showSettings->shapeChannelText(); isChanged = true; } @@ -2523,18 +2500,18 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt else if (action == _actionHelp) { // display the chars for now text = - "1234-rgba, Preview, Debug, A-show all\n" - "Info, Hud, Reload, 0-fit\n" - "Checker, Grid\n" - "Wrap, 8-signed, 9-premul\n" - "Mip, Face, Y-array\n" - "↓-next item, →-next counterpart\n" - "Lighting, S-shape, C-shape channel\n"; - + "1234-rgba, Preview, Debug, A-show all\n" + "Info, Hud, Reload, 0-fit\n" + "Checker, Grid\n" + "Wrap, 8-signed, 9-premul\n" + "Mip, Face, Y-array\n" + "↓-next item, →-next counterpart\n" + "Lighting, S-shape, C-shape channel\n"; + // just to update toggle state to Off isStateChanged = true; } - + else if (action == _actionFit) { float zoom; // fit image or mip @@ -2545,54 +2522,54 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt // fit to topmost image zoom = _showSettings->zoomFit; } - + // This zoom needs to be checked against zoom limits // there's a cap on the zoom multiplier. // This is reducing zoom which expands the image. 
zoom *= 1.0f / (1 << _showSettings->mipNumber); - + // even if zoom same, still do this since it resets the pan _showSettings->zoom = zoom; - + _showSettings->panX = 0.0f; _showSettings->panY = 0.0f; - + text = "Scale Image\n"; -// if (doPrintPanZoom) { -// string tmp; -// sprintf(tmp, -// "Pan %.3f,%.3f\n" -// "Zoom %.2fx\n", -// _showSettings->panX, _showSettings->panY, _showSettings->zoom); -// text += tmp; -// } - + // if (doPrintPanZoom) { + // string tmp; + // sprintf(tmp, + // "Pan %.3f,%.3f\n" + // "Zoom %.2fx\n", + // _showSettings->panX, _showSettings->panY, _showSettings->zoom); + // text += tmp; + // } + isChanged = true; } // reload key (also a quick way to reset the settings) else if (action == _actionReload) { //bool success = _delegate.loadFile(); - + // reload at actual size if (isShiftKeyDown) { _showSettings->zoom = 1.0f; } - + // Name change if image if (_showSettings->isModel) text = "Reload Model\n"; else text = "Reload Image\n"; -// if (doPrintPanZoom) { -// string tmp; -// sprintf(tmp, -// "Pan %.3f,%.3f\n" -// "Zoom %.2fx\n", -// _showSettings->panX, _showSettings->panY, _showSettings->zoom); -// text += tmp; -// } - + // if (doPrintPanZoom) { + // string tmp; + // sprintf(tmp, + // "Pan %.3f,%.3f\n" + // "Zoom %.2fx\n", + // _showSettings->panX, _showSettings->panY, _showSettings->zoom); + // text += tmp; + // } + isChanged = true; } else if (action == _actionPreview) { @@ -2609,7 +2586,7 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt } // TODO: might switch c to channel cycle, so could just hit that // and depending on the content, it cycles through reasonable channel masks - + // toggle checkerboard for transparency else if (action == _actionChecker) { if (!action->isHidden) { @@ -2619,75 +2596,75 @@ bool Data::handleEventAction(const Action* action, bool isShiftKeyDown, ActionSt text += _showSettings->isCheckerboardShown ? "On" : "Off"; } } - + else if (action == _actionSrgb) { if (!action->isHidden) { _showSettings->isSRGBShown = !_showSettings->isSRGBShown; - + sprintf(text, "Format srgb %s", _showSettings->isSRGBShown ? "On" : "Off"); - + isChanged = true; } } - + // toggle pixel grid when magnified above 1 pixel, can happen from mipmap // changes too else if (action == _actionGrid) { static int grid = 0; static const int kNumGrids = 7; - + #define advanceGrid(g, dec) \ -grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids - + grid = (grid + kNumGrids + (dec ? 
-1 : 1)) % kNumGrids + // if block size is 1, then this shouldn't toggle _showSettings->isBlockGridShown = false; _showSettings->isAtlasGridShown = false; _showSettings->isPixelGridShown = false; - + advanceGrid(grid, isShiftKeyDown); - + static const uint32_t gridSizes[kNumGrids] = { - 0, 1, 4, 32, 64, 128, 256 // grid sizes + 0, 1, 4, 32, 64, 128, 256 // grid sizes }; - + if (grid == 0) { sprintf(text, "Grid Off"); } else if (grid == 1) { _showSettings->isPixelGridShown = true; - + sprintf(text, "Pixel Grid 1x1"); } else if (grid == 2 && _showSettings->blockX > 1) { _showSettings->isBlockGridShown = true; - + sprintf(text, "Block Grid %dx%d", _showSettings->blockX, _showSettings->blockY); } else { _showSettings->isAtlasGridShown = true; - + // want to be able to show altases tht have long entries derived from // props but right now just a square grid atlas _showSettings->gridSizeX = _showSettings->gridSizeY = gridSizes[grid]; - + sprintf(text, "Atlas Grid %dx%d", _showSettings->gridSizeX, _showSettings->gridSizeY); } - + isChanged = true; } else if (action == _actionShowAll) { if (!action->isHidden) { // TODO: have drawAllMips, drawAllLevels, drawAllLevelsAndMips _showSettings->isShowingAllLevelsAndMips = - !_showSettings->isShowingAllLevelsAndMips; + !_showSettings->isShowingAllLevelsAndMips; isChanged = true; text = "Show All "; text += _showSettings->isShowingAllLevelsAndMips ? "On" : "Off"; } } - + // toggle hud that shows name and pixel value under the cursor // this may require calling setNeedsDisplay on the UILabel as cursor moves else if (action == _actionHud) { @@ -2698,23 +2675,22 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids text += _showSettings->isHudShown ? "On" : "Off"; isStateChanged = true; } - + // info on the texture, could request info from lib, but would want to cache // that info else if (action == _actionInfo) { if (_showSettings->isHudShown) { - // also hide the file table, since this can be long //[self hideFileTable]; - + sprintf(text, "%s", isShiftKeyDown ? _showSettings->imageInfoVerbose.c_str() - : _showSettings->imageInfo.c_str()); + : _showSettings->imageInfo.c_str()); } // just to update toggle state to Off isStateChanged = true; } - + // toggle wrap/clamp else if (action == _actionWrap) { // TODO: cycle through all possible modes (clamp, repeat, mirror-once, @@ -2724,7 +2700,7 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids text = "Wrap "; text += _showSettings->isWrap ? "On" : "Off"; } - + // toggle signed vs. unsigned else if (action == _actionSigned) { if (!action->isHidden) { @@ -2734,7 +2710,7 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids text += _showSettings->isSigned ? "On" : "Off"; } } - + // toggle premul alpha vs. unmul else if (action == _actionPremul) { if (!action->isHidden) { @@ -2744,26 +2720,26 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids text += _showSettings->doShaderPremul ? "On" : "Off"; } } - + else if (action == _actionItem || action == _actionPrevItem) { if (!action->isHidden) { // invert shift key for prev, since it's reverse if (action == _actionPrevItem) { isShiftKeyDown = !isShiftKeyDown; } - + if (advanceFile(!isShiftKeyDown)) { //_hudHidden = true; //[self updateHudVisibility]; //[self setEyedropperText:""]; - + isChanged = true; - + setLoadedText(text); } } } - + else if (action == _actionCounterpart || action == _actionPrevCounterpart) { if (!action->isHidden) { // invert shift key for prev, since it's reverse @@ -2774,14 +2750,14 @@ grid = (grid + kNumGrids + (dec ? 
-1 : 1)) % kNumGrids //_hudHidden = true; //[self updateHudVisibility]; //[self setEyedropperText:""]; - + isChanged = true; - + setLoadedText(text); } } } - + // test out different shapes else if (action == _actionShapeMesh) { if (_showSettings->meshCount > 1) { @@ -2790,9 +2766,9 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids isChanged = true; } } - + // TODO: should probably have these wrap and not clamp to count limits - + // mip up/down else if (action == _actionMip) { if (_showSettings->mipCount > 1) { @@ -2801,14 +2777,14 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids } else { _showSettings->mipNumber = - std::min(_showSettings->mipNumber + 1, _showSettings->mipCount - 1); + std::min(_showSettings->mipNumber + 1, _showSettings->mipCount - 1); } sprintf(text, "Mip %d/%d", _showSettings->mipNumber, _showSettings->mipCount); isChanged = true; } } - + else if (action == _actionFace) { // cube or cube array, but hit s to pick cubearray if (_showSettings->faceCount > 1) { @@ -2817,14 +2793,14 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids } else { _showSettings->faceNumber = - std::min(_showSettings->faceNumber + 1, _showSettings->faceCount - 1); + std::min(_showSettings->faceNumber + 1, _showSettings->faceCount - 1); } sprintf(text, "Face %d/%d", _showSettings->faceNumber, _showSettings->faceCount); isChanged = true; } } - + else if (action == _actionArray) { // slice if (_showSettings->sliceCount > 1) { @@ -2833,7 +2809,7 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids } else { _showSettings->sliceNumber = - std::min(_showSettings->sliceNumber + 1, _showSettings->sliceCount - 1); + std::min(_showSettings->sliceNumber + 1, _showSettings->sliceCount - 1); } sprintf(text, "Slice %d/%d", _showSettings->sliceNumber, _showSettings->sliceCount); @@ -2846,7 +2822,7 @@ grid = (grid + kNumGrids + (dec ? -1 : 1)) % kNumGrids } else { _showSettings->arrayNumber = - std::min(_showSettings->arrayNumber + 1, _showSettings->arrayCount - 1); + std::min(_showSettings->arrayNumber + 1, _showSettings->arrayCount - 1); } sprintf(text, "Array %d/%d", _showSettings->arrayNumber, _showSettings->arrayCount); @@ -2857,11 +2833,11 @@ grid = (grid + kNumGrids + (dec ? 
-1 : 1)) % kNumGrids // non-handled action return false; } - + actionState.hudText = text; actionState.isChanged = isChanged; actionState.isStateChanged = isStateChanged; - + return true; } @@ -2877,7 +2853,7 @@ void Data::updateImageSettings(const string& fullFilename, KTXImage& image, MyMT _showSettings->blockY = image.blockDims().y; _showSettings->isSigned = isSignedFormat(format); - + TexContentType texContentType = findContentTypeFromFilename(fullFilename.c_str()); _showSettings->texContentType = texContentType; //_showSettings->isSDF = isSDF; @@ -2890,7 +2866,7 @@ void Data::updateImageSettings(const string& fullFilename, KTXImage& image, MyMT _showSettings->doShaderPremul = false; if (texContentType == TexContentTypeAlbedo && isPNG) { _showSettings->doShaderPremul = - true; // convert to premul in shader, so can see other channels + true; // convert to premul in shader, so can see other channels } int32_t numChannels = numChannelsOfFormat(originalFormat); @@ -2922,9 +2898,6 @@ void Data::updateImageSettings(const string& fullFilename, KTXImage& image, MyMT _showSettings->imageBoundsY = (int32_t)image.height; } - - - float zoom3D = 1.0f; void Data::updateProjTransform() @@ -2932,8 +2905,8 @@ void Data::updateProjTransform() // Want to move to always using perspective even for 2d images, but still more math // to work out to keep zoom to cursor working. #if USE_PERSPECTIVE - float aspect = _showSettings->viewSizeX / (float)_showSettings->viewSizeY; - _projectionMatrix = perspective_rhs(90.0f * (M_PI / 180.0f), aspect, 0.1f, 100000.0f, _showSettings->isReverseZ); + float aspect = _showSettings->viewSizeX / (float)_showSettings->viewSizeY; + _projectionMatrix = perspective_rhcs(90.0f * (M_PI / 180.0f), aspect, 0.1f); // This was used to reset zoom to a baseline that had a nice zoom. But little connected to it now. // Remember with rotation, the bounds can hit the nearClip. 
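// perspective_rhcs is not shown in this diff; its three-argument signature
// (fovy, aspect, nearZ with no farZ) and the clearDepth of 0.0f set later in
// KramViewerMain.mm suggest an infinite-far, reversed-Z right-handed projection.
// A minimal sketch under that assumption, written per-component to stay
// layout-agnostic (hypothetical name, not the kramv implementation):
#include <cmath>

struct ClipPos { float x, y, z, w; };

inline ClipPos projectInfiniteReverseZ(float fovyRadians, float aspect, float nearZ,
                                       float xv, float yv, float zv) // view-space point, -Z forward
{
    float f = 1.0f / std::tan(0.5f * fovyRadians);
    ClipPos c;
    c.x = xv * f / aspect;
    c.y = yv * f;
    c.z = nearZ; // constant numerator ...
    c.w = -zv;   // ... so depth = z/w = nearZ / -zv: 1.0 at the near plane, tending to 0 at infinity
    return c;
}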
Note all shapes are 0.5 radius, @@ -2944,22 +2917,26 @@ void Data::updateProjTransform() #else if (_showSettings->isModel) { - float aspect = _showSettings->viewSizeX / (float)_showSettings->viewSizeY; - _projectionMatrix = perspective_rhs(90.0f * (M_PI / 180.0f), aspect, 0.1f, 100000.0f, _showSettings->isReverseZ); + float aspect = _showSettings->viewSizeX / (float)_showSettings->viewSizeY; + _projectionMatrix = perspective_rhcs(90.0f * (M_PI / 180.0f), aspect, 0.1f); _showSettings->zoomFit = 1; } else { + // ltrb + float2 rectDims = 0.5f * float2m(_showSettings->viewSizeX, _showSettings->viewSizeY); + float4 rect = float4m(-rectDims.x, rectDims.y, + rectDims.x, -rectDims.y); + _projectionMatrix = - orthographic_rhs(_showSettings->viewSizeX, _showSettings->viewSizeY, 0.1f, - 100000.0f, _showSettings->isReverseZ); + orthographic_rhcs(rect, 0.1f, 1e6f); // DONE: adjust zoom to fit the entire image to the window _showSettings->zoomFit = std::min((float)_showSettings->viewSizeX, (float)_showSettings->viewSizeY) / std::max(1.0f, std::max((float)_showSettings->imageBoundsX, (float)_showSettings->imageBoundsY)); - + static bool useImageAndViewBounds = true; if (useImageAndViewBounds) { float invWidth = 1.0f / std::max(1.0f, (float)_showSettings->imageBoundsX); @@ -2968,8 +2945,8 @@ void Data::updateProjTransform() // DONE: adjust zoom to fit the entire image to the window // the best fit depends on dimension of image and window _showSettings->zoomFit = - std::min( (float)_showSettings->viewSizeX * invWidth, - (float)_showSettings->viewSizeY * invHeight); + std::min((float)_showSettings->viewSizeX * invWidth, + (float)_showSettings->viewSizeY * invHeight); } } #endif @@ -2984,53 +2961,52 @@ void Data::resetSomeImageSettings(bool isNewFile) _showSettings->faceNumber = 0; _showSettings->arrayNumber = 0; _showSettings->sliceNumber = 0; - + _showSettings->channels = TextureChannels::ModeRGBA; - + // wish could keep existing setting, but new texture might not // be supported debugMode for new texture _showSettings->debugMode = DebugMode::DebugModeNone; - + _showSettings->shapeChannel = ShapeChannel::ShapeChannelNone; } else { // reloaded file may have different limits _showSettings->mipNumber = - std::min(_showSettings->mipNumber, _showSettings->mipCount); + std::min(_showSettings->mipNumber, _showSettings->mipCount); _showSettings->faceNumber = - std::min(_showSettings->faceNumber, _showSettings->faceCount); + std::min(_showSettings->faceNumber, _showSettings->faceCount); _showSettings->arrayNumber = - std::min(_showSettings->arrayNumber, _showSettings->arrayCount); + std::min(_showSettings->arrayNumber, _showSettings->arrayCount); _showSettings->sliceNumber = - std::min(_showSettings->sliceNumber, _showSettings->sliceCount); + std::min(_showSettings->sliceNumber, _showSettings->sliceCount); } - + updateProjTransform(); - - + // this controls viewMatrix (global to all visible textures) _showSettings->panX = 0.0f; _showSettings->panY = 0.0f; - + _showSettings->zoom = _showSettings->zoomFit; - + // Y is always 1.0 on the plane, so scale to imageBoundsY // plane is already a non-uniform size, so can keep uniform scale - + // have one of these for each texture added to the viewer //float scaleX = MAX(1, _showSettings->imageBoundsX); float scaleY = std::max(1, _showSettings->imageBoundsY); float scaleX = scaleY; float scaleZ = scaleY; - + _modelMatrix2D = - float4x4(float4m(scaleX, scaleY, scaleZ, 1.0f)); // uniform scale + float4x4(float4m(scaleX, scaleY, scaleZ, 1.0f)); // uniform scale _modelMatrix2D = 
_modelMatrix2D * - matrix4x4_translation(0.0f, 0.0f, -1.0); // set z=-1 unit back - + translation(float3m(0.0f, 0.0f, -1.0)); // set z=-1 unit back + // uniform scaled 3d primitive float scale = scaleY; // MAX(scaleX, scaleY); - + // store the zoom into thew view matrix // fragment tangents seem to break down at high model scale due to precision // differences between worldPos and uv @@ -3039,59 +3015,59 @@ void Data::resetSomeImageSettings(bool isNewFile) // zoom3D = scale; // * _showSettings->viewSizeX / 2.0f; // scale = 1.0; // } - - _modelMatrix3D = float4x4(float4m(scale, scale, scale, 1.0f)); // uniform scale + + _modelMatrix3D = float4x4(float4m(scale, scale, scale, 1.0f)); // uniform scale _modelMatrix3D = - _modelMatrix3D * - matrix4x4_translation(0.0f, 0.0f, -1.0f); // set z=-1 unit back + _modelMatrix3D * + translation(float3m(0.0f, 0.0f, -1.0f)); // set z=-1 unit back } void Data::updateTransforms() { // scale float zoom = _showSettings->zoom; - + // translate float4x4 panTransform = - matrix4x4_translation(-_showSettings->panX, _showSettings->panY, 0.0); + translation(float3m(-_showSettings->panX, _showSettings->panY, 0.0)); if (_showSettings->is3DView) { - _viewMatrix3D = float4x4(float4m(zoom, zoom, 1.0f, 1.0f)); // non-uniform + _viewMatrix3D = float4x4(float4m(zoom, zoom, 1.0f, 1.0f)); // non-uniform _viewMatrix3D = panTransform * _viewMatrix3D; - + _viewMatrix = _viewMatrix3D; - + // obj specific _modelMatrix = _modelMatrix3D; } else { _viewMatrix2D = float4x4(float4m(zoom, zoom, 1.0f, 1.0f)); _viewMatrix2D = panTransform * _viewMatrix2D; - + _viewMatrix = _viewMatrix2D; - + // obj specific _modelMatrix = _modelMatrix2D; } - + // viewMatrix should typically be the inverse //_viewMatrix = simd_inverse(_viewMatrix); - + _projectionViewMatrix = _projectionMatrix * _viewMatrix; - + // cache the camera position _cameraPosition = - inverse(_viewMatrix).columns[3].xyz; // this is all ortho - + inverse(_viewMatrix).columns[3].xyz; // this is all ortho + // obj specific - _modelMatrixInvScale2 = inverseScaleSquared(_modelMatrix); - _showSettings->isInverted = _modelMatrixInvScale2.w < 0.0f; + float4 modelMatrixInvScale2 = inverseScaleSquared(_modelMatrix); + _showSettings->isInverted = modelMatrixInvScale2.w < 0.0f; } float4x4 Data::computeImageTransform(float panX, float panY, float zoom) { // translate - float4x4 panTransform = matrix4x4_translation(-panX, panY, 0.0); + float4x4 panTransform = translation(float3m(-panX, panY, 0.0)); // non-uniform scale is okay here, only affects ortho volume // setting this to uniform zoom and object is not visible, zoom can be 20x in @@ -3112,45 +3088,44 @@ float4x4 Data::computeImageTransform(float panX, float panY, float zoom) } } - void Data::doZoomMath(float newZoom, float2& newPan) { // transform the cursor to texture coordinate, or clamped version if outside float4x4 projectionViewModelMatrix = computeImageTransform( - _showSettings->panX, - _showSettings->panY, - _showSettings->zoom); + _showSettings->panX, + _showSettings->panY, + _showSettings->zoom); // convert from pixel to clip space float halfX = _showSettings->viewSizeX * 0.5f; float halfY = _showSettings->viewSizeY * 0.5f; - + // sometimes get viewSizeX that's scaled by retina, and other times not. 
// account for contentScaleFactor (viewSizeX is 2x bigger than cursorX on // retina display) now passing down drawableSize instead of view.bounds.size halfX /= (float)_showSettings->viewContentScaleFactor; halfY /= (float)_showSettings->viewContentScaleFactor; - + float4x4 viewportMatrix = - { - (float4){ halfX, 0, 0, 0 }, - (float4){ 0, -halfY, 0, 0 }, - (float4){ 0, 0, 1, 0 }, - (float4){ halfX, halfY, 0, 1 }, - }; + { + (float4){halfX, 0, 0, 0}, + (float4){0, -halfY, 0, 0}, + (float4){0, 0, 1, 0}, + (float4){halfX, halfY, 0, 1}, + }; viewportMatrix = inverse(viewportMatrix); - + float4 cursor = float4m(_showSettings->cursorX, _showSettings->cursorY, 0.0f, 1.0f); - + cursor = viewportMatrix * cursor; - + //NSPoint clipPoint; //clipPoint.x = (point.x - halfX) / halfX; //clipPoint.y = -(point.y - halfY) / halfY; // convert point in window to point in model space float4x4 mInv = inverse(projectionViewModelMatrix); - + float4 pixel = mInv * float4m(cursor.x, cursor.y, 1.0f, 1.0f); pixel.xyz /= pixel.w; // in case perspective used @@ -3175,10 +3150,10 @@ void Data::doZoomMath(float newZoom, float2& newPan) // normalized coords to pixel coords pixel.x *= _showSettings->imageBoundsX; pixel.y *= _showSettings->imageBoundsY; - + // this fixes pinch-zoom on cube which are 6:1 pixel.x /= ar; - + #if USE_PERSPECTIVE // TODO: this doesn't work for perspective newPan.x = _showSettings->panX - (_showSettings->zoom - newZoom) * pixel.x; @@ -3189,6 +3164,4 @@ void Data::doZoomMath(float newZoom, float2& newPan) #endif } - - -} // namespace kram +} // namespace kram diff --git a/kramv/KramViewerBase.h b/kramv/KramViewerBase.h index a9ba2737..ddd9ef34 100644 --- a/kramv/KramViewerBase.h +++ b/kramv/KramViewerBase.h @@ -1,10 +1,10 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #include -#include "KramLib.h" // for MyMTLPixelFormat +#include "KramLib.h" // for MyMTLPixelFormat //#include //#include @@ -12,8 +12,8 @@ namespace kram { -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; enum TextureChannels { ModeRGBA = 0, @@ -53,13 +53,13 @@ enum ShapeChannel { ShapeChannelUV0, - ShapeChannelFaceNormal, // gen from dfdx and dfdy + ShapeChannelFaceNormal, // gen from dfdx and dfdy - ShapeChannelNormal, // vertex normal + ShapeChannelNormal, // vertex normal ShapeChannelTangent, ShapeChannelBitangent, - ShapeChannelMipLevel, // can estimate mip chose off dfdx/dfdy, and pseudocolor + ShapeChannelMipLevel, // can estimate mip chose off dfdx/dfdy, and pseudocolor // don't need bump, since can already see it, but what if combined diffuse + // normal ShapeChannelBumpNormal, @@ -68,22 +68,21 @@ enum ShapeChannel { }; enum LightingMode { - LightingModeDiffuse = 0, // amb + diffuse - LightingModeSpecular = 1, // amb + diffuse + specular - LightingModeNone = 2, // no lighting, just mips - + LightingModeDiffuse = 0, // amb + diffuse + LightingModeSpecular = 1, // amb + diffuse + specular + LightingModeNone = 2, // no lighting, just mips + LightingModeCount, }; -struct Atlas -{ +struct Atlas { string name; - float x,y,w,h; - float u,v; // padding - to both or just left or right? + float x, y, w, h; + float u, v; // padding - to both or just left or right? 
bool isVertical; uint32_t level; - - float4 rect() const { return float4m(x,y,w,h); } + + float4 rect() const { return float4m(x, y, w, h); } }; class ShowSettings { @@ -112,12 +111,15 @@ class ShowSettings { // DONE: hook all these up to shader and view bool isHudShown = true; - + bool isHideUI = false; bool isVerticalUI = true; - + bool isPlayAnimations = false; - + + // Can get a dump of perf (mostly loading a decode/transcode perf) + bool isPerf = false; + // transparency checkboard under the image bool isCheckerboardShown = false; @@ -144,17 +146,17 @@ class ShowSettings { bool isSwizzleAGToRG = false; //bool isSDF = false; TexContentType texContentType = TexContentTypeUnknown; - + // this mode shows the content with lighting or with bilinear/mips active bool isPreview = false; // Can collapse 3d to 2d and overlay the uv bool isUVPreview = false; - + uint32_t uvPreviewFrames = 0; float uvPreviewStep = 1.0f / 10.0f; float uvPreview = 0.0f; - + // the 2d view doesn't want to inset pixels for clamp, or point sampling is // thrown off expecially on small 4x4 textures #if USE_PERSPECTIVE @@ -162,7 +164,7 @@ class ShowSettings { #else bool is3DView = false; #endif - + // TODO: Might eliminate this, since mips are either built with or without // srgb and disabling with a MTLView caused many flags to have to be set on // MTLTexture @@ -171,18 +173,15 @@ class ShowSettings { // whether to use normal to tangent (false), or vertex tangents (true) bool useTangent = true; - // draw with reverseZ to better match perspective - bool isReverseZ = true; - // image vs. gltf model bool isModel = false; - + // if diff texture available, can show diff against source bool isDiff = false; - + // currently loading the diff texture if found, this slows loads bool hasDiffTexture = false; - + // can sample from drawable or from single source texture bool isEyedropperFromDrawable(); @@ -191,11 +190,11 @@ class ShowSettings { // this could be boundary of all visible images, so that pan doesn't go flying // off to nowhere - int32_t imageBoundsX = 0; // px - int32_t imageBoundsY = 0; // px + int32_t imageBoundsX = 0; // px + int32_t imageBoundsY = 0; // px bool outsideImageBounds = false; - + // size of the block, used in block grid drawing int32_t blockX = 1; int32_t blockY = 1; @@ -220,8 +219,8 @@ class ShowSettings { float4 textureResult; // size of the view and its contentScaleFactor - int32_t viewSizeX = 1; // px - int32_t viewSizeY = 1; // px + int32_t viewSizeX = 1; // px + int32_t viewSizeY = 1; // px float viewContentScaleFactor = 1.0f; // cursor is in view coordinates, but doesn't include contentScaleFactor @@ -258,54 +257,46 @@ class ShowSettings { void advanceShapeChannel(bool decrement); void advanceLightingMode(bool decrement); - const char *meshNumberText() const; - const char *shapeChannelText() const; - const char *debugModeText() const; - const char *lightingModeText() const; - - const char *meshNumberName(uint32_t meshNumber) const; - + const char* meshNumberText() const; + const char* shapeChannelText() const; + const char* debugModeText() const; + const char* lightingModeText() const; + + const char* meshNumberName(uint32_t meshNumber) const; + void updateUVPreviewState(); - - float imageAspectRatio() const { + + float imageAspectRatio() const + { float ar = 1.0f; if (meshNumber == 0 && !isModel && imageBoundsY > 0) ar = imageBoundsX / (float)imageBoundsY; return ar; } - - bool isFileNew(const char* fullFilename) const { + + bool isFileNew(const char* fullFilename) const + { return lastFilename 
!= fullFilename; } - bool isFileChanged(const char* fullFilename, double timestamp) const { + bool isFileChanged(const char* fullFilename, double timestamp) const + { // Note that modstamp can change, but content data hash may be the same return isFileNew(fullFilename) || (timestamp != lastTimestamp); } - + string lastFilename; double lastTimestamp = 0.0; int32_t meshNumber = 0; int32_t meshCount = 5; - + const Atlas* lastAtlas = nullptr; // Might move to index vector atlas; }; -float4x4 matrix4x4_translation(float tx, float ty, float tz); - -float4x4 perspective_rhs(float fovyRadians, float aspect, float nearZ, float - farZ, bool isReverseZ); - -float4x4 orthographic_rhs(float width, float height, float nearZ, float farZ, - bool isReverseZ); - -float4x4 matrix4x4_rotation(float radians, vector_float3 axis); - -void printChannels(string &tmp, const string &label, float4 c, +void printChannels(string& tmp, const string& label, float4 c, int32_t numChannels, bool isFloat, bool isSigned); - enum Key { A = 0x00, S = 0x01, @@ -362,7 +353,7 @@ enum Key { RightArrow = 0x7C, DownArrow = 0x7D, UpArrow = 0x7E, - + Space = 0x31, Escape = 0x35, }; @@ -374,7 +365,7 @@ class Action { public: Action(const char* icon_, const char* tip_, Key keyCode_) : icon(icon_), tip(tip_), keyCode(keyCode_) {} - + const char* icon; const char* tip; @@ -382,11 +373,11 @@ class Action { kram_id button; // NSButton* kram_id menuItem; // NSMenuItem* Key keyCode; - + bool isHighlighted = false; bool isHidden = false; bool isButtonDisabled = false; - + // This have platform impl void setHighlight(bool enable); void setHidden(bool enable); @@ -397,46 +388,44 @@ class Action { struct FileContainer { // allow zip files to be dropped and opened, and can advance through bundle // content. - + // TODO: Add FileHelper if acrhive file is networked, but would require // full load to memory. 
- + ZipHelper zip; MmapHelper zipMmap; }; -struct ActionState -{ +struct ActionState { string hudText; bool isChanged; bool isStateChanged; }; -enum TextSlot -{ +enum TextSlot { kTextSlotHud, kTextSlotEyedropper, kTextSlotAtlas, - + kTextSlotCount // not a slot }; struct File { public: File(const char* name_, int32_t urlIndex_); - + // Note: not sorting by urlIndex currently - bool operator <(const File& rhs) const + bool operator<(const File& rhs) const { // sort by shortname int compare = strcasecmp(nameShort.c_str(), rhs.nameShort.c_str()); - if ( compare != 0 ) + if (compare != 0) return compare < 0; - + // if equal, then sort by longname return strcasecmp(name.c_str(), rhs.name.c_str()) < 0; } - + public: string name; int32_t urlIndex; @@ -444,14 +433,13 @@ struct File { }; // This allows wrapping all the ObjC stuff -struct DataDelegate -{ +struct DataDelegate { bool loadFile(bool clear = false); - + bool loadModelFile(const char* filename); - + bool loadTextureFromImage(const char* fullFilename, double timestamp, KTXImage& image, KTXImage* imageNormal, KTXImage* imageDiff, bool isArchive); - + public: kram_id view; // MyMTKView* }; @@ -459,11 +447,11 @@ struct DataDelegate struct Data { Data(); ~Data(); - + void clearAtlas(); bool loadAtlasFile(const char* filename); bool listFilesInArchive(int32_t urlIndex); - bool openArchive(const char * zipFilename, int32_t urlIndex); + bool openArchive(const char* zipFilename, int32_t urlIndex); bool hasCounterpart(bool increment); bool advanceCounterpart(bool increment); @@ -475,7 +463,7 @@ struct Data { const Atlas* findAtlasAtUV(float2 uv); bool isArchive() const; bool loadFile(); - + bool handleEventAction(const Action* action, bool isShiftKeyDown, ActionState& actionState); void updateUIAfterLoad(); void updateUIControlState(); @@ -485,6 +473,7 @@ struct Data { const Action* actionFromKey(uint32_t keyCodes) const; void setLoadedText(string& text); + void setFailedText(const string& filename, string& text); void initActions(); vector& actions() { return _actions; } @@ -499,9 +488,9 @@ struct Data { // See these to split off ObjC code DataDelegate _delegate; - + void updateEyedropper(); - + float4x4 computeImageTransform(float panX, float panY, float zoom); void updateProjTransform(); void resetSomeImageSettings(bool isNewFile); @@ -509,67 +498,70 @@ struct Data { void doZoomMath(float newZoom, float2& newPan); + void setPerfDirectory(const char* directory); + private: bool loadFileFromArchive(); public: void showEyedropperData(const float2& uv); - void setEyedropperText(const char * text); - void setAtlasText(const char * text); + void setEyedropperText(const char* text); + void setAtlasText(const char* text); void updateTransforms(); - + //---------------- float4x4 _projectionMatrix; - + float4x4 _projectionViewMatrix; float3 _cameraPosition; - + float4x4 _viewMatrix; float4x4 _viewMatrix2D; float4x4 _viewMatrix3D; // object specific float4x4 _modelMatrix; - float4 _modelMatrixInvScale2; + //float4 _modelMatrixInvScale2; float4x4 _modelMatrix2D; float4x4 _modelMatrix3D; //---------------- - + vector _textSlots; ShowSettings* _showSettings = nullptr; bool _noImageLoaded = true; string _archiveName; // archive or blank - + // folders and archives and multi-drop files are filled into this vector _files; int32_t _fileIndex = 0; - + // One of these per url in _urlss vector _containers; vector _urls; - + Action* _actionPlay; Action* _actionShapeUVPreview; Action* _actionHelp; Action* _actionInfo; Action* _actionHud; Action* _actionShowAll; - + 
Action* _actionPreview; Action* _actionWrap; Action* _actionPremul; Action* _actionSigned; Action* _actionSrgb; - + Action* _actionPerf; + Action* _actionDiff; Action* _actionDebug; Action* _actionGrid; Action* _actionChecker; Action* _actionHideUI; Action* _actionVertical; - + Action* _actionMip; Action* _actionFace; Action* _actionArray; @@ -579,17 +571,17 @@ struct Data { Action* _actionPrevCounterpart; Action* _actionReload; Action* _actionFit; - + Action* _actionShapeMesh; Action* _actionShapeChannel; Action* _actionLighting; Action* _actionTangent; - + Action* _actionR; Action* _actionG; Action* _actionB; Action* _actionA; - + vector _actions; }; @@ -599,4 +591,4 @@ bool isSupportedJsonFilename(const char* filename); //extern bool doPrintPanZoom; -} // namespace kram +} // namespace kram diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 9e1467f7..3ee1fd22 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -18,7 +18,7 @@ // C++ #include "KramLib.h" -#include "KramVersion.h" // keep kramv version in sync with libkram +#include "KramVersion.h" // keep kramv version in sync with libkram #include "TaskSystem.h" //#include "KramMipper.h" @@ -29,129 +29,125 @@ //#include "KramZipHelper.h" //#include "KramImage.h" -#include "KramViewerBase.h" - - #include // for recursive_mutex +#include "KramViewerBase.h" + using mymutex = std::recursive_mutex; using mylock = std::unique_lock; #include -using namespace simd; +using namespace SIMD_NAMESPACE; using namespace kram; -using namespace NAMESPACE_STL; - +using namespace STL_NAMESPACE; // ktx, ktx2, png, and dds for images // zip, metallib // gltf, glb files for models NSArray* utis = @[ - @"public.directory", - - [UTType typeWithFilenameExtension: @"png"].identifier, - [UTType typeWithFilenameExtension: @"ktx"].identifier, - [UTType typeWithFilenameExtension: @"ktx2"].identifier, - [UTType typeWithFilenameExtension: @"dds"].identifier, - - [UTType typeWithFilenameExtension: @"zip"].identifier, - [UTType typeWithFilenameExtension: @"metallib"].identifier, - + @"public.directory", + + [UTType typeWithFilenameExtension:@"png"].identifier, + [UTType typeWithFilenameExtension:@"ktx"].identifier, + [UTType typeWithFilenameExtension:@"ktx2"].identifier, + [UTType typeWithFilenameExtension:@"dds"].identifier, + + [UTType typeWithFilenameExtension:@"zip"].identifier, + [UTType typeWithFilenameExtension:@"metallib"].identifier, + #if USE_GLTF - [UTType typeWithFilenameExtension: @"gltf"].identifier, - [UTType typeWithFilenameExtension: @"glb"].identifier, - //@"model/gltf+json", - //@"model/gltf+binary" + [UTType typeWithFilenameExtension:@"gltf"].identifier, + [UTType typeWithFilenameExtension:@"glb"].identifier, +//@"model/gltf+json", +//@"model/gltf+binary" #endif #if USE_USD - [UTType typeWithFilenameExtension: @"usd"].identifier, - [UTType typeWithFilenameExtension: @"usd"].identifier, - [UTType typeWithFilenameExtension: @"usda"].identifier, + [UTType typeWithFilenameExtension:@"usd"].identifier, + [UTType typeWithFilenameExtension:@"usd"].identifier, + [UTType typeWithFilenameExtension:@"usda"].identifier, #endif - - // read -atlas.json files - [UTType typeWithFilenameExtension: @"json"].identifier + + // read -atlas.json files + [UTType 
typeWithFilenameExtension:@"json"].identifier ]; NSDictionary* pasteboardOptions = @{ // This means only these uti can be droped. - NSPasteboardURLReadingContentsConformToTypesKey: utis - + NSPasteboardURLReadingContentsConformToTypesKey : utis + // Don't use this it prevents folder urls //, NSPasteboardURLReadingFileURLsOnlyKey: @YES }; - -struct MouseData -{ +struct MouseData { NSPoint originPoint; NSPoint oldPoint; NSPoint newPoint; - + NSPoint pan; }; //------------- - -void Action::setHighlight(bool enable) { +void Action::setHighlight(bool enable) +{ isHighlighted = enable; - + auto On = 1; // NSControlStateValueOn; auto Off = 0; // NSControlStateValueOff; - + if (!isButtonDisabled) { ((__bridge NSButton*)button).state = enable ? On : Off; } ((__bridge NSMenuItem*)menuItem).state = enable ? On : Off; } -void Action::setHidden(bool enable) { +void Action::setHidden(bool enable) +{ isHidden = enable; - + if (!isButtonDisabled) { ((__bridge NSButton*)button).hidden = enable; } ((__bridge NSMenuItem*)menuItem).hidden = enable; } -void Action::disableButton() { +void Action::disableButton() +{ ((__bridge NSButton*)button).hidden = true; isButtonDisabled = true; } - // These are using NSFileManager to list files, so must be ObjC void Data::listArchivesInFolder(const string& folderFilename, vector& archiveFiles, bool skipSubdirs) { NSURL* url = [NSURL fileURLWithPath:[NSString stringWithUTF8String:folderFilename.c_str()]]; - + NSDirectoryEnumerationOptions options = NSDirectoryEnumerationSkipsHiddenFiles; if (skipSubdirs) options |= NSDirectoryEnumerationSkipsSubdirectoryDescendants; - + NSDirectoryEnumerator* directoryEnumerator = - [[NSFileManager defaultManager] - enumeratorAtURL:url - includingPropertiesForKeys:[NSArray array] - options:options - errorHandler: // nil - ^BOOL(NSURL *urlArg, NSError *error) { - macroUnusedVar(urlArg); - macroUnusedVar(error); - - // handle error - return false; - }]; - + [[NSFileManager defaultManager] + enumeratorAtURL:url + includingPropertiesForKeys:[NSArray array] + options:options + errorHandler: // nil + ^BOOL(NSURL* urlArg, NSError* error) { + macroUnusedVar(urlArg); + macroUnusedVar(error); + + // handle error + return false; + }]; + // only display models in folder if found, ignore the png/jpg files while (NSURL* fileOrDirectoryURL = [directoryEnumerator nextObject]) { const char* name = fileOrDirectoryURL.fileSystemRepresentation; - + bool isArchive = isSupportedArchiveFilename(name); - if (isArchive) - { - archiveFiles.emplace_back(File(name,0)); + if (isArchive) { + archiveFiles.emplace_back(File(name, 0)); } } } @@ -160,30 +156,30 @@ { // Hope this hsas same permissions NSURL* url = [NSURL fileURLWithPath:[NSString stringWithUTF8String:archiveFilename.c_str()]]; - + NSDirectoryEnumerationOptions options = NSDirectoryEnumerationSkipsHiddenFiles; if (skipSubdirs) options |= NSDirectoryEnumerationSkipsSubdirectoryDescendants; - + NSDirectoryEnumerator* directoryEnumerator = - [[NSFileManager defaultManager] - enumeratorAtURL:url - includingPropertiesForKeys:[NSArray array] - options:options - errorHandler: // nil - ^BOOL(NSURL *urlArg, NSError *error) { - macroUnusedVar(urlArg); - macroUnusedVar(error); - - // handle error - don't change to folder if devoid of valid content - return false; - }]; - + [[NSFileManager defaultManager] + enumeratorAtURL:url + includingPropertiesForKeys:[NSArray array] + options:options + errorHandler: // nil + ^BOOL(NSURL* urlArg, NSError* error) { + macroUnusedVar(urlArg); + macroUnusedVar(error); + + // handle 
error - don't change to folder if devoid of valid content + return false; + }]; + while (NSURL* fileOrDirectoryURL = [directoryEnumerator nextObject]) { const char* name = fileOrDirectoryURL.fileSystemRepresentation; - + bool isValid = isSupportedFilename(name); - + #if USE_GLTF || USE_USD // note: many gltf reference jpg which will load via GltfAsset, but // kram and kramv do not import jpg files. @@ -191,12 +187,12 @@ isValid = isSupportedModelFilename(name); } #endif - + if (!isValid) { isValid = isSupportedJsonFilename(name); } if (isValid) { - _files.emplace_back(File(name,urlIndex)); + _files.emplace_back(File(name, urlIndex)); } } } @@ -210,9 +206,7 @@ @interface MyNSTextField : NSTextField @end -@implementation MyNSTextField -{ - +@implementation MyNSTextField { } // override to allow clickthrough @@ -228,7 +222,7 @@ - (NSView*)hitTest:(NSPoint)aPoint @interface MyMTKView : MTKView @property(retain, nonatomic, readwrite, nullable) - NSMagnificationGestureRecognizer *zoomGesture; + NSMagnificationGestureRecognizer* zoomGesture; @property(nonatomic, readwrite) double lastArchiveTimestamp; @@ -240,12 +234,11 @@ @interface MyMTKView : MTKView @property(nonatomic, readwrite) float validMagnification; @property(nonatomic, readwrite) MouseData mouseData; - - (BOOL)loadTextureFromURLs:(NSArray*)url; -- (void)setHudText:(const char *)text; +- (void)setHudText:(const char*)text; -- (void)tableViewSelectionDidChange:(NSNotification *)notification; +- (void)tableViewSelectionDidChange:(NSNotification*)notification; - (void)addNotifications; @@ -259,27 +252,28 @@ - (void)fixupDocumentList; // https://medium.com/@kevingutowski/how-to-setup-a-tableview-in-2019-obj-c-c7dece203333 @interface TableViewController : NSObject -@property (nonatomic, strong) NSMutableArray* items; +@property(nonatomic, strong) NSMutableArray* items; @end @implementation TableViewController -- (instancetype)init { +- (instancetype)init +{ self = [super init]; - + _items = [[NSMutableArray alloc] init]; - + return self; } // NSTableViewDataSource -- (NSInteger)numberOfRowsInTableView:(NSTableView *)tableView +- (NSInteger)numberOfRowsInTableView:(NSTableView*)tableView { return self.items.count; } // NSTableViewDelegate --(NSView *)tableView:(NSTableView *)tableView viewForTableColumn:(NSTableColumn *)tableColumn row:(NSInteger)row +- (NSView*)tableView:(NSTableView*)tableView viewForTableColumn:(NSTableColumn*)tableColumn row:(NSInteger)row { NSString* identifier = tableColumn.identifier; NSTableCellView* cell = [tableView makeViewWithIdentifier:identifier owner:self]; @@ -288,18 +282,18 @@ -(NSView *)tableView:(NSTableView *)tableView viewForTableColumn:(NSTableColumn } // NSTableViewDelegate -- (BOOL)tableView:(NSTableView *)tableView -shouldTypeSelectForEvent:(NSEvent *)event -withCurrentSearchString:(NSString *)searchString +- (BOOL)tableView:(NSTableView*)tableView + shouldTypeSelectForEvent:(NSEvent*)event + withCurrentSearchString:(NSString*)searchString { // Return NO to prevent type select (otherwise S or N key will search that key) // This is nice on long lists though. 
return NO; } -- (void)tableViewSelectionDidChange:(NSNotification *)notification +- (void)tableViewSelectionDidChange:(NSNotification*)notification { - // does not need to respond, have a listener on this notification + // does not need to respond, have a listener on this notification } @end @@ -326,7 +320,7 @@ - (instancetype)init + (BOOL)autosavesInPlace { - return NO; // YES; + return NO; // YES; } // call when "new" called @@ -339,8 +333,8 @@ - (void)makeWindowControllers //addWindowController:controller]; } -- (NSData *)dataOfType:(nonnull NSString *)typeName - error:(NSError *_Nullable __autoreleasing *)outError +- (NSData*)dataOfType:(nonnull NSString*)typeName + error:(NSError* _Nullable __autoreleasing*)outError { // Insert code here to write your document to data of the specified type. If // outError != NULL, ensure that you create and set an appropriate error if @@ -352,18 +346,18 @@ - (NSData *)dataOfType:(nonnull NSString *)typeName return nil; } -- (BOOL)readFromURL:(nonnull NSURL *)url - ofType:(nonnull NSString *)typeName - error:(NSError *_Nullable __autoreleasing *)outError +- (BOOL)readFromURL:(nonnull NSURL*)url + ofType:(nonnull NSString*)typeName + error:(NSError* _Nullable __autoreleasing*)outError { // called from OpenRecent documents menu - + // throw into an array - NSArray* urls = @[url]; - + NSArray* urls = @[ url ]; + NSApplication* app = [NSApplication sharedApplication]; MyMTKView* view = app.mainWindow.contentView; - + BOOL success = [view loadTextureFromURLs:urls]; if (success) { // Note: if I return NO from this call then a dialog pops up that image @@ -393,24 +387,24 @@ @interface AppDelegate () @implementation AppDelegate -- (void)applicationDidFinishLaunching:(NSNotification *)aNotification +- (void)applicationDidFinishLaunching:(NSNotification*)aNotification { // Insert code here to initialize your application } -- (void)applicationWillTerminate:(NSNotification *)aNotification +- (void)applicationWillTerminate:(NSNotification*)aNotification { // Insert code here to tear down your application } - (BOOL)applicationShouldTerminateAfterLastWindowClosed: - (NSApplication *)sender + (NSApplication*)sender { return YES; } -- (void)application:(NSApplication *)sender - openURLs:(nonnull NSArray *)urls +- (void)application:(NSApplication*)sender + openURLs:(nonnull NSArray*)urls { // this is called from "Open In..." MyMTKView* view = sender.mainWindow.contentView; @@ -418,13 +412,41 @@ - (void)application:(NSApplication *)sender [view fixupDocumentList]; } +/* may need to add code for NSSavePanel for perftrace (.gz) +- (void)exportDocument:(NSString*)name toType:(NSString*)typeUTI +{ + NSWindow* window = [[[self windowControllers] objectAtIndex:0] window]; + + // Build a new name for the file using the current name and + // the filename extension associated with the specified UTI. + CFStringRef newExtension = UTTypeCopyPreferredTagWithClass((CFStringRef)typeUTI, + kUTTagClassFilenameExtension); + NSString* newName = [[name stringByDeletingPathExtension] + stringByAppendingPathExtension:(NSString*)newExtension]; + CFRelease(newExtension); + + // Set the default name for the file and show the panel. + NSSavePanel* panel = [NSSavePanel savePanel]; + [panel setNameFieldStringValue:newName]; + [panel beginSheetModalForWindow:window completionHandler:^(NSInteger result){ + if (result == NSFileHandlingPanelOKButton) + { + NSURL* theFile = [panel URL]; + + // Write the contents in the new format. 
+ + } + }]; +} +*/ + // this isn't filtered by the document types specified, NSDocumentController? // added public.folder instead, this would need to call readFromURL - (IBAction)openDocument:(id)sender { // need to implement, or default NSOpenPanel can't specify a directory NSDocumentController* controller = [NSDocumentController sharedDocumentController]; - + #if 0 // Would be nice, but doesn't allow directory. // How is NSDocument aware of directory, from Info.plist? @@ -434,35 +456,33 @@ - (IBAction)openDocument:(id)sender // // } #else - + NSOpenPanel* panel = [NSOpenPanel openPanel]; [panel setCanChooseFiles:YES]; [panel setCanChooseDirectories:YES]; [panel setAllowsMultipleSelection:NO]; - if ([controller runModalOpenPanel:panel forTypes:utis] == NSModalResponseOK) - { + if ([controller runModalOpenPanel:panel forTypes:utis] == NSModalResponseOK) { NSArray* urls = [panel URLs]; NSURL* url = [urls objectAtIndex:0]; - + // This gets a file:// urls, and then openDocument won't open it if // it's a folder. - + bool isDirectory = false; if (url.isFileURL) { BOOL isDir = NO; // Verify that the file exists // and is indeed a directory (isDirectory is an out parameter) - if ([[NSFileManager defaultManager] fileExistsAtPath: url.path isDirectory: &isDir] - && isDir) { + if ([[NSFileManager defaultManager] fileExistsAtPath:url.path isDirectory:&isDir] && isDir) { isDirectory = true; } } - + if (isDirectory) { // have to open this directory URL directly //[self openURLs:[NSApplication sharedApplication] urls:urls]; - + // this is called from "Open In..." NSApplication* app = [NSApplication sharedApplication]; MyMTKView* view = app.mainWindow.contentView; @@ -473,12 +493,11 @@ - (IBAction)openDocument:(id)sender [controller openDocumentWithContentsOfURL:url display:YES completionHandler: - ^(NSDocument* doc, BOOL isAlreadOpen, NSError* error ) { - if (!error) { - // what should this do? - } - } - ]; + ^(NSDocument* doc, BOOL isAlreadOpen, NSError* error) { + if (!error) { + // what should this do? + } + }]; } } #endif @@ -539,16 +558,16 @@ - (IBAction)showAboutDialog:(id)sender @end -NSArray* pasteboardTypes = @[ +NSArray* pasteboardTypes = @[ // don't really want generic urls, but need folders to drop //NSPasteboardTypeURL - + // this is preventing folder drops ? NSPasteboardTypeFileURL ]; /* correlates with - + public.directory. 
public.png, org.khronos.ktx, @@ -558,33 +577,26 @@ - (IBAction)showAboutDialog:(id)sender dyn.ah62d4rv4ge8043pyqf0g24pc, // ick - metallib dyn.ah62d4rv4ge80s5dyq2, // ick - gltf dyn.ah62d4rv4ge80s5dc // ick - glb - -*/ - - - - - +*/ //---------------------------------------------------- - @implementation MyMTKView { - NSMenu* _viewMenu; // really the items + NSMenu* _viewMenu; // really the items NSStackView* _buttonStack; - NSMutableArray* _buttonArray; + NSMutableArray* _buttonArray; NSTextField* _hudLabel; NSTextField* _hudLabel2; - + // Offer list of files in archives // TODO: move to NSOutlineView since that can show archive folders with content inside IBOutlet NSTableView* _tableView; IBOutlet TableViewController* _tableViewController; - + // copy of modifier flags, can tie drop actions to this NSEventModifierFlags _modifierFlags; - + ShowSettings* _showSettings; Data _data; } @@ -592,16 +604,23 @@ @implementation MyMTKView { - (void)awakeFromNib { [super awakeFromNib]; - + // vertical offset of table down so hud can display info NSScrollView* scrollView = [_tableView enclosingScrollView]; CGRect rect = scrollView.frame; rect.origin.y += 50; scrollView.frame = rect; - + // C++ delegate _data._delegate.view = (__bridge void*)self; - + + // this is sandbox or root if not sandboxed + // This is objC call... + // This has to be in a .mm file to call + std::string traceDir = [NSHomeDirectory() UTF8String]; + traceDir += "/traces/"; + _data.setPerfDirectory(traceDir.c_str()); + // TODO: see if can only open this // KLOGI("Viewer", "AwakeFromNIB"); } @@ -623,7 +642,7 @@ - (BOOL)isFlipped // TODO: Sometimes getting panels from right side popping in when trying to pan // on macOS without using pan gesture. -- (instancetype)initWithCoder:(NSCoder *)coder +- (instancetype)initWithCoder:(NSCoder*)coder { self = [super initWithCoder:coder]; @@ -631,7 +650,7 @@ - (instancetype)initWithCoder:(NSCoder *)coder self.clearColor = MTLClearColorMake(0.005f, 0.005f, 0.005f, 0.0f); - self.clearDepth = _showSettings->isReverseZ ? 0.0f : 1.0f; + self.clearDepth = 0.0f; // only re-render when changes are made // Note: this breaks ability to gpu capture, since display link not running. @@ -642,7 +661,7 @@ - (instancetype)initWithCoder:(NSCoder *)coder // added for drag-drop support [self registerForDraggedTypes:pasteboardTypes]; - + // This gesture only works for trackpad _zoomGesture = [[NSMagnificationGestureRecognizer alloc] initWithTarget:self @@ -658,32 +677,32 @@ - (instancetype)initWithCoder:(NSCoder *)coder // hide until image loaded _showSettings->isHideUI = true; _buttonStack.hidden = YES; - + _hudLabel2 = [self _addHud:YES]; _hudLabel = [self _addHud:NO]; [self setHudText:""]; - + return self; } -- (nonnull ShowSettings *)showSettings +- (nonnull ShowSettings*)showSettings { return _showSettings; } -- (nonnull kram::Data *)data +- (nonnull kram::Data*)data { return &_data; } --(void)fixupDocumentList +- (void)fixupDocumentList { // DONE: this recent menu only seems to work the first time // and not in subsequent calls to the same entry. readFromUrl isn't even // called. So don't get a chance to switch back to a recent texture. Maybe // there's some list of documents created and so it doesn't think the file // needs to be reloaded. 
- + // Clear the document list so readFromURL keeps getting called // Can't remove currentDoc, so have to skip that NSDocumentController* dc = [NSDocumentController sharedDocumentController]; @@ -699,17 +718,15 @@ -(void)fixupDocumentList } } - - -- (NSStackView *)_addButtons +- (NSStackView*)_addButtons { _data.initActions(); - + NSRect rect = NSMakeRect(0, 10, 30, 30); vector& actions = _data.actions(); int32_t numActions = actions.size(); - + NSMutableArray* buttons = [[NSMutableArray alloc] init]; for (int32_t i = 0; i < numActions; ++i) { @@ -734,40 +751,40 @@ - (NSStackView *)_addButtons [button setFrame:rect]; // https://stackoverflow.com/questions/4467597/how-do-you-stroke-the-outside-of-an-nsattributedstring - + NSMutableDictionary* attribsOff = [NSMutableDictionary dictionaryWithObjectsAndKeys: - //[NSFont systemFontOfSize:64.0],NSFontAttributeName, - [NSColor whiteColor],NSForegroundColorAttributeName, - [NSNumber numberWithFloat:-2.0],NSStrokeWidthAttributeName, - [NSColor blackColor],NSStrokeColorAttributeName, - nil]; + //[NSFont systemFontOfSize:64.0],NSFontAttributeName, + [NSColor whiteColor], NSForegroundColorAttributeName, + [NSNumber numberWithFloat:-2.0], NSStrokeWidthAttributeName, + [NSColor blackColor], NSStrokeColorAttributeName, + nil]; NSMutableDictionary* attribsOn = [NSMutableDictionary dictionaryWithObjectsAndKeys: - //[NSFont systemFontOfSize:64.0],NSFontAttributeName, - [NSColor systemBlueColor],NSForegroundColorAttributeName, - [NSNumber numberWithFloat:-2.0],NSStrokeWidthAttributeName, - [NSColor blackColor],NSStrokeColorAttributeName, - nil]; + //[NSFont systemFontOfSize:64.0],NSFontAttributeName, + [NSColor systemBlueColor], NSForegroundColorAttributeName, + [NSNumber numberWithFloat:-2.0], NSStrokeWidthAttributeName, + [NSColor blackColor], NSStrokeColorAttributeName, + nil]; button.attributedTitle = [[NSMutableAttributedString alloc] initWithString:name attributes:attribsOff]; - + // Have to set this too, or button doesn't go blue button.attributedAlternateTitle = [[NSMutableAttributedString alloc] initWithString:name attributes:attribsOn]; - + // stackView seems to disperse the items evenly across the area, so this // doesn't work bool isSeparator = icon[0] == 0; - + if (isSeparator) { // rect.origin.y += 11; button.enabled = NO; } else { action.button = (__bridge void*)button; - + // rect.origin.y += 25; // TODO: add icons //button.image = ...; - + // keep all buttons, since stackView will remove and pack the stack [_buttonArray addObject:button]; } @@ -778,7 +795,7 @@ - (NSStackView *)_addButtons NSStackView* stackView = [NSStackView stackViewWithViews:buttons]; stackView.orientation = NSUserInterfaceLayoutOrientationVertical; stackView.detachesHiddenViews = - YES; // default, but why have to have _buttonArrary + YES; // default, but why have to have _buttonArrary [self addSubview:stackView]; // Want menus, so user can define their own shortcuts to commands @@ -795,41 +812,41 @@ - (NSStackView *)_addButtons for (int32_t i = 0; i < numActions; ++i) { Action& action = actions[i]; - const char* icon = action.icon; // single char + const char* icon = action.icon; // single char const char* title = action.tip; NSString* toolTip = [NSString stringWithUTF8String:icon]; NSString* name = [NSString stringWithUTF8String:title]; bool isSeparator = icon[0] == 0; - + if (isSeparator) { [_viewMenu addItem:[NSMenuItem separatorItem]]; } else { // NSString *shortcut = @""; // for now, or AppKit turns key int cmd+shift+key NSString* shortcut = [NSString 
stringWithUTF8String:icon]; - + NSMenuItem* menuItem = [[NSMenuItem alloc] initWithTitle:name action:@selector(handleAction:) keyEquivalent:shortcut]; menuItem.toolTip = toolTip; - + // All key-equivalents assume cmd, so unset cmd // still leaves shift next to keys, but better than nothing menuItem.keyEquivalentModifierMask = (NSEventModifierFlags)0; - + // TODO: add icons, also onStateImage, offStageImage, mixedStateImage //menuItem.image = ...; - + // can set an integer constant that represents menu that avoid testing string (actionID) //menuItem.tag = ...; - + // TODO: menus and buttons should reflect any toggle state // menuItem.state = Mixed/Off/On; [_viewMenu addItem:menuItem]; - + action.menuItem = (__bridge void*)menuItem; } } @@ -837,27 +854,25 @@ - (NSStackView *)_addButtons [_viewMenu addItem:[NSMenuItem separatorItem]]; //---------------------- - + // don't want some buttons showing up, menu only _data.initDisabledButtons(); - + return stackView; } - - -- (NSTextField *)_addHud:(BOOL)isShadow +- (NSTextField*)_addHud:(BOOL)isShadow { // TODO: This text field is clamping to the height, so have it set to 1200. // really want field to expand to fill the window height for large output uint32_t w = 800; uint32_t h = 1220; - + // add a label for the hud NSTextField* label = [[MyNSTextField alloc] initWithFrame:NSMakeRect(isShadow ? 21 : 20, isShadow ? 21 : 20, w, h)]; - + label.preferredMaxLayoutWidth = w; label.drawsBackground = NO; @@ -868,12 +883,12 @@ - (NSTextField *)_addHud:(BOOL)isShadow label.editable = NO; label.selectable = NO; label.lineBreakMode = NSLineBreakByClipping; - label.maximumNumberOfLines = 0; // fill to height + label.maximumNumberOfLines = 0; // fill to height // important or interferes with table view label.refusesFirstResponder = YES; label.enabled = NO; - + label.cell.scrollable = NO; label.cell.wraps = NO; @@ -888,14 +903,13 @@ - (NSTextField *)_addHud:(BOOL)isShadow return label; } - -- (void)handleGesture:(NSGestureRecognizer *)gestureRecognizer +- (void)handleGesture:(NSGestureRecognizer*)gestureRecognizer { // skip until image loaded if (_showSettings->imageBoundsX == 0) { return; } - + // https://cocoaosxrevisited.wordpress.com/2018/01/06/chapter-18-mouse-events/ if (gestureRecognizer != _zoomGesture) { return; @@ -906,10 +920,10 @@ - (void)handleGesture:(NSGestureRecognizer *)gestureRecognizer float zoom = _zoomGesture.magnification; if (isFirstGesture) { _zoomGesture.magnification = 1.0f; - + _validMagnification = 1.0f; _originalZoom = _showSettings->zoom; - + zoom = _originalZoom; } else if (zoom * _originalZoom < 0.1f) { @@ -917,7 +931,7 @@ - (void)handleGesture:(NSGestureRecognizer *)gestureRecognizer zoom = 0.1f / _originalZoom; _zoomGesture.magnification = zoom; } - + if (!isFirstGesture) { // try expontental (this causes a jump, comparison avoids an initial jump // zoom = powf(zoom, 1.05f); @@ -925,29 +939,28 @@ - (void)handleGesture:(NSGestureRecognizer *)gestureRecognizer // doing multiply instead of equals here, also does exponential zom zoom *= _originalZoom; } - + [self updateZoom:zoom]; } --(void)updateZoom:(float)zoom +- (void)updateZoom:(float)zoom { // https://developer.apple.com/documentation/uikit/touches_presses_and_gestures/handling_uikit_gestures/handling_pinch_gestures?language=objc // need to sync up the zoom when action begins or zoom will jump - // https://stackoverflow.com/questions/30002361/image-zoom-centered-on-mouse-position // DONE: rect is now ar:1 for rect case, so these x values need to be half ar // and that's only 
if it's not rotated. box/cube/ellipse make also not correspond float ar = _showSettings->imageAspectRatio(); - + // find the cursor location with respect to the image - float4 bottomLeftCorner = float4m(-0.5 * ar, -0.5f, 0.0f, 1.0f); - float4 topRightCorner = float4m(0.5 * ar, 0.5f, 0.0f, 1.0f); + float4 bottomLeftCorner = float4m(-0.5f * ar, -0.5f, 0.0f, 1.0f); + float4 topRightCorner = float4m(0.5f * ar, 0.5f, 0.0f, 1.0f); float4x4 newMatrix = _data.computeImageTransform(_showSettings->panX, - _showSettings->panY, - zoom); + _showSettings->panY, + zoom); // don't allow panning the entire image off the view boundary // transform the upper left and bottom right corner of the image @@ -960,7 +973,7 @@ -(void)updateZoom:(float)zoom // see that rectangle intersects the view, view is -1 to 1 // this handles inversion - float2 ptOrigin = simd::min(pt0.xy, pt1.xy); + float2 ptOrigin = SIMD_NAMESPACE::min(pt0.xy, pt1.xy); float2 ptSize = abs(pt0.xy - pt1.xy); float4 imageRect = float4m(ptOrigin.x, ptOrigin.y, ptSize.x, ptSize.y); @@ -988,23 +1001,23 @@ -(void)updateZoom:(float)zoom //float minZoom = std::min(1.0f/8.0f, _showSettings->zoomFit); // TODO: 3d models have imageBoundsY of 1, so the limits are hit immediately - + int32_t gap = _showSettings->showAllPixelGap; - + // Note this includes chunks and mips even if those are not shown // so image could be not visible. float2 maxZoomXY; maxZoomXY.x = maxZoom * (_showSettings->imageBoundsX + gap) * numTexturesX; maxZoomXY.y = maxZoom * (_showSettings->imageBoundsY + gap) * numTexturesY; - + float minPixelSize = 4; float2 minZoomXY; minZoomXY.x = minPixelSize; // minZoom * (_showSettings->imageBoundsX + gap) * numTexturesX; minZoomXY.y = minPixelSize; // minZoom * (_showSettings->imageBoundsY + gap) * numTexturesY; - + // don't allow image to get too big bool isZoomChanged = true; - + if (visibleWidth > maxZoomXY.x || visibleHeight > maxZoomXY.y) { isZoomChanged = false; } @@ -1018,7 +1031,7 @@ -(void)updateZoom:(float)zoom if (!rectIntersectsRect(imageRect, viewRect)) { isZoomChanged = false; } - + if (!isZoomChanged) { _zoomGesture.magnification = _validMagnification; // objC return; @@ -1039,47 +1052,46 @@ -(void)updateZoom:(float)zoom _showSettings->panX = newPan.x; _showSettings->panY = newPan.y; -// if (doPrintPanZoom) { -// string text; -// sprintf(text, -// "Pan %.3f,%.3f\n" -// "Zoom %.2fx\n", -// _showSettings->panX, _showSettings->panY, _showSettings->zoom); -// [self setHudText:text.c_str()]; -// } + // if (doPrintPanZoom) { + // string text; + // sprintf(text, + // "Pan %.3f,%.3f\n" + // "Zoom %.2fx\n", + // _showSettings->panX, _showSettings->panY, _showSettings->zoom); + // [self setHudText:text.c_str()]; + // } // Cause a new sample for eyedropper _data.updateEyedropper(); - + self.needsDisplay = YES; // objC } } - // left mouse button down -- (void)mouseDown:(NSEvent *)event +- (void)mouseDown:(NSEvent*)event { // skip until image loaded if (_showSettings->imageBoundsX == 0) { return; } - + _mouseData.originPoint = - _mouseData.oldPoint = - _mouseData.newPoint = [self convertPoint:[event locationInWindow] fromView:nil]; + _mouseData.oldPoint = + _mouseData.newPoint = [self convertPoint:[event locationInWindow] fromView:nil]; // capture pan value and cursor value _mouseData.pan = NSMakePoint(_showSettings->panX, _showSettings->panY); } // drag is mouse movement with left button down -- (void)mouseDragged:(NSEvent *)event +- (void)mouseDragged:(NSEvent*)event { // skip until image loaded if (_showSettings->imageBoundsX == 0) { 
return; } - + _mouseData.oldPoint = _mouseData.newPoint; _mouseData.newPoint = [self convertPoint:[event locationInWindow] fromView:nil]; @@ -1089,38 +1101,37 @@ - (void)mouseDragged:(NSEvent *)event delta.y = _mouseData.newPoint.y - _mouseData.originPoint.y; delta.x = -delta.x; delta.y = -delta.y; - + // scale to actual px or mouse cursor doesn't track drag delta.x *= _showSettings->viewContentScaleFactor; delta.y *= _showSettings->viewContentScaleFactor; - + // This is correct, but scale to image so cursor tracks the pick location // might be over a different mip/chunk though. float panX = _mouseData.pan.x + delta.x; float panY = _mouseData.pan.y + delta.y; - + [self updatePan:panX panY:panY]; } -- (void)mouseUp:(NSEvent *)event +- (void)mouseUp:(NSEvent*)event { // ignore up even though cursor may have moved - } -- (void)mouseMoved:(NSEvent *)event +- (void)mouseMoved:(NSEvent*)event { // skip until image loaded if (_showSettings->imageBoundsX == 0) { return; } - + // pixel in non-square window coords, run through inverse to get texel space // I think magnofication of zoom gesture is affecting coordinates reported by // this NSPoint point = [event locationInWindow]; - + // This flips so upper left corner is 0,0, vs. bottom left point = [self convertPoint:point fromView:nil]; @@ -1130,35 +1141,32 @@ - (void)mouseMoved:(NSEvent *)event _showSettings->cursorY = (int32_t)point.y; _data.updateEyedropper(); - + // Cause a new sample for eyedropper (will run in Metal CompletedHandler) self.needsDisplay = YES; } - - - --(void)updateEyedropperText +- (void)updateEyedropperText { if (_showSettings->imageBoundsX == 0) return; - + float2 uv; uv.x = _showSettings->textureLookupX / (float)_showSettings->imageBoundsX; uv.y = _showSettings->textureLookupY / (float)_showSettings->imageBoundsY; - + // convert data to text _data.showEyedropperData(uv); - + const Atlas* atlas = _data.findAtlasAtUV(uv); if (atlas) { // convert back to pixels in the current mip float mipBoundsX = std::max(1, _showSettings->imageBoundsX >> _showSettings->mipNumber); float mipBoundsY = std::max(1, _showSettings->imageBoundsY >> _showSettings->mipNumber); - + float4 rect = atlas->rect(); rect.xz *= mipBoundsX; rect.yw *= mipBoundsY; - + string atlasText; sprintf(atlasText, "%d,%d %dx%d %s", (int32_t)rect.x, (int32_t)rect.y, @@ -1173,13 +1181,13 @@ -(void)updateEyedropperText [self updateHudText]; } -- (void)setEyedropperText:(const char *)text +- (void)setEyedropperText:(const char*)text { _data.setEyedropperText(text); [self updateHudText]; } -- (void)setHudText:(const char *)text +- (void)setHudText:(const char*)text { _data.setTextSlot(kTextSlotHud, text); [self updateHudText]; @@ -1190,29 +1198,29 @@ - (void)updateHudText // combine textSlots string text = _data.textFromSlots(_tableView.hidden); - NSString *textNS = [NSString stringWithUTF8String:text.c_str()]; - + NSString* textNS = [NSString stringWithUTF8String:text.c_str()]; + // This is drop shadowed by drawing same text twice _hudLabel2.stringValue = textNS; _hudLabel2.needsDisplay = YES; - + _hudLabel.stringValue = textNS; _hudLabel.needsDisplay = YES; } -- (void)scrollWheel:(NSEvent *)event +- (void)scrollWheel:(NSEvent*)event { // skip until image loaded if (_showSettings->imageBoundsX == 0) { return; } - + // From ImGui notes: // From macOS 12.1, scrolling with two fingers and then decelerating // by tapping two fingers results in two events appearing. 
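// Backing up to updateEyedropperText above: a standalone C++ sketch (not part
// of the patch) of the uv -> mip-pixel mapping used to report the atlas rect
// in the currently displayed mip. The image size, mip number, and lookup
// point below are made-up values for illustration.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    int32_t imageBoundsX = 1024, imageBoundsY = 512; // mip 0 size
    int32_t mipNumber = 3;                           // currently displayed mip

    // normalized uv from a texture lookup at mip 0
    float u = 300.0f / imageBoundsX;
    float v = 200.0f / imageBoundsY;

    // each mip halves the bounds; clamp so a non-square chain bottoms out at 1
    float mipBoundsX = std::max(1, imageBoundsX >> mipNumber);
    float mipBoundsY = std::max(1, imageBoundsY >> mipNumber);

    // convert uv back to pixel coordinates within the current mip
    std::printf("mip px %d,%d of %dx%d\n",
                (int32_t)(u * mipBoundsX), (int32_t)(v * mipBoundsY),
                (int32_t)mipBoundsX, (int32_t)mipBoundsY); // -> mip px 37,25 of 128x64
    return 0;
}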
if (event.phase == NSEventPhaseCancelled) return; - + double wheelX = [event scrollingDeltaX]; double wheelY = [event scrollingDeltaY]; @@ -1221,20 +1229,20 @@ - (void)scrollWheel:(NSEvent *)event // and trackpad fires on that too causing the image to zoom away to nothing (inertia maybe) // https://stackoverflow.com/questions/6642058/mac-cocoa-how-can-i-detect-trackpad-scroll-gestures bool isMouse = ![event hasPreciseScrollingDeltas]; - + if (isMouse) { // zoom with mouse float zoom = _zoomGesture.magnification; if (wheelY != 0.0) { wheelY *= 0.01; wheelY = std::clamp(wheelY, -0.1, 0.1); - + zoom *= 1.0 + wheelY; - + // here have to modify the magnfication, since gesture isn't driving it _zoomGesture.magnification = zoom; - - [self updateZoom: zoom]; + + [self updateZoom:zoom]; } } else { @@ -1244,7 +1252,7 @@ - (void)scrollWheel:(NSEvent *)event float panX = _showSettings->panX + wheelX; float panY = _showSettings->panY + wheelY; - + [self updatePan:panX panY:(float)panY]; } } @@ -1254,12 +1262,12 @@ bool rectIntersectsRect(float4 lhs, float4 rhs) // convert rect from (origin, size) to (min, max) float4 lRect = lhs.xyxy; lRect.zw += lhs.zw; - + float4 rRect = rhs.xyxy; rRect.zw += rhs.zw; - + return all(lRect.xy <= rRect.zw) && // min <= max - all(lRect.zw >= rRect.xy); // max >= min + all(lRect.zw >= rRect.xy); // max >= min } // TODO: move to data, but eliminate CGRect usage @@ -1268,23 +1276,23 @@ - (void)updatePan:(float)panX panY:(float)panY //Renderer* renderer = (Renderer *)self.delegate; float4x4 projectionViewModelMatrix = _data.computeImageTransform(panX, - panY, - _showSettings->zoom); + panY, + _showSettings->zoom); // don't allow panning the entire image off the view boundary // transform the upper left and bottom right corner or the image // what if zoom moves it outside? 
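// A standalone C++ restatement (not part of the patch) of the (origin, size)
// rect-overlap test that updateZoom above and updatePan here rely on to reject
// a pan or zoom that would push the image rect entirely outside the -1..1 view.
// Plain floats stand in for the float4 swizzles; the sample rects are made up.
#include <cstdio>

struct Rect { float x, y, w, h; }; // origin + size, like imageRect/viewRect above

static bool rectIntersectsRect(Rect a, Rect b) {
    // convert to (min, max) and require min <= max overlap on both axes
    return a.x <= b.x + b.w && a.x + a.w >= b.x &&
           a.y <= b.y + b.h && a.y + a.h >= b.y;
}

int main() {
    Rect view{-1.0f, -1.0f, 2.0f, 2.0f};       // NDC view, origin (-1,-1), size (2,2)
    Rect onScreen{-0.25f, -0.25f, 0.5f, 0.5f}; // image rect straddling the center
    Rect offScreen{3.0f, 3.0f, 0.5f, 0.5f};    // image panned completely off the view

    std::printf("%d %d\n", rectIntersectsRect(onScreen, view),
                rectIntersectsRect(offScreen, view)); // -> 1 0
    return 0;
}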
float ar = _showSettings->imageAspectRatio(); - - float4 pt0 = projectionViewModelMatrix * float4m(-0.5 * ar, -0.5f, 0.0f, 1.0f); - float4 pt1 = projectionViewModelMatrix * float4m(0.5 * ar, 0.5f, 0.0f, 1.0f); + + float4 pt0 = projectionViewModelMatrix * float4m(-0.5f * ar, -0.5f, 0.0f, 1.0f); + float4 pt1 = projectionViewModelMatrix * float4m(0.5f * ar, 0.5f, 0.0f, 1.0f); // for perspective pt0.xyz /= pt0.w; pt1.xyz /= pt1.w; - float2 ptOrigin = simd::min(pt0.xy, pt1.xy); + float2 ptOrigin = SIMD_NAMESPACE::min(pt0.xy, pt1.xy); float2 ptSize = abs(pt0.xy - pt1.xy); // see that rectangle intersects the view, view is -1 to 1 @@ -1309,14 +1317,14 @@ - (void)updatePan:(float)panX panY:(float)panY _showSettings->panX = panX; _showSettings->panY = panY; -// if (doPrintPanZoom) { -// string text; -// sprintf(text, -// "Pan %.3f,%.3f\n" -// "Zoom %.2fx\n", -// _showSettings->panX, _showSettings->panY, _showSettings->zoom); -// [self setHudText:text.c_str()]; -// } + // if (doPrintPanZoom) { + // string text; + // sprintf(text, + // "Pan %.3f,%.3f\n" + // "Zoom %.2fx\n", + // _showSettings->panX, _showSettings->panY, _showSettings->zoom); + // [self setHudText:text.c_str()]; + // } // Cause a new sample from Metal to eyeDropper _data.updateEyedropper(); @@ -1337,9 +1345,6 @@ - (BOOL)validateUserInterfaceItem:(id)item return YES; } - - - - (IBAction)handleAction:(id)sender { NSEvent* theEvent = [NSApp currentEvent]; @@ -1353,21 +1358,21 @@ - (IBAction)handleAction:(id)sender else if ([sender isKindOfClass:[NSMenuItem class]]) { action = _data.actionFromMenu(senderPtr); } - + if (!action) { KLOGE("kram", "unknown UI element"); return; } - + [self handleEventAction:action isShiftKeyDown:isShiftKeyDown]; } -- (void)flagsChanged:(NSEvent *)theEvent +- (void)flagsChanged:(NSEvent*)theEvent { _modifierFlags = theEvent.modifierFlags; } -- (void)keyDown:(NSEvent *)theEvent +- (void)keyDown:(NSEvent*)theEvent { bool isShiftKeyDown = theEvent.modifierFlags & NSEventModifierFlagShift; uint32_t keyCode = theEvent.keyCode; @@ -1375,19 +1380,19 @@ - (void)keyDown:(NSEvent *)theEvent // for now hit esc to hide the table views if (keyCode == Key::Escape) { [self hideFileTable]; - + _hudHidden = false; [self updateHudVisibility]; return; } - + const Action* action = _data.actionFromKey(keyCode); if (!action) { [super keyDown:theEvent]; //KLOGE("kram", "unknown UI element"); return; } - + bool isHandled = [self handleEventAction:action isShiftKeyDown:isShiftKeyDown]; if (!isHandled) { // this will bonk @@ -1417,20 +1422,19 @@ - (void)updateHudVisibility _hudLabel2.hidden = _hudHidden || !_showSettings->isHudShown; } - - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyDown { Renderer* renderer = (Renderer*)self.delegate; - + ActionState actionState; if (!_data.handleEventAction(action, isShiftKeyDown, actionState)) return false; - + // Do the leftover action work to call ObjC if (action == _data._actionVertical) { _buttonStack.orientation = _showSettings->isVerticalUI - ? NSUserInterfaceLayoutOrientationVertical - : NSUserInterfaceLayoutOrientationHorizontal; + ? 
NSUserInterfaceLayoutOrientationVertical + : NSUserInterfaceLayoutOrientationHorizontal; } else if (action == _data._actionHideUI) { _buttonStack.hidden = _showSettings->isHideUI; @@ -1440,7 +1444,6 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD } else if (action == _data._actionInfo) { if (_showSettings->isHudShown) { - // also hide the file table, since this can be long [self hideFileTable]; } @@ -1454,7 +1457,7 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD // tell the renderer to show one or other view renderer.isToggleView = !_showSettings->isSRGBShown; } - + //------------- // Update everything if (!actionState.hudText.empty()) { @@ -1471,10 +1474,6 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD return true; } - - - - // Note: docs state that drag&drop should be handled automatically by UTI setup // via openURLs but I find these calls are needed, or it doesn't work. Maybe // need to register for NSURL instead of NSPasteboardTypeFileURL. For example, @@ -1486,7 +1485,7 @@ - (NSDragOperation)draggingEntered:(id)sender if (([sender draggingSourceOperationMask] & NSDragOperationGeneric) == NSDragOperationGeneric) { NSPasteboard* pasteboard = [sender draggingPasteboard]; - + bool canReadPasteboardObjects = [pasteboard canReadObjectForClasses:@[ [NSURL class] ] options:pasteboardOptions]; @@ -1509,9 +1508,9 @@ - (BOOL)prepareForDragOperation:(id)sender - (BOOL)performDragOperation:(id)sender { NSPasteboard* pasteboard = [sender draggingPasteboard]; - - NSArray* urls = [pasteboard readObjectsForClasses:@[[NSURL class]] - options: pasteboardOptions]; + + NSArray* urls = [pasteboard readObjectsForClasses:@[ [NSURL class] ] + options:pasteboardOptions]; int filesCount = [urls count]; if (filesCount > 0) { if ([self loadTextureFromURLs:urls]) { @@ -1523,10 +1522,6 @@ - (BOOL)performDragOperation:(id)sender return NO; } - - - - - (void)updateFileSelection { // set selection @@ -1535,18 +1530,20 @@ - (void)updateFileSelection [_tableView scrollRowToVisible:fileIndex]; } -- (BOOL)setImageFromSelection:(NSInteger)index { +- (BOOL)setImageFromSelection:(NSInteger)index +{ if (!_data._files.empty()) { if (_data._fileIndex != index) { _data._fileIndex = index; return [self loadFile]; } } - + return NO; } -- (BOOL)setShapeFromSelection:(NSInteger)index { +- (BOOL)setShapeFromSelection:(NSInteger)index +{ if (_showSettings->meshNumber != index) { _showSettings->meshNumber = index; self.needsDisplay = YES; @@ -1555,137 +1552,130 @@ - (BOOL)setShapeFromSelection:(NSInteger)index { return NO; } - - - --(BOOL)loadFile +- (BOOL)loadFile { if (_data._files.empty()) return NO; - + // lookup the filename and data at that entry const File& file = _data._files[_data._fileIndex]; const char* filename = file.nameShort.c_str(); - - setErrorLogCapture( true ); - + + setErrorLogCapture(true); + bool success = _data.loadFile(); - + // Update these settings - + if (!success) { string errorText; getErrorLogCaptureText(errorText); setErrorLogCapture(false); - + string finalErrorText; - append_sprintf(finalErrorText, "Could not load from file:\n %s\n", - filename); + // this does have previous filename set + _data.setFailedText(file.name.c_str(), finalErrorText); finalErrorText += errorText; - + [self setHudText:finalErrorText.c_str()]; return NO; } - setErrorLogCapture( false ); - + setErrorLogCapture(false); + //------- Renderer* renderer = (Renderer*)self.delegate; - + _showSettings->isSRGBShown = false; if 
(success && renderer.hasToggleView) { _showSettings->isSRGBShown = isSrgbFormat(_showSettings->originalFormat); } - + renderer.playAnimations = _showSettings->isPlayAnimations; renderer.isToggleView = !_showSettings->isSRGBShown; - + // ------------- string title = _showSettings->windowTitleString(filename); self.window.title = [NSString stringWithUTF8String:title.c_str()]; - + // doesn't set imageURL or update the recent document menu - + // show the controls if (_data._noImageLoaded) { _showSettings->isHideUI = false; - _buttonStack.hidden = NO; // show controls + _buttonStack.hidden = NO; // show controls _data._noImageLoaded = false; } - + // show/hide button _data.updateUIAfterLoad(); - + self.needsDisplay = YES; return YES; } - - --(void)loadFilesFromUrls:(NSArray*)urls skipSubdirs:(BOOL)skipSubdirs +- (void)loadFilesFromUrls:(NSArray*)urls skipSubdirs:(BOOL)skipSubdirs { // convert urls to vector for C++ vector urlStrings; for (NSURL* url in urls) { urlStrings.push_back(url.fileSystemRepresentation); } - + // C++ to build list _data.loadFilesFromUrls(urlStrings, skipSubdirs); - + //------------------- - + NSMutableDictionary* attribsOff = [NSMutableDictionary dictionaryWithObjectsAndKeys: - //[NSFont systemFontOfSize:64.0],NSFontAttributeName, - [NSColor whiteColor],NSForegroundColorAttributeName, - [NSNumber numberWithFloat:-2.0],NSStrokeWidthAttributeName, - [NSColor blackColor],NSStrokeColorAttributeName, - nil]; - + //[NSFont systemFontOfSize:64.0],NSFontAttributeName, + [NSColor whiteColor], NSForegroundColorAttributeName, + [NSNumber numberWithFloat:-2.0], NSStrokeWidthAttributeName, + [NSColor blackColor], NSStrokeColorAttributeName, + nil]; + // add the files into the file list [_tableViewController.items removeAllObjects]; - for (const auto& file: _data._files) { + for (const auto& file : _data._files) { const char* filenameShort = file.nameShort.c_str(); - - NSString* fileMenuText = [NSString stringWithUTF8String: filenameShort]; + + NSString* fileMenuText = [NSString stringWithUTF8String:filenameShort]; NSMutableAttributedString* fileMenuStr = [[NSMutableAttributedString alloc] initWithString:fileMenuText attributes:attribsOff]; - + [_tableViewController.items addObject:fileMenuStr]; } - + // reloadData calls selectionDidChange which then sets _fileIndex = 0; uint32_t fileIndex = _data._fileIndex; [_tableView reloadData]; _data._fileIndex = fileIndex; - + [self updateFileSelection]; [self hideFileTable]; - + // add it to recent docs (only 10 slots) if (urls.count == 1) { NSDocumentController* dc = - [NSDocumentController sharedDocumentController]; + [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:urls[0]]; } } - - (BOOL)loadTextureFromURLs:(NSArray*)urls { // turn back on the hud if was in a list view _hudHidden = false; [self updateHudVisibility]; - + const char* filename = ""; NSURL* url = urls[0]; - if ([url.scheme isEqualToString:@"kram"]) - { + if ([url.scheme isEqualToString:@"kram"]) { // the resource specifier has port and other data // for now treat this as a local file path. - + // kram://filename.ktx filename = [url.resourceSpecifier UTF8String]; filename = filename + 2; // skip the // - + // can't get Slack to honor links like these // with a kram:///Users/... 
// or with kram://~/blah @@ -1694,44 +1684,43 @@ - (BOOL)loadTextureFromURLs:(NSArray*)urls // also need this same treatment instead // of relying on url.fileSystemRepresentation } - else - { + else { filename = url.fileSystemRepresentation; } bool isSingleFile = urls.count == 1; - - Renderer* renderer = (Renderer *)self.delegate; - + + Renderer* renderer = (Renderer*)self.delegate; + // Handle shader hotload if (isSingleFile && endsWithExtension(filename, ".metallib")) { if ([renderer hotloadShaders:filename]) { NSURL* metallibFileURL = - [NSURL fileURLWithPath:[NSString stringWithUTF8String:filename]]; - + [NSURL fileURLWithPath:[NSString stringWithUTF8String:filename]]; + // add to recent docs, so can reload quickly NSDocumentController* dc = - [NSDocumentController sharedDocumentController]; + [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:metallibFileURL]; - + return YES; } return NO; } - + // don't leave archive table open if (isSingleFile) [self hideFileTable]; // only recurse down subdirs if cmd key held during drop or recent menu item selection - bool skipSubdirs = ( _modifierFlags & NSEventModifierFlagCommand ) == 0; - + bool skipSubdirs = (_modifierFlags & NSEventModifierFlagCommand) == 0; + [self loadFilesFromUrls:urls skipSubdirs:skipSubdirs]; - + BOOL success = [self loadFile]; return success; } - --(BOOL)loadModelFile:(const char*)filename + +- (BOOL)loadModelFile:(const char*)filename { #if USE_GLTF // Right now can only load these if they are embedded, since sandbox will @@ -1739,32 +1728,32 @@ -(BOOL)loadModelFile:(const char*)filename // related items, but they must all be named the same. I think if folder // instead of the file is selected, then could search and find the gltf files // and the other files. - + //---------------------- // These assets should be combined into a single hierarchy, and be able to // save out a scene with all of them in a single scene. But that should // probably reference original content in case it's updated. - + // const char* filenameShort = toFilenameShort(filename); //double timestamp = FileHelper::modificationTimestamp(filename); - + // TODO: this used to compare filename timestamp? 
- + // This code only takes url, so construct one - Renderer* renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer*)self.delegate; [renderer releaseAllPendingTextures]; BOOL success = [renderer loadModel:filename]; - + // TODO: split this off to a completion handler, since loadModel is async // and should probably also have a cancellation (or counter) - + // show/hide button _data.updateUIAfterLoad(); - + if (!success) { return NO; } - + return success; #else return NO; @@ -1781,10 +1770,9 @@ - (void)concludeDragOperation:(id)sender // did setNeedsDisplay, but already doing that in loadTextureFromURL } -- (void)tableViewSelectionDidChange:(NSNotification *)notification +- (void)tableViewSelectionDidChange:(NSNotification*)notification { - if (notification.object == _tableView) - { + if (notification.object == _tableView) { // image NSInteger selectedRow = [_tableView selectedRow]; [self setImageFromSelection:selectedRow]; @@ -1795,8 +1783,9 @@ - (void)addNotifications { // listen for the selection change messages [[NSNotificationCenter defaultCenter] addObserver:self - selector:@selector(tableViewSelectionDidChange:) - name:NSTableViewSelectionDidChangeNotification object:nil]; + selector:@selector(tableViewSelectionDidChange:) + name:NSTableViewSelectionDidChangeNotification + object:nil]; } - (void)removeNotifications @@ -1805,7 +1794,6 @@ - (void)removeNotifications [[NSNotificationCenter defaultCenter] removeObserver:self]; } - - (BOOL)acceptsFirstResponder { return YES; @@ -1837,7 +1825,7 @@ - (void)viewDidLoad { [super viewDidLoad]; - _view = (MyMTKView *)self.view; + _view = (MyMTKView*)self.view; // have to disable this since reading back from textures // that slows the blit to the screen @@ -1851,9 +1839,8 @@ - (void)viewDidLoad _renderer = [[Renderer alloc] initWithMetalKitView:_view settings:_view.showSettings - data:_view.data]; + data:_view.data]; - // https://developer.apple.com/library/archive/documentation/Cocoa/Conceptual/EventOverview/TrackingAreaObjects/TrackingAreaObjects.html // this is better than requesting mousemoved events, they're only sent when // cursor is inside @@ -1870,7 +1857,7 @@ - (void)viewDidLoad [_view addTrackingArea:_trackingArea]; [_view addNotifications]; - + [_view setupUI]; // original sample code was sending down _view.bounds.size, but need @@ -1879,30 +1866,28 @@ - (void)viewDidLoad // ObjC++ delegate _view.delegate = _renderer; - + [_renderer setEyedropperDelegate:_view]; } - - @end bool DataDelegate::loadFile(bool clear) { MyMTKView* view_ = (__bridge MyMTKView*)view; - + if (clear) { // set selection [view_ updateFileSelection]; - + // want it to respond to arrow keys //[self.window makeFirstResponder: _tableView]; - + // show the files table [view_ showFileTable]; [view_ setEyedropperText:""]; } - + return [view_ loadFile]; } @@ -1915,9 +1900,9 @@ - (void)viewDidLoad bool DataDelegate::loadTextureFromImage(const char* fullFilename, double timestamp, KTXImage& image, KTXImage* imageNormal, KTXImage* imageDiff, bool isArchive) { MyMTKView* view_ = (__bridge MyMTKView*)view; - Renderer* renderer = (Renderer *)view_.delegate; + Renderer* renderer = (Renderer*)view_.delegate; [renderer releaseAllPendingTextures]; - + if (![renderer loadTextureFromImage:fullFilename timestamp:timestamp image:image @@ -1926,18 +1911,17 @@ - (void)viewDidLoad isArchive:isArchive]) { return false; } - + return true; } - //------------- -int main(int argc, const char *argv[]) +int main(int argc, const char* argv[]) { - ThreadInfo infoMain = { "Main", 
ThreadPriority::Interactive, 0 }; + ThreadInfo infoMain = {"Main", ThreadPriority::Interactive, 0}; setThreadInfo(infoMain); - + @autoreleasepool { // Setup code that might create autoreleased objects goes here. } diff --git a/kramv/Shaders/KramShaders.h b/kramv/Shaders/KramShaders.h index 3566f1f6..bde752e5 100644 --- a/kramv/Shaders/KramShaders.h +++ b/kramv/Shaders/KramShaders.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -8,9 +8,10 @@ #ifndef __METAL_VERSION__ #import #else +#define SIMD_NAMESPACE simd +#import #include #endif -#import #ifdef __METAL_VERSION__ #define NS_ENUM(_type, _name) \ @@ -111,10 +112,10 @@ typedef NS_ENUM(int32_t, ShaderLightingMode) { // TODO: placement of these elements in the struct breaks transfer // of data. This seems to work. Alignment issues with mixing these differently. struct Uniforms { - simd::float4x4 projectionViewMatrix; - simd::float4x4 modelMatrix; - simd::float4 modelMatrixInvScale2; // to supply inverse, w is determinant - simd::float3 cameraPosition; // world-space + SIMD_NAMESPACE::float4x4 projectionViewMatrix; + SIMD_NAMESPACE::float4x4 modelMatrix; + //SIMD_NAMESPACE::float4 modelMatrixInvScale2; // to supply inverse, w is determinant + SIMD_NAMESPACE::float3 cameraPosition; // world-space float uvPreview; float uvToShapeRatio; @@ -176,14 +177,14 @@ struct UniformsLevel { uint32_t mipLOD; uint32_t face; uint32_t arrayOrSlice; - simd::float2 drawOffset; // pixel offset to apply - simd::float4 textureSize; // width, height, 1/width, 1/height + SIMD_NAMESPACE::float2 drawOffset; // pixel offset to apply + SIMD_NAMESPACE::float4 textureSize; // width, height, 1/width, 1/height uint32_t passNumber; // switch to enum }; // This is all tied to a single level sample struct UniformsCS { - simd::uint2 uv; + SIMD_NAMESPACE::int2 uv; uint32_t arrayOrSlice; uint32_t face; @@ -191,7 +192,7 @@ struct UniformsCS { }; struct UniformsDebug { - simd::float4 rect; + SIMD_NAMESPACE::float4 rect; }; #endif diff --git a/kramv/Shaders/KramShaders.metal b/kramv/Shaders/KramShaders.metal index 79583a81..3228c91c 100644 --- a/kramv/Shaders/KramShaders.metal +++ b/kramv/Shaders/KramShaders.metal @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -519,24 +519,28 @@ inline float3x3 toFloat3x3(float4x4 m) { return float3x3(m[0].xyz, m[1].xyz, m[2].xyz); } +// This works even with a scale of 0, where invT breaks down. +// The normal will be normalized anyway. Also saves sending down another tfm. 
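// A standalone C++ sketch (not part of the shader patch) of the adjoint-based
// normal transform described above. The matrix built from column cross
// products equals det(M) * inverse-transpose(M), so after normalization it
// matches the usual inverse-transpose result, yet it stays finite when a
// scale axis is 0. The vec3/mat3 helpers are hypothetical stand-ins for the
// project's float3/float3x3 (column-major, like the Metal version below).
#include <cmath>
#include <cstdio>

struct vec3 { float x, y, z; };
static vec3 cross(vec3 a, vec3 b) {
    return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}
static vec3 normalize(vec3 v) {
    float len = std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
    return {v.x / len, v.y / len, v.z / len};
}
struct mat3 { vec3 c[3]; }; // c[i] is column i

static mat3 adjoint(const mat3& m) {
    // same construction as the shader code below
    return {{cross(m.c[1], m.c[2]), cross(m.c[2], m.c[0]), cross(m.c[0], m.c[1])}};
}
static vec3 mul(const mat3& m, vec3 v) {
    return {m.c[0].x * v.x + m.c[1].x * v.y + m.c[2].x * v.z,
            m.c[0].y * v.x + m.c[1].y * v.y + m.c[2].y * v.z,
            m.c[0].z * v.x + m.c[1].z * v.y + m.c[2].z * v.z};
}

int main() {
    // non-uniform scale (2, 1, 0.5): transforming a normal by the model matrix
    // itself skews it, while the adjoint keeps it perpendicular to the surface.
    mat3 m = {{{2, 0, 0}, {0, 1, 0}, {0, 0, 0.5f}}};
    vec3 n = {1, 0, 1};

    vec3 wrong = normalize(mul(m, n));          // ~ (0.970, 0, 0.243)
    vec3 right = normalize(mul(adjoint(m), n)); // ~ (0.243, 0, 0.970)
    std::printf("%f %f %f vs %f %f %f\n", wrong.x, wrong.y, wrong.z,
                right.x, right.y, right.z);
    return 0;
}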
+inline float3x3 adjoint(float3x3 m) +{ + return float3x3(cross(m[1], m[2]), + cross(m[2], m[0]), + cross(m[0], m[1])); +} + + // this is for vertex shader if tangent supplied void transformBasis(thread float3& normal, thread float3& tangent, - float4x4 modelToWorldTfm, float3 invScale2, bool useTangent) + float4x4 modelToWorldTfm, bool useTangent) { float3x3 m = toFloat3x3(modelToWorldTfm); - // note this is RinvT * n = (Rt)t = R, this is for simple inverse, inv scale handled below - // but uniform scale already handled by normalize - normal = m * normal; - normal *= invScale2; + normal = adjoint(m) * normal; normal = normalize(normal); - // question here of whether tangent is transformed by m or mInvT - // most apps assume m, but after averaging it can be just as off the surface as the normal if (useTangent) { tangent = m * tangent; - tangent *= invScale2; tangent = normalize(tangent); } @@ -622,7 +626,7 @@ ColorInOut DrawImageFunc( if (needsWorldBasis) { float3 t = tangent.xyz; - transformBasis(normal, t, uniforms.modelMatrix, uniforms.modelMatrixInvScale2.xyz, uniforms.useTangent); + transformBasis(normal, t, uniforms.modelMatrix, uniforms.useTangent); tangent.xyz = t; out.normal = toHalf(normal); @@ -1629,11 +1633,11 @@ kernel void SampleImageCS( { // the for-loop is replaced with a collection of threads, each of which // calls this function. - uint2 uv = uniforms.uv; // tie into texture lookup + int2 uv = uniforms.uv; // tie into texture lookup // uv >>= uniforms.mipLOD; // the color is returned to linear rgba32f - float4 color = colorMap.read(uv, uniforms.mipLOD); + float4 color = colorMap.read((uint2)uv, uniforms.mipLOD); result.write(color, index); } @@ -1646,13 +1650,13 @@ kernel void SampleImageArrayCS( { // the for-loop is replaced with a collection of threads, each of which // calls this function. - uint2 uv = uniforms.uv; // tie into texture lookup + int2 uv = uniforms.uv; // tie into texture lookup //uv >>= uniforms.mipLOD; uint arrayOrSlice = uniforms.arrayOrSlice; // the color is returned to linear rgba32f - float4 color = colorMap.read(uv, arrayOrSlice, uniforms.mipLOD); + float4 color = colorMap.read((uint2)uv, arrayOrSlice, uniforms.mipLOD); result.write(color, index); } @@ -1705,7 +1709,7 @@ kernel void SampleVolumeCS( { // the for-loop is replaced with a collection of threads, each of which // calls this function. 
- uint3 uv = uint3(uniforms.uv, uniforms.arrayOrSlice); // tie into texture lookup + uint3 uv = uint3((uint2)uniforms.uv, uniforms.arrayOrSlice); // tie into texture lookup //uv >>= uniforms.mipLOD); // the color is returned to linear rgba32f diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index f4fc5edb..814aae65 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -1,18 +1,3 @@ -#cmake_minimum_required(VERSION 3.19.1 FATAL_ERROR) - -#----------------------------------------------------- - -set(BUILD_MAC FALSE) -set(BUILD_WIN FALSE) - -if (APPLE) - message("build for macOS") - set(BUILD_MAC TRUE) -elseif (WIN32) - message("build for win x64") - set(BUILD_WIN TRUE) -endif() - #----------------------------------------------------- # TODO: hook these up to added code below, and pass to as command line settings @@ -24,7 +9,6 @@ option(BCENC "Compile BCenc Encoder" ON) option(COMP "Compile Compressonator Encoder" ON) option(EASTL "Compile EASTL" OFF) -option(FASTL "Compile FASTL" OFF) # convert ON to 1, UGH set(COMPILE_ATE 0) @@ -67,13 +51,6 @@ if (EASTL) set(COMPILE_EASTL 1) endif() -# replace parts of std/stdl with fastl -set(COMPILE_FASTL 0) -if (FASTL) - set(COMPILE_FASTL 1) -endif() - - #----------------------------------------------------- # libkram @@ -101,7 +78,12 @@ else() endif() # add_library doesn't establish a project, so still pointing at root CMake -set(SOURCE_DIR ${PROJECT_SOURCE_DIR}/libkram) +#set(SOURCE_DIR ${PROJECT_SOURCE_DIR}/libkram) +#set(SOURCE_DIR ${CMAKE_SOURCE_DIR}/libkram) +set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + +#set(INCLUDE_DIR ${CMAKE_SOURCE_DIR}/libkram) +set(INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/astc-encoder/*.cpp" @@ -131,9 +113,6 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/eastl/*.cpp" "${SOURCE_DIR}/eastl/*.h" - "${SOURCE_DIR}/fastl/*.cpp" - "${SOURCE_DIR}/fastl/*.h" - "${SOURCE_DIR}/lodepng/lodepng.cpp" "${SOURCE_DIR}/lodepng/lodepng.h" @@ -145,6 +124,9 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/tmpfileplus/tmpfileplus.cpp" "${SOURCE_DIR}/tmpfileplus/tmpfileplus.h" + + "${SOURCE_DIR}/vectormath/*.h" + "${SOURCE_DIR}/vectormath/*.cpp" # partial zstd decode-only unity file # cd zstd/build/single_file_libs @@ -162,7 +144,7 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS ) # no objc on win or linux -if (BUILD_WIN) +if (BUILD_WIN OR BUILD_LINUX) list(FILTER libSources EXCLUDE REGEX ".*ateencoder.mm$") list(FILTER libSources EXCLUDE REGEX ".*ateencoder.h$") endif() @@ -191,84 +173,92 @@ list(FILTER libSources EXCLUDE REGEX ".fmt.cpp$") source_group(TREE "${SOURCE_DIR}" PREFIX "source" FILES ${libSources}) target_include_directories(${myTargetLib} PUBLIC - "${SOURCE_DIR}/kram/" - - # why are these public, must be in public headers - "${SOURCE_DIR}/eastl/include/" - - "${SOURCE_DIR}/fastl/" + # public since included in other project files + "${INCLUDE_DIR}/kram/" + "${INCLUDE_DIR}/eastl/include/" + "${INCLUDE_DIR}/vectormath/" ) target_include_directories(${myTargetLib} PRIVATE - "${SOURCE_DIR}/astc-encoder/" - "${SOURCE_DIR}/ate/" - "${SOURCE_DIR}/bc7enc/" - "${SOURCE_DIR}/compressonator/bc6h/" - "${SOURCE_DIR}/etc2comp/" - "${SOURCE_DIR}/fmt/" - "${SOURCE_DIR}/heman/" - "${SOURCE_DIR}/lodepng" - "${SOURCE_DIR}/miniz/" - "${SOURCE_DIR}/squish/" - "${SOURCE_DIR}/tmpfileplus/" - "${SOURCE_DIR}/zstd/" - ) + "${INCLUDE_DIR}/astc-encoder/" + "${INCLUDE_DIR}/ate/" + "${INCLUDE_DIR}/bc7enc/" + 
"${INCLUDE_DIR}/compressonator/bc6h/" + "${INCLUDE_DIR}/etc2comp/" + "${INCLUDE_DIR}/fmt/" + "${INCLUDE_DIR}/heman/" + "${INCLUDE_DIR}/lodepng" + "${INCLUDE_DIR}/miniz/" + "${INCLUDE_DIR}/squish/" + "${INCLUDE_DIR}/tmpfileplus/" + "${INCLUDE_DIR}/zstd/" +) # only add sources to the library target_sources(${myTargetLib} PRIVATE ${libSources}) +#----------------------------------------------------- + +if (BUILD_LINUX) + # Enable all warnings, and also enable f16c sims op (only x64 though) + target_compile_options(${myTargetLib} PRIVATE -W -Wall + -mavx2 -mfma -mf16c + -fno-exceptions -fno-rtti + -fdata-sections -ffunction-sections + ) + + # this is already done by pch for libkram, but other projects need the force include inherited + # force include (public) + target_compile_options(${myTargetLib} PUBLIC -include KramConfig.h) +endif() + # note: mac build is all done via Xcode workspace/project now, this cmake build is legacy if (BUILD_MAC) set_target_properties(${myTargetLib} PROPERTIES - # Note: match this up with CXX version - # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" - XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" - - # avx1 (ignored by universal?) - XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx" - - # turn off exceptions/rtti - XCODE_ATTRIBUTE_GCC_ENABLE_CPP_EXCEPTIONS NO - XCODE_ATTRIBUTE_GCC_ENABLE_CPP_RTTI NO - - # can't believe this isn't on by default in CMAKE - XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES + # set debug style for apps + XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT "dwarf-with-dsym" + XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO" ) - target_compile_options(${myTargetLib} PRIVATE -W -Wall) + # Enable all warnings, and also enable f16c sims op (only x64 though) + target_compile_options(${myTargetLib} PRIVATE -W -Wall + -mavx2 -mfma -mf16c + -fobjc-arc + -fno-exceptions -fno-rtti + -fdata-sections -ffunction-sections + ) - # TODO: switch to pch setup (KramConfig.pch) # this is already done by pch for libkram, but other projects need the force include inherited # force include (public) target_compile_options(${myTargetLib} PUBLIC -include KramConfig.h) - -elseif (BUILD_WIN) - +endif() + +if (BUILD_WIN) # TODO: switch to add_target_definitions - # TODO: turn on C++17 - # to turn off exceptions/rtti use /GR and /EHsc replacement - string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REGEX REPLACE "/EHsc" "/EHs-c-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set_property(TARGET ${myTargetLib} PROPERTY + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + ) # this is already done by pch for libkram, but other projects need the force include inherited # force include (public) target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) - # all warnings, AVX, and multiprocess compiles, clang needs mf16c or -mavx2 - target_compile_options(${myTargetLib} PRIVATE /W3 /arch:AVX -mf16c /MP) + # all warnings, AVX2, and multiprocess compiles, + # eliminate duplicate strings, embed full path + # TODO: -fdata-sections -ffunction-sections + target_compile_options(${myTargetLib} PRIVATE /W3 + -march=haswell -mf16c -mfma + /GR- /EHs-c- + /MP /GF /FC + ) # fix STL (don't use -D here, will remove) target_compile_definitions(${myTargetLib} PRIVATE _ITERATOR_DEBUG_LEVEL=0 _HAS_EXCEPTIONS=0) endif() -# turn on dead-code stripping in release. Don't set this in debug. -# does this make sense on lib, or just on apps ? 
-if (CMAKE_BUILD_TYPE STREQUAL "RELEASE") - target_compile_options(${myTargetLib} PUBLIC -dead_strip) -endif() + # This will be force include (-include, /FI) on GCC/clang/VS. # Can't seem to ref KramPrefix.pch file. Goes into cmake_pch.hxx file @@ -276,19 +266,16 @@ target_precompile_headers(${myTargetLib} PRIVATE ${SOURCE_DIR}/kram/KramPrefix.h ) -# public -# TODO: these should not be in quotes? -target_compile_definitions(${myTargetLib} +# These turns into -D, don't add to the lines +target_compile_definitions(${myTargetLib} PUBLIC - "-DCOMPILE_EASTL=${COMPILE_EASTL}" - "-DCOMPILE_FASTL=${COMPILE_FASTL}" - - PRIVATE - "-DCOMPILE_ATE=${COMPILE_ATE}" - "-DCOMPILE_BCENC=${COMPILE_BCENC}" - "-DCOMPILE_ETCENC=${COMPILE_ETCENC}" - "-DCOMPILE_SQUISH=${COMPILE_SQUISH}" - "-DCOMPILE_ASTCENC=${COMPILE_ASTCENC}" - "-DCOMPILE_COMP=${COMPILE_COMP}" + COMPILE_EASTL=${COMPILE_EASTL} + PRIVATE + COMPILE_ATE=${COMPILE_ATE} + COMPILE_BCENC=${COMPILE_BCENC} + COMPILE_ETCENC=${COMPILE_ETCENC} + COMPILE_SQUISH=${COMPILE_SQUISH} + COMPILE_ASTCENC=${COMPILE_ASTCENC} + COMPILE_COMP=${COMPILE_COMP} ) diff --git a/libkram/allocate/dlmalloc.h b/libkram/allocate/dlmalloc.h index 50f69ab2..b02bcfdd 100644 --- a/libkram/allocate/dlmalloc.h +++ b/libkram/allocate/dlmalloc.h @@ -4,7 +4,7 @@ #include // for size_t namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; /* This version of malloc supports the standard SVID/XPG mallinfo diff --git a/libkram/astc-encoder/astcenc_mathlib.cpp b/libkram/astc-encoder/astcenc_mathlib.cpp index f276ac7e..82d5a1b8 100644 --- a/libkram/astc-encoder/astcenc_mathlib.cpp +++ b/libkram/astc-encoder/astcenc_mathlib.cpp @@ -46,3 +46,76 @@ uint64_t astc::rand(uint64_t state[2]) state[1] = rotl(s1, 37); return res; } + +#if SIMD_SSE + +/* ============================================================================ + Softfloat library with fp32 and fp16 conversion functionality. +============================================================================ */ +//#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) +// /* narrowing float->float conversions */ +// uint16_t float_to_sf16(float val); +// float sf16_to_float(uint16_t val); +//#endif + +vint4 float_to_float16(vfloat4 a) +{ +//#if ASTCENC_F16C >= 1 + __m128i packedf16 = _mm_cvtps_ph(a.m, 0); + __m128i f16 = _mm_cvtepu16_epi32(packedf16); + return vint4(f16); +//#else +// return vint4( +// float_to_sf16(a.lane<0>()), +// float_to_sf16(a.lane<1>()), +// float_to_sf16(a.lane<2>()), +// float_to_sf16(a.lane<3>())); +//#endif +} + +/** + * @brief Return a float16 value for a float scalar, using round-to-nearest. + */ +uint16_t float_to_float16(float a) +{ +//#if ASTCENC_F16C >= 1 + __m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0); + return static_cast(_mm_cvtsi128_si32(f16)); +//#else +// return float_to_sf16(a); +//#endif +} + +/** + * @brief Return a float value for a float16 vector. + */ +vfloat4 float16_to_float(vint4 a) +{ +//#if ASTCENC_F16C >= 1 + __m128i packed = _mm_packs_epi32(a.m, a.m); + __m128 f32 = _mm_cvtph_ps(packed); + return vfloat4(f32); +//#else +// return vfloat4( +// sf16_to_float(a.lane<0>()), +// sf16_to_float(a.lane<1>()), +// sf16_to_float(a.lane<2>()), +// sf16_to_float(a.lane<3>())); +//#endif +} + +/** + * @brief Return a float value for a float16 scalar. 
+ */ +float float16_to_float(uint16_t a) +{ +//#if ASTCENC_F16C >= 1 + __m128i packed = _mm_set1_epi16(a); + __m128 f32 = _mm_cvtph_ps(packed); + return _mm_cvtss_f32(f32); +//#else +// return sf16_to_float(a); +//#endif +} + +#endif diff --git a/libkram/astc-encoder/astcenc_mathlib.h b/libkram/astc-encoder/astcenc_mathlib.h index 67e989e7..ebad96c3 100644 --- a/libkram/astc-encoder/astcenc_mathlib.h +++ b/libkram/astc-encoder/astcenc_mathlib.h @@ -27,62 +27,81 @@ #include #include -#ifndef ASTCENC_POPCNT - #if defined(__POPCNT__) - #define ASTCENC_POPCNT 1 +#ifndef ASTCENC_NEON + #if defined(__aarch64__) + #define ASTCENC_NEON 1 #else - #define ASTCENC_POPCNT 0 + #define ASTCENC_NEON 0 #endif #endif -#ifndef ASTCENC_F16C - #if defined(__F16C__) - #define ASTCENC_F16C 1 - #else - #define ASTCENC_F16C 0 - #endif -#endif +#if ASTCENC_NEON -#ifndef ASTCENC_SSE - #if defined(__SSE4_2__) - #define ASTCENC_SSE 42 - #elif defined(__SSE4_1__) - #define ASTCENC_SSE 41 - #elif defined(__SSE3__) - #define ASTCENC_SSE 30 - #elif defined(__SSE2__) - #define ASTCENC_SSE 20 - #else + // Intel simd ops #define ASTCENC_SSE 0 - #endif -#endif - -#ifndef ASTCENC_AVX - #if defined(__AVX2__) - #define ASTCENC_AVX 2 - #elif defined(__AVX__) - #define ASTCENC_AVX 1 - #else #define ASTCENC_AVX 0 - #endif -#endif -#ifndef ASTCENC_NEON - #if defined(__aarch64__) - #define ASTCENC_NEON 1 - #else - #define ASTCENC_NEON 0 - #endif -#endif + // Keep alignment at 16B + #define ASTCENC_VECALIGN 16 + + // These have equivalents in Neon + #define ASTCENC_POPCNT 0 + #define ASTCENC_F16C 0 + -#if ASTCENC_AVX - #define ASTCENC_VECALIGN 32 #else - #define ASTCENC_VECALIGN 16 -#endif -#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 - #include + #ifndef ASTCENC_SSE + #if defined(__SSE4_2__) + #define ASTCENC_SSE 42 + #elif defined(__SSE4_1__) + #define ASTCENC_SSE 41 + #elif defined(__SSE3__) + #define ASTCENC_SSE 30 + #elif defined(__SSE2__) + #define ASTCENC_SSE 20 + #else + #define ASTCENC_SSE 0 + #endif + #endif + + #ifndef ASTCENC_AVX + #if defined(__AVX2__) + #define ASTCENC_AVX 2 + #elif defined(__AVX__) + #define ASTCENC_AVX 1 + #else + #define ASTCENC_AVX 0 + #endif + #endif + + // must set -fpopcount + #ifndef ASTCENC_POPCNT + #if defined(__POPCNT__) + #define ASTCENC_POPCNT 1 + #else + #define ASTCENC_POPCNT 0 + #endif + #endif + + // must set -mf16c only on x86_64 build, avx not enough on clang + #ifndef ASTCENC_F16C + #if defined(__F16C__) + #define ASTCENC_F16C 1 + #else + #define ASTCENC_F16C 0 + #endif + #endif + + //#if ASTCENC_AVX + // #define ASTCENC_VECALIGN 32 + //#else + #define ASTCENC_VECALIGN 16 + //#endif + + #if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 || ASTCENC_F16C != 0 + #include + #endif #endif /* ============================================================================ @@ -418,15 +437,6 @@ uint64_t rand(uint64_t state[2]); } -/* ============================================================================ - Softfloat library with fp32 and fp16 conversion functionality. 
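// A standalone usage sketch (not astcenc code) of the F16C scalar round trip
// that the out-of-lined float_to_float16 / float16_to_float above boil down
// to; it assumes an x64 build with -mf16c, which the CMakeLists changes in
// this patch pass. Rounding mode 0 is round-to-nearest, as used above.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static uint16_t floatToHalf(float f) {
    __m128i h = _mm_cvtps_ph(_mm_set1_ps(f), 0); // fp32 -> fp16 in lane 0
    return (uint16_t)_mm_cvtsi128_si32(h);
}

static float halfToFloat(uint16_t h) {
    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16((short)h))); // fp16 -> fp32
}

int main() {
    float f = 3.14159f;
    uint16_t h = floatToHalf(f);
    // fp16 keeps ~3 decimal digits, so expect roughly 3.141 back
    std::printf("0x%04x -> %f\n", h, halfToFloat(h));
    return 0;
}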
-============================================================================ */ -#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) - /* narrowing float->float conversions */ - uint16_t float_to_sf16(float val); - float sf16_to_float(uint16_t val); -#endif - /********************************* Vector library *********************************/ diff --git a/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp b/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp index d95fb9da..f3f7bd54 100644 --- a/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp +++ b/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp @@ -18,7 +18,8 @@ /** * @brief Soft-float library for IEEE-754. */ -#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) +// Chopped out +#if 0 // (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) #include "astcenc_mathlib.h" diff --git a/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h b/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h index fe8a1b16..360c0969 100755 --- a/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h @@ -125,7 +125,11 @@ struct vfloat8 */ static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p) { +#if ASTCENC_VECALIGN == 32 return vfloat8(_mm256_load_ps(p)); +#else // 16 + return vfloat8(_mm256_loadu_ps(p)); +#endif } /** @@ -242,7 +246,11 @@ struct vint8 */ static ASTCENC_SIMD_INLINE vint8 loada(const int* p) { +#if ASTCENC_VECALIGN == 32 return vint8(_mm256_load_si256(reinterpret_cast(p))); +#else // 16 + return vint8(_mm256_loadu_si256(reinterpret_cast(p))); +#endif } /** @@ -534,7 +542,11 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a) */ ASTCENC_SIMD_INLINE void storea(vint8 a, int* p) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m); +#if ASTCENC_VECALIGN == 32 + _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m); +#else // 16 + _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m); +#endif } /** @@ -961,7 +973,11 @@ ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p) */ ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p) { - _mm256_store_ps(p, a.m); +#if ASTCENC_VECALIGN == 32 + _mm256_store_ps(p, a.m); +#else // 16 + _mm256_storeu_ps(p, a.m); +#endif } /** diff --git a/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h b/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h index 868522dc..aaf5dccb 100755 --- a/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h @@ -941,65 +941,23 @@ ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a) /** * @brief Return a float16 value for a float vector, using round-to-nearest. */ -ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a) -{ -#if ASTCENC_F16C >= 1 - __m128i packedf16 = _mm_cvtps_ph(a.m, 0); - __m128i f16 = _mm_cvtepu16_epi32(packedf16); - return vint4(f16); -#else - return vint4( - float_to_sf16(a.lane<0>()), - float_to_sf16(a.lane<1>()), - float_to_sf16(a.lane<2>()), - float_to_sf16(a.lane<3>())); -#endif -} +vint4 float_to_float16(vfloat4 a); /** * @brief Return a float16 value for a float scalar, using round-to-nearest. */ -static inline uint16_t float_to_float16(float a) -{ -#if ASTCENC_F16C >= 1 - __m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0); - return static_cast(_mm_cvtsi128_si32(f16)); -#else - return float_to_sf16(a); -#endif -} +uint16_t float_to_float16(float a); /** * @brief Return a float value for a float16 vector. 
*/ -ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a) -{ -#if ASTCENC_F16C >= 1 - __m128i packed = _mm_packs_epi32(a.m, a.m); - __m128 f32 = _mm_cvtph_ps(packed); - return vfloat4(f32); -#else - return vfloat4( - sf16_to_float(a.lane<0>()), - sf16_to_float(a.lane<1>()), - sf16_to_float(a.lane<2>()), - sf16_to_float(a.lane<3>())); -#endif -} +vfloat4 float16_to_float(vint4 a); + /** * @brief Return a float value for a float16 scalar. */ -ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a) -{ -#if ASTCENC_F16C >= 1 - __m128i packed = _mm_set1_epi16(a); - __m128 f32 = _mm_cvtph_ps(packed); - return _mm_cvtss_f32(f32); -#else - return sf16_to_float(a); -#endif -} +float float16_to_float(uint16_t a); /** * @brief Return a float value as an integer bit pattern (i.e. no conversion). diff --git a/libkram/bc7enc/bc7enc.cpp b/libkram/bc7enc/bc7enc.cpp index d7aec202..fd8e43c2 100644 --- a/libkram/bc7enc/bc7enc.cpp +++ b/libkram/bc7enc/bc7enc.cpp @@ -2303,8 +2303,7 @@ static void handle_opaque_block(void *pBlock, const color_rgba *pPixels, const b pParams->m_perceptual = pComp_params->m_perceptual; pParams->m_num_pixels = 16; pParams->m_pPixels = pPixels; - pParams->m_has_alpha = false; - + opt_results.m_partition = 0; opt_results.m_index_selector = 0; opt_results.m_rotation = 0; @@ -2318,6 +2317,10 @@ static void handle_opaque_block(void *pBlock, const color_rgba *pPixels, const b pParams->m_comp_bits = 7; pParams->m_has_pbits = true; pParams->m_endpoints_share_pbit = false; + + // This means mode has alpha even though this is an opaque block + // so deal with the error on alpha too. + pParams->m_has_alpha = true; color_cell_compressor_results results6; results6.m_pSelectors = opt_results.m_selectors; @@ -2343,6 +2346,7 @@ static void handle_opaque_block(void *pBlock, const color_rgba *pPixels, const b pParams->m_comp_bits = 6; pParams->m_has_pbits = true; pParams->m_endpoints_share_pbit = true; + pParams->m_has_alpha = false; const uint8_t *pPartition = &g_bc7_partition2[trial_partition * 16]; @@ -2396,6 +2400,8 @@ static void handle_opaque_block(void *pBlock, const color_rgba *pPixels, const b } } + pParams->m_has_alpha = false; + encode_bc7_block(pBlock, &opt_results); } diff --git a/libkram/bc7enc/ert.h b/libkram/bc7enc/ert.h index 509b1aa3..03268a1e 100644 --- a/libkram/bc7enc/ert.h +++ b/libkram/bc7enc/ert.h @@ -12,7 +12,7 @@ namespace ert { - using namespace NAMESPACE_STL; + using namespace STL_NAMESPACE; struct color_rgba { uint8_t m_c[4]; }; diff --git a/libkram/bc7enc/utils.cpp b/libkram/bc7enc/utils.cpp index b388d3f9..37e24f70 100644 --- a/libkram/bc7enc/utils.cpp +++ b/libkram/bc7enc/utils.cpp @@ -10,7 +10,7 @@ namespace utils { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; #define FLOOD_PUSH(y, xl, xr, dy) if (((y + (dy)) >= 0) && ((y + (dy)) < (int)m_height)) { stack.push_back(fill_segment(y, xl, xr, dy)); } diff --git a/libkram/bc7enc/utils.h b/libkram/bc7enc/utils.h index e07a0c20..60c574b2 100644 --- a/libkram/bc7enc/utils.h +++ b/libkram/bc7enc/utils.h @@ -36,7 +36,7 @@ namespace utils { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; extern const uint32_t g_pretty_colors[]; extern const uint32_t g_num_pretty_colors; diff --git a/libkram/compressonator/bc6h/bc6h_decode.cpp b/libkram/compressonator/bc6h/bc6h_decode.cpp index f3d7be77..7bc4643a 100644 --- a/libkram/compressonator/bc6h/bc6h_decode.cpp +++ b/libkram/compressonator/bc6h/bc6h_decode.cpp @@ -210,6 +210,7 @@ int lerp(int a, int b, int i, int denom) { case 3: denom *= 5; i *= 5; // 
fall through to case 15 + [[fallthrough]]; case 15: weights = g_aWeights4; break; diff --git a/libkram/compressonator/bc6h/bc6h_encode.cpp b/libkram/compressonator/bc6h/bc6h_encode.cpp index 97fd41cd..b371e1de 100644 --- a/libkram/compressonator/bc6h/bc6h_encode.cpp +++ b/libkram/compressonator/bc6h/bc6h_encode.cpp @@ -473,7 +473,10 @@ void BC6HBlockEncoder::QuantizeEndPointToF16Prec(float EndPoints[MAX_SUBSETS][MA so that indices at fix up points have higher order bit set to 0 ==================================================================*/ -void BC6HBlockEncoder::SwapIndices(int iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], int iIndices[3][BC6H_MAX_SUBSET_SIZE], int entryCount[BC6H_MAX_SUBSETS], int max_subsets, int mode, int shape_pattern) { +void BC6HBlockEncoder::SwapIndices(int iEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], int iIndices[MAX_SUBSETS][BC6H_MAX_SUBSET_SIZE], + // int entryCount[BC6H_MAX_SUBSETS], // this is 2 but callers pass array[MAX_SUBSETS] + int entryCount[MAX_SUBSETS], // to keep compiler happy + int max_subsets, int mode, int shape_pattern) { unsigned int uNumIndices = 1 << ModePartition[mode].IndexPrec; unsigned int uHighIndexBit = uNumIndices >> 1; @@ -594,7 +597,7 @@ bool BC6HBlockEncoder::TransformEndPoints(AMD_BC6H_Format &BC6H_data, int iEndPo void BC6HBlockEncoder::SaveCompressedBlockData( AMD_BC6H_Format &BC6H_data, int oEndPoints[MAX_SUBSETS][MAX_END_POINTS][MAX_DIMENSION_BIG], - int iIndices[2][MAX_SUBSET_SIZE], + int iIndices[MAX_SUBSETS][MAX_SUBSET_SIZE], // did hardcode 2 = BC6H_MAX_SUBSET_SIZE, but not what is passed int max_subsets, int mode) { BC6H_data.m_mode = (unsigned short)mode; diff --git a/libkram/compressonator/bc6h/hdr_encode.cpp b/libkram/compressonator/bc6h/hdr_encode.cpp index 32b4090f..e2f76ba0 100644 --- a/libkram/compressonator/bc6h/hdr_encode.cpp +++ b/libkram/compressonator/bc6h/hdr_encode.cpp @@ -72,6 +72,7 @@ float lerpf(float a, float b, int i, int denom) { case 3: denom *= 5; i *= 5; // fall through to case 15 + [[fallthrough]]; case 7: weights = g_aWeights3; break; diff --git a/libkram/eastl/include/EASTL/chrono.h b/libkram/eastl/include/EASTL/chrono.h index ccfeb2f9..1d59a9b1 100644 --- a/libkram/eastl/include/EASTL/chrono.h +++ b/libkram/eastl/include/EASTL/chrono.h @@ -584,9 +584,42 @@ namespace chrono EA_RESTORE_VC_WARNING() return uint64_t(frequency * queryCounter()); #elif defined EA_PLATFORM_SONY - return sceKernelGetProcessTimeCounter(); + auto queryFrequency = [] + { + // nanoseconds/seconds / ticks/seconds + return double(1000000000.0L / (long double)sceKernelGetProcessTimeCounterFrequency()); // nanoseconds per tick + }; + + auto queryCounter = [] + { + return sceKernelGetProcessTimeCounter(); + }; + + EA_DISABLE_VC_WARNING(4640) // warning C4640: construction of local static object is not thread-safe (VS2013) + static auto frequency = queryFrequency(); // cache cpu frequency on first call + EA_RESTORE_VC_WARNING() + return uint64_t(frequency * (double)queryCounter()); #elif defined(EA_PLATFORM_APPLE) - return mach_absolute_time(); + // took this from a newer drop of EASTL (2022 release) on 11/8/24 + // Note that numer/denom will often be 1 and 1, so can skip math. + // but is 125/3 on some iOS and M1. Added isNanos check. Test. 
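// A standalone sketch (macOS/iOS only, not part of the EASTL patch) of the
// timebase conversion the new EA_PLATFORM_APPLE path below performs:
// mach_absolute_time() ticks are scaled by numer/denom to get nanoseconds.
// numer/denom is often 1/1 (so the math can be skipped), but ratios like
// 125/3 show up on some iOS and Apple Silicon devices.
#include <mach/mach_time.h>
#include <cstdint>
#include <cstdio>

int main() {
    mach_timebase_info_data_t info;
    mach_timebase_info(&info);

    uint64_t t = mach_absolute_time();
    if (!(info.numer == 1 && info.denom == 1)) {
        t = t * info.numer / info.denom; // ticks -> nanoseconds
    }
    std::printf("timebase %u/%u, now %llu ns\n",
                info.numer, info.denom, (unsigned long long)t);
    return 0;
}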
+ auto queryTimeInfo = [] + { + mach_timebase_info_data_t info; + mach_timebase_info(&info); + return info; + }; + + uint64_t t = mach_absolute_time(); + + static auto timeInfo = queryTimeInfo(); + static const bool isNanos = timeInfo.numer == 1 && timeInfo.denom == 1; + if (!isNanos) + { + t *= timeInfo.numer; + t /= timeInfo.denom; + } + return t; #elif defined(EA_PLATFORM_POSIX) // Posix means Linux, Unix, and Macintosh OSX, among others (including Linux-based mobile platforms). #if (defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)) timespec ts; diff --git a/libkram/etc2comp/Etc.cpp b/libkram/etc2comp/Etc.cpp index 059e86c1..95151ddd 100644 --- a/libkram/etc2comp/Etc.cpp +++ b/libkram/etc2comp/Etc.cpp @@ -1,142 +1,142 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "EtcConfig.h" -#include "Etc.h" -#include "EtcFilter.h" - -#include - -namespace Etc -{ - // ---------------------------------------------------------------------------------------------------- - // C-style inteface to the encoder - // - void Encode(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uiMaxJobs, - unsigned char **a_ppaucEncodingBits, - unsigned int *a_puiEncodingBitsBytes, - unsigned int *a_puiExtendedWidth, - unsigned int *a_puiExtendedHeight, - int *a_piEncodingTime_ms, bool a_bVerboseOutput) - { - - Image image(a_pafSourceRGBA, a_uiSourceWidth, - a_uiSourceHeight, - a_eErrMetric); - image.m_bVerboseOutput = a_bVerboseOutput; - image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs); - - *a_ppaucEncodingBits = image.GetEncodingBits(); - *a_puiEncodingBitsBytes = image.GetEncodingBitsBytes(); - //*a_puiExtendedWidth = image.GetExtendedWidth(); - //*a_puiExtendedHeight = image.GetExtendedHeight(); - *a_piEncodingTime_ms = image.GetEncodingTimeMs(); - } - - void EncodeMipmaps(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uiMaxJobs, - unsigned int a_uiMaxMipmaps, - unsigned int a_uiMipFilterFlags, - RawImage* a_pMipmapImages, - int *a_piEncodingTime_ms, - bool a_bVerboseOutput) - { - auto mipWidth = a_uiSourceWidth; - auto mipHeight = a_uiSourceHeight; - int totalEncodingTime = 0; - for(unsigned int mip = 0; mip < a_uiMaxMipmaps && mipWidth >= 1 && mipHeight >= 1; mip++) - { - float* pImageData = nullptr; - float* pMipImage = nullptr; - - if(mip == 0) - { - pImageData = a_pafSourceRGBA; - } - else - { - pMipImage = new float[mipWidth*mipHeight*4]; - if(FilterTwoPass(a_pafSourceRGBA, a_uiSourceWidth, a_uiSourceHeight, pMipImage, mipWidth, mipHeight, a_uiMipFilterFlags, Etc::FilterLanczos3) ) - { - pImageData = pMipImage; - } - } - - if ( pImageData ) - { - - Image image(pImageData, mipWidth, mipHeight, a_eErrMetric); - 
- image.m_bVerboseOutput = a_bVerboseOutput; - image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs); - - a_pMipmapImages[mip].paucEncodingBits = std::shared_ptr(image.GetEncodingBits(), [](unsigned char *p) { delete[] p; }); - a_pMipmapImages[mip].uiEncodingBitsBytes = image.GetEncodingBitsBytes(); - //a_pMipmapImages[mip].uiExtendedWidth = image.GetExtendedWidth(); - //a_pMipmapImages[mip].uiExtendedHeight = image.GetExtendedHeight(); - - totalEncodingTime += image.GetEncodingTimeMs(); - } - - if(pMipImage) - { - delete[] pMipImage; - } - - if (!pImageData) - { - break; - } - - mipWidth >>= 1; - mipHeight >>= 1; - - // Get out of the loop if both shifted dimensions are zero - if ((mipWidth==0) && (mipHeight==0)) - { - break; - } - // Make sure to generate mipmap chain down to 1x1 for iOS - if (mipWidth==0) - { - mipWidth = 1; - } - if (mipHeight==0) { - mipHeight = 1; - } - } - - *a_piEncodingTime_ms = totalEncodingTime; - } - - - // ---------------------------------------------------------------------------------------------------- - // - -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "EtcConfig.h" +#include "Etc.h" +#include "EtcFilter.h" + +#include + +namespace Etc +{ + // ---------------------------------------------------------------------------------------------------- + // C-style inteface to the encoder + // + void Encode(float *a_pafSourceRGBA, + unsigned int a_uiSourceWidth, + unsigned int a_uiSourceHeight, + Image::Format a_format, + ErrorMetric a_eErrMetric, + float a_fEffort, + unsigned int a_uiJobs, + unsigned int a_uiMaxJobs, + unsigned char **a_ppaucEncodingBits, + unsigned int *a_puiEncodingBitsBytes, + unsigned int *a_puiExtendedWidth, + unsigned int *a_puiExtendedHeight, + int *a_piEncodingTime_ms, bool a_bVerboseOutput) + { + + Image image(a_pafSourceRGBA, a_uiSourceWidth, + a_uiSourceHeight, + a_eErrMetric); + image.m_bVerboseOutput = a_bVerboseOutput; + image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs); + + *a_ppaucEncodingBits = image.GetEncodingBits(); + *a_puiEncodingBitsBytes = image.GetEncodingBitsBytes(); + //*a_puiExtendedWidth = image.GetExtendedWidth(); + //*a_puiExtendedHeight = image.GetExtendedHeight(); + *a_piEncodingTime_ms = image.GetEncodingTimeMs(); + } + + void EncodeMipmaps(float *a_pafSourceRGBA, + unsigned int a_uiSourceWidth, + unsigned int a_uiSourceHeight, + Image::Format a_format, + ErrorMetric a_eErrMetric, + float a_fEffort, + unsigned int a_uiJobs, + unsigned int a_uiMaxJobs, + unsigned int a_uiMaxMipmaps, + unsigned int a_uiMipFilterFlags, + RawImage* a_pMipmapImages, + int *a_piEncodingTime_ms, + bool a_bVerboseOutput) + { + auto mipWidth = a_uiSourceWidth; + auto mipHeight = a_uiSourceHeight; + int totalEncodingTime = 0; + for(unsigned int mip = 0; mip < a_uiMaxMipmaps && mipWidth >= 1 && mipHeight >= 1; mip++) + { + float* pImageData = nullptr; + float* pMipImage = nullptr; + + if(mip == 0) + { + pImageData = 
a_pafSourceRGBA; + } + else + { + pMipImage = new float[mipWidth*mipHeight*4]; + if(FilterTwoPass(a_pafSourceRGBA, a_uiSourceWidth, a_uiSourceHeight, pMipImage, mipWidth, mipHeight, a_uiMipFilterFlags, Etc::FilterLanczos3) ) + { + pImageData = pMipImage; + } + } + + if ( pImageData ) + { + + Image image(pImageData, mipWidth, mipHeight, a_eErrMetric); + + image.m_bVerboseOutput = a_bVerboseOutput; + image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs); + + a_pMipmapImages[mip].paucEncodingBits = std::shared_ptr(image.GetEncodingBits(), [](unsigned char *p) { delete[] p; }); + a_pMipmapImages[mip].uiEncodingBitsBytes = image.GetEncodingBitsBytes(); + //a_pMipmapImages[mip].uiExtendedWidth = image.GetExtendedWidth(); + //a_pMipmapImages[mip].uiExtendedHeight = image.GetExtendedHeight(); + + totalEncodingTime += image.GetEncodingTimeMs(); + } + + if(pMipImage) + { + delete[] pMipImage; + } + + if (!pImageData) + { + break; + } + + mipWidth >>= 1; + mipHeight >>= 1; + + // Get out of the loop if both shifted dimensions are zero + if ((mipWidth==0) && (mipHeight==0)) + { + break; + } + // Make sure to generate mipmap chain down to 1x1 for iOS + if (mipWidth==0) + { + mipWidth = 1; + } + if (mipHeight==0) { + mipHeight = 1; + } + } + + *a_piEncodingTime_ms = totalEncodingTime; + } + + + // ---------------------------------------------------------------------------------------------------- + // + +} diff --git a/libkram/etc2comp/Etc.h b/libkram/etc2comp/Etc.h index 90962efb..439388d6 100644 --- a/libkram/etc2comp/Etc.h +++ b/libkram/etc2comp/Etc.h @@ -1,71 +1,71 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcConfig.h" -#include "EtcImage.h" -#include "EtcColor.h" -#include "EtcErrorMetric.h" -#include - -#define ETCCOMP_MIN_EFFORT_LEVEL (0.0f) -#define ETCCOMP_DEFAULT_EFFORT_LEVEL (40.0f) -#define ETCCOMP_MAX_EFFORT_LEVEL (100.0f) - -namespace Etc -{ - class Block4x4EncodingBits; - - struct RawImage - { - int uiExtendedWidth; - int uiExtendedHeight; - unsigned int uiEncodingBitsBytes; - std::shared_ptr paucEncodingBits; - }; - - - - // C-style inteface to the encoder - void Encode(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uimaxJobs, - unsigned char **a_ppaucEncodingBits, - unsigned int *a_puiEncodingBitsBytes, - unsigned int *a_puiExtendedWidth, - unsigned int *a_puiExtendedHeight, - int *a_piEncodingTime_ms, bool a_bVerboseOutput = false); - - void EncodeMipmaps(float *a_pafSourceRGBA, - unsigned int a_uiSourceWidth, - unsigned int a_uiSourceHeight, - Image::Format a_format, - ErrorMetric a_eErrMetric, - float a_fEffort, - unsigned int a_uiJobs, - unsigned int a_uiMaxJobs, - unsigned int a_uiMaxMipmaps, - unsigned int a_uiMipFilterFlags, - RawImage* a_pMipmaps, - int *a_piEncodingTime_ms, bool a_bVerboseOutput = false); - -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "EtcConfig.h" +#include "EtcImage.h" +#include "EtcColor.h" +#include "EtcErrorMetric.h" +#include + +#define ETCCOMP_MIN_EFFORT_LEVEL (0.0f) +#define ETCCOMP_DEFAULT_EFFORT_LEVEL (40.0f) +#define ETCCOMP_MAX_EFFORT_LEVEL (100.0f) + +namespace Etc +{ + class Block4x4EncodingBits; + + struct RawImage + { + int uiExtendedWidth; + int uiExtendedHeight; + unsigned int uiEncodingBitsBytes; + std::shared_ptr paucEncodingBits; + }; + + + + // C-style inteface to the encoder + void Encode(float *a_pafSourceRGBA, + unsigned int a_uiSourceWidth, + unsigned int a_uiSourceHeight, + Image::Format a_format, + ErrorMetric a_eErrMetric, + float a_fEffort, + unsigned int a_uiJobs, + unsigned int a_uimaxJobs, + unsigned char **a_ppaucEncodingBits, + unsigned int *a_puiEncodingBitsBytes, + unsigned int *a_puiExtendedWidth, + unsigned int *a_puiExtendedHeight, + int *a_piEncodingTime_ms, bool a_bVerboseOutput = false); + + void EncodeMipmaps(float *a_pafSourceRGBA, + unsigned int a_uiSourceWidth, + unsigned int a_uiSourceHeight, + Image::Format a_format, + ErrorMetric a_eErrMetric, + float a_fEffort, + unsigned int a_uiJobs, + unsigned int a_uiMaxJobs, + unsigned int a_uiMaxMipmaps, + unsigned int a_uiMipFilterFlags, + RawImage* a_pMipmaps, + int *a_piEncodingTime_ms, bool a_bVerboseOutput = false); + +} diff --git a/libkram/etc2comp/EtcBlock4x4.cpp b/libkram/etc2comp/EtcBlock4x4.cpp index ce753774..de71c663 100644 --- a/libkram/etc2comp/EtcBlock4x4.cpp +++ b/libkram/etc2comp/EtcBlock4x4.cpp @@ -1,317 +1,317 @@ -/* - * Copyright 2015 The Etc2Comp Authors. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4.cpp - -Implements the state associated with each 4x4 block of pixels in an image - -Source images that are not a multiple of 4x4 are extended to fill the Block4x4 using pixels with an -alpha of NAN - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4.h" - -//#include "EtcColor.h" -#include "EtcImage.h" -#include "EtcColorFloatRGBA.h" - -// only the rgb/a encoders use Block4x4 -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4Encoding_ETC1.h" -#include "EtcBlock4x4Encoding_RGB8.h" -#include "EtcBlock4x4Encoding_RGBA8.h" -#include "EtcBlock4x4Encoding_RGB8A1.h" - -#include -#include -#include - -namespace Etc -{ - // ETC pixels are scanned vertically. - // this mapping is for when someone wants to scan the ETC pixels horizontally - const uint8_t Block4x4::s_auiPixelOrderHScan[PIXELS] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4::Block4x4(void) - { - Init(); - } - Block4x4::~Block4x4() - { - m_pimageSource = nullptr; - - if (m_pencoding) - { - delete m_pencoding; - m_pencoding = nullptr; - } - } - - void Block4x4::Init() { - m_pimageSource = nullptr; - m_pencoding = nullptr; - - m_uiSourceH = 0; - m_uiSourceV = 0; - - m_sourcealphamix = SourceAlphaMix::UNKNOWN; - //m_boolBorderPixels = false; - m_boolPunchThroughPixels = false; - m_hasColorPixels = true; - - //m_errormetric = ErrorMetric::NUMERIC; - } - - Block4x4Encoding* Block4x4::NewEncoderIfNeeded(Image::Format format) - { - Block4x4Encoding* p_encoding = m_pencoding; - if (!p_encoding) - { - switch(format) { - case Image::Format::RGB8: - case Image::Format::SRGB8: - p_encoding = new Block4x4Encoding_RGB8; - break; - - case Image::Format::RGBA8: - case Image::Format::SRGBA8: - p_encoding = new Block4x4Encoding_RGBA8; - break; - - // don't really care about using ETC1 or A1 - case Image::Format::ETC1: - p_encoding = new Block4x4Encoding_ETC1; - break; - - case Image::Format::RGB8A1: - case Image::Format::SRGB8A1: - p_encoding = new Block4x4Encoding_RGB8A1; - break; - - default: - assert(false); - break; - } - } - return p_encoding; - } - - void Block4x4::Encode(Image *a_pimageSource, - unsigned int a_uiSourceH, unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits) - { - // this is use the same encoding over and over, so don't delete existing - Etc::Block4x4Encoding* p_encoding = NewEncoderIfNeeded(a_pimageSource->GetFormat()); - ErrorMetric errorMetric = a_pimageSource->GetErrorMetric(); - - m_pencoding = nullptr; - Block4x4::Init(); - - m_pimageSource = a_pimageSource; - - m_uiSourceH = a_uiSourceH; - m_uiSourceV = a_uiSourceV; - //m_errormetric = errorMetric; - m_pencoding = p_encoding; - - SetSourcePixels(); - - m_pencoding->Encode(this, m_afrgbaSource, - a_paucEncodingBits, errorMetric); - - } - - - // 
---------------------------------------------------------------------------------------------------- - // initialization of encoding state from a prior encoding using encoding bits - // [a_uiSourceH,a_uiSourceV] is the location of the block in a_pimageSource - // a_paucEncodingBits is the place to read the prior encoding - // a_imageformat is used to determine how to interpret a_paucEncodingBits - // a_errormetric was used for the prior encoding - // - void Block4x4::Decode( - unsigned int a_uiSourceH, unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits, - Image *a_pimageSource, - uint16_t iterationCount) - { - // this is use the same encoding over and over, so don't delete existing - Etc::Block4x4Encoding* p_encoding = NewEncoderIfNeeded(a_pimageSource->GetFormat()); - ErrorMetric errorMetric = a_pimageSource->GetErrorMetric(); - - //delete m_pencoding; - m_pencoding = nullptr; - Block4x4::Init(); - - m_pimageSource = a_pimageSource; - - m_uiSourceH = a_uiSourceH; - m_uiSourceV = a_uiSourceV; - //m_errormetric = errorMetric; - m_pencoding = p_encoding; - - if (m_pimageSource->HasSourcePixels()) { - SetSourcePixels(); - - m_pencoding->Decode(this, a_paucEncodingBits, m_afrgbaSource, errorMetric, iterationCount); - } - else { - // pure decode - m_pencoding->Decode(this, a_paucEncodingBits, nullptr, errorMetric, iterationCount); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set source pixels from m_pimageSource - // set m_alphamix - // - void Block4x4::SetSourcePixels(void) - { - // copy source to consecutive memory locations - // convert from image horizontal scan to block vertical scan - int uiPixel = 0; - for (int x = 0; x < 4; x++) - { - int uiSourcePixelH = m_uiSourceH + x; - - for (int y = 0; y < 4; y++) - { - int uiSourcePixelV = m_uiSourceV + y; - - ColorFloatRGBA pfrgbaSource = m_pimageSource->GetSourcePixel(uiSourcePixelH, uiSourcePixelV); - - ColorFloatRGBA& sourcePixel = m_afrgbaSource[uiPixel]; - sourcePixel = pfrgbaSource; - uiPixel++; - } - } - - //---------------------------------------- - - m_hasColorPixels = false; - for (uiPixel = 0; uiPixel < 16; ++uiPixel) - { - ColorFloatRGBA& sourcePixel = m_afrgbaSource[uiPixel]; - - // this is doing fp equality - if (sourcePixel.fR != sourcePixel.fG || sourcePixel.fR != sourcePixel.fB) - { - m_hasColorPixels = true; - break; - } - } - - //---------------------------------------- - - // alpha census - int uiTransparentSourcePixels = 0; - int uiOpaqueSourcePixels = 0; - - Image::Format imageformat = m_pimageSource->GetFormat(); - - for (uiPixel = 0; uiPixel < 16; ++uiPixel) - { - ColorFloatRGBA& sourcePixel = m_afrgbaSource[uiPixel]; - - // for formats with no alpha, set source alpha to 1 - if (imageformat == Image::Format::ETC1 || - imageformat == Image::Format::RGB8 || - imageformat == Image::Format::SRGB8) - { - sourcePixel.fA = 1.0f; - } - - // for RGB8A1, set source alpha to 0.0 or 1.0 - // set punch through flag - else if (imageformat == Image::Format::RGB8A1 || - imageformat == Image::Format::SRGB8A1) - { - if (sourcePixel.fA >= 0.5f) - { - sourcePixel.fA = 1.0f; - } - else - { - sourcePixel.fA = 0.0f; - m_boolPunchThroughPixels = true; - } - } - - if (sourcePixel.fA == 1.0f) - { - uiOpaqueSourcePixels++; - } - else if (sourcePixel.fA == 0.0f) - { - // TODO: an assumption here that R/G/B are 0, but with multichannel that's not the case - // A could be all 0, but rgb contain valid channel content - uiTransparentSourcePixels++; - } - } - - // 
This only applies for RGBA (premul weighted calcs) - if (uiOpaqueSourcePixels == PIXELS) - { - m_sourcealphamix = SourceAlphaMix::OPAQUE; - } - else if (uiTransparentSourcePixels == PIXELS) - { - // TODO: could check rgb for all 0, and then set TRANSPARENT - m_sourcealphamix = SourceAlphaMix::TRANSPARENT; - - // TODO: nothing setting ALL_ZERO_ALPHA. Could look at all rgb to identify that. - - //(m_pimageSource->GetErrorMetric() == ErrorMetric::NUMERIC || m_pimageSource->GetErrorMetric() == ErrorMetric::RGBX) ? SourceAlphaMix::ALL_ZERO_ALPHA : - // SourceAlphaMix::TRANSPARENT; - } - else - { - m_sourcealphamix = SourceAlphaMix::TRANSLUCENT; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // return a name for the encoding mode - // -// const char * Block4x4::GetEncodingModeName(void) -// { -// -// switch (m_pencoding->GetMode()) -// { -// case Block4x4Encoding::MODE_ETC1: -// return "ETC1"; -// case Block4x4Encoding::MODE_T: -// return "T"; -// case Block4x4Encoding::MODE_H: -// return "H"; -// case Block4x4Encoding::MODE_PLANAR: -// return "PLANAR"; -// default: -// return "???"; -// } -// } - - // ---------------------------------------------------------------------------------------------------- - // - -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcBlock4x4.cpp + +Implements the state associated with each 4x4 block of pixels in an image + +Source images that are not a multiple of 4x4 are extended to fill the Block4x4 using pixels with an +alpha of NAN + +*/ + +#include "EtcConfig.h" +#include "EtcBlock4x4.h" + +//#include "EtcColor.h" +#include "EtcImage.h" +#include "EtcColorFloatRGBA.h" + +// only the rgb/a encoders use Block4x4 +#include "EtcBlock4x4EncodingBits.h" +#include "EtcBlock4x4Encoding_ETC1.h" +#include "EtcBlock4x4Encoding_RGB8.h" +#include "EtcBlock4x4Encoding_RGBA8.h" +#include "EtcBlock4x4Encoding_RGB8A1.h" + +#include +#include +#include + +namespace Etc +{ + // ETC pixels are scanned vertically. 
+ // this mapping is for when someone wants to scan the ETC pixels horizontally + const uint8_t Block4x4::s_auiPixelOrderHScan[PIXELS] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + // ---------------------------------------------------------------------------------------------------- + // + Block4x4::Block4x4(void) + { + Init(); + } + Block4x4::~Block4x4() + { + m_pimageSource = nullptr; + + if (m_pencoding) + { + delete m_pencoding; + m_pencoding = nullptr; + } + } + + void Block4x4::Init() { + m_pimageSource = nullptr; + m_pencoding = nullptr; + + m_uiSourceH = 0; + m_uiSourceV = 0; + + m_sourcealphamix = SourceAlphaMix::UNKNOWN; + //m_boolBorderPixels = false; + m_boolPunchThroughPixels = false; + m_hasColorPixels = true; + + //m_errormetric = ErrorMetric::NUMERIC; + } + + Block4x4Encoding* Block4x4::NewEncoderIfNeeded(Image::Format format) + { + Block4x4Encoding* p_encoding = m_pencoding; + if (!p_encoding) + { + switch(format) { + case Image::Format::RGB8: + case Image::Format::SRGB8: + p_encoding = new Block4x4Encoding_RGB8; + break; + + case Image::Format::RGBA8: + case Image::Format::SRGBA8: + p_encoding = new Block4x4Encoding_RGBA8; + break; + + // don't really care about using ETC1 or A1 + case Image::Format::ETC1: + p_encoding = new Block4x4Encoding_ETC1; + break; + + case Image::Format::RGB8A1: + case Image::Format::SRGB8A1: + p_encoding = new Block4x4Encoding_RGB8A1; + break; + + default: + assert(false); + break; + } + } + return p_encoding; + } + + void Block4x4::Encode(Image *a_pimageSource, + unsigned int a_uiSourceH, unsigned int a_uiSourceV, + unsigned char *a_paucEncodingBits) + { + // this is use the same encoding over and over, so don't delete existing + Etc::Block4x4Encoding* p_encoding = NewEncoderIfNeeded(a_pimageSource->GetFormat()); + ErrorMetric errorMetric = a_pimageSource->GetErrorMetric(); + + m_pencoding = nullptr; + Block4x4::Init(); + + m_pimageSource = a_pimageSource; + + m_uiSourceH = a_uiSourceH; + m_uiSourceV = a_uiSourceV; + //m_errormetric = errorMetric; + m_pencoding = p_encoding; + + SetSourcePixels(); + + m_pencoding->Encode(this, m_afrgbaSource, + a_paucEncodingBits, errorMetric); + + } + + + // ---------------------------------------------------------------------------------------------------- + // initialization of encoding state from a prior encoding using encoding bits + // [a_uiSourceH,a_uiSourceV] is the location of the block in a_pimageSource + // a_paucEncodingBits is the place to read the prior encoding + // a_imageformat is used to determine how to interpret a_paucEncodingBits + // a_errormetric was used for the prior encoding + // + void Block4x4::Decode( + unsigned int a_uiSourceH, unsigned int a_uiSourceV, + unsigned char *a_paucEncodingBits, + Image *a_pimageSource, + uint16_t iterationCount) + { + // this is use the same encoding over and over, so don't delete existing + Etc::Block4x4Encoding* p_encoding = NewEncoderIfNeeded(a_pimageSource->GetFormat()); + ErrorMetric errorMetric = a_pimageSource->GetErrorMetric(); + + //delete m_pencoding; + m_pencoding = nullptr; + Block4x4::Init(); + + m_pimageSource = a_pimageSource; + + m_uiSourceH = a_uiSourceH; + m_uiSourceV = a_uiSourceV; + //m_errormetric = errorMetric; + m_pencoding = p_encoding; + + if (m_pimageSource->HasSourcePixels()) { + SetSourcePixels(); + + m_pencoding->Decode(this, a_paucEncodingBits, m_afrgbaSource, errorMetric, iterationCount); + } + else { + // pure decode + m_pencoding->Decode(this, a_paucEncodingBits, nullptr, errorMetric, iterationCount); 
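A note on the vertical-scan comment above: the block is stored transposed (column-major, index = x*4 + y), and s_auiPixelOrderHScan is simply that layout read back row by row. An illustrative standalone sketch (not from the patch):

#include <cstdio>

int main()
{
    // Reading the transposed storage in horizontal (row-major) order reproduces
    // the s_auiPixelOrderHScan table: 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15.
    for (int y = 0; y < 4; y++)
    {
        for (int x = 0; x < 4; x++)
        {
            printf("%2d ", x * 4 + y);
        }
        printf("\n");
    }
    return 0;
}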
+ } + + } + + // ---------------------------------------------------------------------------------------------------- + // set source pixels from m_pimageSource + // set m_alphamix + // + void Block4x4::SetSourcePixels(void) + { + // copy source to consecutive memory locations + // convert from image horizontal scan to block vertical scan + int uiPixel = 0; + for (int x = 0; x < 4; x++) + { + int uiSourcePixelH = m_uiSourceH + x; + + for (int y = 0; y < 4; y++) + { + int uiSourcePixelV = m_uiSourceV + y; + + ColorFloatRGBA pfrgbaSource = m_pimageSource->GetSourcePixel(uiSourcePixelH, uiSourcePixelV); + + ColorFloatRGBA& sourcePixel = m_afrgbaSource[uiPixel]; + sourcePixel = pfrgbaSource; + uiPixel++; + } + } + + //---------------------------------------- + + m_hasColorPixels = false; + for (uiPixel = 0; uiPixel < 16; ++uiPixel) + { + ColorFloatRGBA& sourcePixel = m_afrgbaSource[uiPixel]; + + // this is doing fp equality + if (sourcePixel.fR != sourcePixel.fG || sourcePixel.fR != sourcePixel.fB) + { + m_hasColorPixels = true; + break; + } + } + + //---------------------------------------- + + // alpha census + int uiTransparentSourcePixels = 0; + int uiOpaqueSourcePixels = 0; + + Image::Format imageformat = m_pimageSource->GetFormat(); + + for (uiPixel = 0; uiPixel < 16; ++uiPixel) + { + ColorFloatRGBA& sourcePixel = m_afrgbaSource[uiPixel]; + + // for formats with no alpha, set source alpha to 1 + if (imageformat == Image::Format::ETC1 || + imageformat == Image::Format::RGB8 || + imageformat == Image::Format::SRGB8) + { + sourcePixel.fA = 1.0f; + } + + // for RGB8A1, set source alpha to 0.0 or 1.0 + // set punch through flag + else if (imageformat == Image::Format::RGB8A1 || + imageformat == Image::Format::SRGB8A1) + { + if (sourcePixel.fA >= 0.5f) + { + sourcePixel.fA = 1.0f; + } + else + { + sourcePixel.fA = 0.0f; + m_boolPunchThroughPixels = true; + } + } + + if (sourcePixel.fA == 1.0f) + { + uiOpaqueSourcePixels++; + } + else if (sourcePixel.fA == 0.0f) + { + // TODO: an assumption here that R/G/B are 0, but with multichannel that's not the case + // A could be all 0, but rgb contain valid channel content + uiTransparentSourcePixels++; + } + } + + // This only applies for RGBA (premul weighted calcs) + if (uiOpaqueSourcePixels == PIXELS) + { + m_sourcealphamix = SourceAlphaMix::OPAQUE; + } + else if (uiTransparentSourcePixels == PIXELS) + { + // TODO: could check rgb for all 0, and then set TRANSPARENT + m_sourcealphamix = SourceAlphaMix::TRANSPARENT; + + // TODO: nothing setting ALL_ZERO_ALPHA. Could look at all rgb to identify that. + + //(m_pimageSource->GetErrorMetric() == ErrorMetric::NUMERIC || m_pimageSource->GetErrorMetric() == ErrorMetric::RGBX) ? 
SourceAlphaMix::ALL_ZERO_ALPHA : + // SourceAlphaMix::TRANSPARENT; + } + else + { + m_sourcealphamix = SourceAlphaMix::TRANSLUCENT; + } + + } + + // ---------------------------------------------------------------------------------------------------- + // return a name for the encoding mode + // +// const char * Block4x4::GetEncodingModeName(void) +// { +// +// switch (m_pencoding->GetMode()) +// { +// case Block4x4Encoding::MODE_ETC1: +// return "ETC1"; +// case Block4x4Encoding::MODE_T: +// return "T"; +// case Block4x4Encoding::MODE_H: +// return "H"; +// case Block4x4Encoding::MODE_PLANAR: +// return "PLANAR"; +// default: +// return "???"; +// } +// } + + // ---------------------------------------------------------------------------------------------------- + // + +} diff --git a/libkram/etc2comp/EtcBlock4x4.h b/libkram/etc2comp/EtcBlock4x4.h index 3e869938..518a7ba7 100644 --- a/libkram/etc2comp/EtcBlock4x4.h +++ b/libkram/etc2comp/EtcBlock4x4.h @@ -1,132 +1,132 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -//#include "EtcColor.h" -#include "EtcColorFloatRGBA.h" -//#include "EtcErrorMetric.h" -#include "EtcImage.h" -#include "EtcBlock4x4Encoding.h" - -namespace Etc -{ - //class Block4x4Encoding; - - // This base holds a 4x4 block, and is only used for RGB/RGBA encodings - class Block4x4 - { - public: - - static const unsigned int ROWS = 4; - static const unsigned int COLUMNS = 4; - static const unsigned int PIXELS = ROWS * COLUMNS; - - // the alpha mix for a 4x4 block of pixels - enum class SourceAlphaMix - { - UNKNOWN, - // - OPAQUE, // all 1.0 - TRANSPARENT, // all channels 0.0 - TRANSLUCENT, // not all opaque or transparent - ALL_ZERO_ALPHA // used for multichannel where all A = 0, but rgb contain data - }; - - typedef void (Block4x4::*EncoderFunctionPtr)(void); - - Block4x4(void); - ~Block4x4(); - - // called on first init of a block with/without multipass - void Encode(Image *a_pimageSource, - unsigned int a_uiSourceH, - unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits - ); - - // used on subsequent passes with multipass to decode from block for subsequent encodes - void Decode(unsigned int a_uiSourceH, - unsigned int a_uiSourceV, - unsigned char *a_paucEncodingBits, - Image *a_pimageSource, - uint16_t iterationCount); - - inline Block4x4Encoding * GetEncoding(void) - { - return m_pencoding; - } - - //---------------------- - - inline unsigned int GetSourceH(void) const - { - return m_uiSourceH; - } - - inline unsigned int GetSourceV(void) const - { - return m_uiSourceV; - } - - inline const ColorFloatRGBA * GetSource() const - { - return m_afrgbaSource; - } - - inline SourceAlphaMix GetSourceAlphaMix(void) const - { - return m_sourcealphamix; // or return from m_pimageSource->GetSourceAlphaMix() - } - - inline const Image * GetImageSource(void) const - { - return m_pimageSource; - } - - inline bool HasPunchThroughPixels(void) const - { - return m_boolPunchThroughPixels; - } - - // gray 
vs. color - inline bool HasColorPixels(void) const - { - return m_hasColorPixels; - } - - private: - Block4x4Encoding* NewEncoderIfNeeded(Image::Format format); - void Init(); - - void SetSourcePixels(void); - - static const uint8_t s_auiPixelOrderHScan[PIXELS]; - - Image *m_pimageSource; - unsigned int m_uiSourceH; - unsigned int m_uiSourceV; - ColorFloatRGBA m_afrgbaSource[PIXELS]; // vertical scan (Not std. pixel order, it's stored transposed) - - SourceAlphaMix m_sourcealphamix; - bool m_boolPunchThroughPixels; // RGB8A1 or SRGB8A1 with any pixels with alpha < 0.5 - bool m_hasColorPixels; - - Block4x4Encoding *m_pencoding; - - }; - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +//#include "EtcColor.h" +#include "EtcColorFloatRGBA.h" +//#include "EtcErrorMetric.h" +#include "EtcImage.h" +#include "EtcBlock4x4Encoding.h" + +namespace Etc +{ + //class Block4x4Encoding; + + // This base holds a 4x4 block, and is only used for RGB/RGBA encodings + class Block4x4 + { + public: + + static const unsigned int ROWS = 4; + static const unsigned int COLUMNS = 4; + static const unsigned int PIXELS = ROWS * COLUMNS; + + // the alpha mix for a 4x4 block of pixels + enum class SourceAlphaMix + { + UNKNOWN, + // + OPAQUE, // all 1.0 + TRANSPARENT, // all channels 0.0 + TRANSLUCENT, // not all opaque or transparent + ALL_ZERO_ALPHA // used for multichannel where all A = 0, but rgb contain data + }; + + typedef void (Block4x4::*EncoderFunctionPtr)(void); + + Block4x4(void); + ~Block4x4(); + + // called on first init of a block with/without multipass + void Encode(Image *a_pimageSource, + unsigned int a_uiSourceH, + unsigned int a_uiSourceV, + unsigned char *a_paucEncodingBits + ); + + // used on subsequent passes with multipass to decode from block for subsequent encodes + void Decode(unsigned int a_uiSourceH, + unsigned int a_uiSourceV, + unsigned char *a_paucEncodingBits, + Image *a_pimageSource, + uint16_t iterationCount); + + inline Block4x4Encoding * GetEncoding(void) + { + return m_pencoding; + } + + //---------------------- + + inline unsigned int GetSourceH(void) const + { + return m_uiSourceH; + } + + inline unsigned int GetSourceV(void) const + { + return m_uiSourceV; + } + + inline const ColorFloatRGBA * GetSource() const + { + return m_afrgbaSource; + } + + inline SourceAlphaMix GetSourceAlphaMix(void) const + { + return m_sourcealphamix; // or return from m_pimageSource->GetSourceAlphaMix() + } + + inline const Image * GetImageSource(void) const + { + return m_pimageSource; + } + + inline bool HasPunchThroughPixels(void) const + { + return m_boolPunchThroughPixels; + } + + // gray vs. 
color + inline bool HasColorPixels(void) const + { + return m_hasColorPixels; + } + + private: + Block4x4Encoding* NewEncoderIfNeeded(Image::Format format); + void Init(); + + void SetSourcePixels(void); + + static const uint8_t s_auiPixelOrderHScan[PIXELS]; + + Image *m_pimageSource; + unsigned int m_uiSourceH; + unsigned int m_uiSourceV; + ColorFloatRGBA m_afrgbaSource[PIXELS]; // vertical scan (Not std. pixel order, it's stored transposed) + + SourceAlphaMix m_sourcealphamix; + bool m_boolPunchThroughPixels; // RGB8A1 or SRGB8A1 with any pixels with alpha < 0.5 + bool m_hasColorPixels; + + Block4x4Encoding *m_pencoding; + + }; + +} // namespace Etc diff --git a/libkram/etc2comp/EtcBlock4x4Encoding.cpp b/libkram/etc2comp/EtcBlock4x4Encoding.cpp index fcbf5ee9..2a0068b6 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding.cpp @@ -1,124 +1,124 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding.cpp - -Block4x4Encoding is the abstract base class for the different encoders. Each encoder targets a -particular file format (e.g. ETC1, RGB8, RGBA8, R11) - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" - -#include -#include -#include - -namespace Etc -{ - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding::Block4x4Encoding(void) - { - Init(); - } - - void Block4x4Encoding::Init() - { - m_pblockParent = nullptr; - - m_pafrgbaSource = nullptr; - - m_fError = 0.0f; - - m_mode = MODE_UNKNOWN; - - m_uiEncodingIterations = 0; - m_boolDone = false; - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(0.0f, 0.0f, 0.0f, 1.0f); - } - } - - // ---------------------------------------------------------------------------------------------------- - // initialize the generic encoding for a 4x4 block - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // init the decoded pixels to -1 to mark them as undefined - // init the error to -1 to mark it as undefined - // - void Block4x4Encoding::Init(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount) - { - Init(); - - m_pblockParent = a_pblockParent; - m_pafrgbaSource = a_pafrgbaSource; - m_errormetric = a_errormetric; - - m_uiEncodingIterations = iterationCount; - } - - // ---------------------------------------------------------------------------------------------------- - - void Block4x4Encoding::SetDoneIfPerfect() - { - float kErrorTolerance = 0.0f; - - // instead of comparing to 0 which is almost never achieved in float, - // use a normalized 8-bit tolerance. See A8 and R11 code for kErrorTolerance. 
-#define ADD_TOLERANCE 1 -#if ADD_TOLERANCE - // 16 pixels accumulated, all within 1/255 of final value, and then weights - static const float kErrorToleranceRec709 = (1.0f / 255.0f) * (1.0f / 255.0f) * 5.0f * 16.0f; - static const float kErrorToleranceNumeric = (1.0f / 255.0f) * (1.0f / 255.0f) * 3.0f * 16.0f; - static const float kErrorToleranceGray = (1.0f / 255.0f) * (1.0f / 255.0f) * 1.0f * 16.0f; - - switch(m_errormetric) - { - case ErrorMetric::GRAY: - kErrorTolerance = kErrorToleranceGray; - break; - case ErrorMetric::NUMERIC: - kErrorTolerance = kErrorToleranceNumeric; - break; - case ErrorMetric::REC709: - kErrorTolerance = kErrorToleranceRec709; - break; - } -#endif - - assert(m_fError >= 0.0f); - if (m_fError <= kErrorTolerance) - { - m_boolDone = true; - } - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc - +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcBlock4x4Encoding.cpp + +Block4x4Encoding is the abstract base class for the different encoders. Each encoder targets a +particular file format (e.g. ETC1, RGB8, RGBA8, R11) + +*/ + +#include "EtcConfig.h" +#include "EtcBlock4x4Encoding.h" + +#include "EtcBlock4x4EncodingBits.h" +#include "EtcBlock4x4.h" + +#include +#include +#include + +namespace Etc +{ + // ---------------------------------------------------------------------------------------------------- + // + Block4x4Encoding::Block4x4Encoding(void) + { + Init(); + } + + void Block4x4Encoding::Init() + { + m_pblockParent = nullptr; + + m_pafrgbaSource = nullptr; + + m_fError = 0.0f; + + m_mode = MODE_UNKNOWN; + + m_uiEncodingIterations = 0; + m_boolDone = false; + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(0.0f, 0.0f, 0.0f, 1.0f); + } + } + + // ---------------------------------------------------------------------------------------------------- + // initialize the generic encoding for a 4x4 block + // a_pblockParent points to the block associated with this encoding + // a_errormetric is used to choose the best encoding + // init the decoded pixels to -1 to mark them as undefined + // init the error to -1 to mark it as undefined + // + void Block4x4Encoding::Init(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount) + { + Init(); + + m_pblockParent = a_pblockParent; + m_pafrgbaSource = a_pafrgbaSource; + m_errormetric = a_errormetric; + + m_uiEncodingIterations = iterationCount; + } + + // ---------------------------------------------------------------------------------------------------- + + void Block4x4Encoding::SetDoneIfPerfect() + { + float kErrorTolerance = 0.0f; + + // instead of comparing to 0 which is almost never achieved in float, + // use a normalized 8-bit tolerance. See A8 and R11 code for kErrorTolerance. 
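To put numbers on the tolerance comment above: with 16 pixels each allowed one 8-bit step of error per weighted channel, the kErrorTolerance constants below work out to roughly 1.2e-3 (REC709), 7.4e-4 (NUMERIC), and 2.5e-4 (GRAY). A quick standalone check (illustrative only, not from the patch):

#include <cstdio>

int main()
{
    const float step = 1.0f / 255.0f;                  // one 8-bit quantization step
    const float rec709  = step * step * 5.0f * 16.0f;  // ~1.23e-3
    const float numeric = step * step * 3.0f * 16.0f;  // ~7.38e-4
    const float gray    = step * step * 1.0f * 16.0f;  // ~2.46e-4
    printf("%g %g %g\n", rec709, numeric, gray);
    return 0;
}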
+#define ADD_TOLERANCE 1 +#if ADD_TOLERANCE + // 16 pixels accumulated, all within 1/255 of final value, and then weights + static const float kErrorToleranceRec709 = (1.0f / 255.0f) * (1.0f / 255.0f) * 5.0f * 16.0f; + static const float kErrorToleranceNumeric = (1.0f / 255.0f) * (1.0f / 255.0f) * 3.0f * 16.0f; + static const float kErrorToleranceGray = (1.0f / 255.0f) * (1.0f / 255.0f) * 1.0f * 16.0f; + + switch(m_errormetric) + { + case ErrorMetric::GRAY: + kErrorTolerance = kErrorToleranceGray; + break; + case ErrorMetric::NUMERIC: + kErrorTolerance = kErrorToleranceNumeric; + break; + case ErrorMetric::REC709: + kErrorTolerance = kErrorToleranceRec709; + break; + } +#endif + + assert(m_fError >= 0.0f); + if (m_fError <= kErrorTolerance) + { + m_boolDone = true; + } + } + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc + diff --git a/libkram/etc2comp/EtcBlock4x4Encoding.h b/libkram/etc2comp/EtcBlock4x4Encoding.h index 33bfe880..91f0cf6a 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding.h +++ b/libkram/etc2comp/EtcBlock4x4Encoding.h @@ -1,220 +1,220 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcColorFloatRGBA.h" - -#include "EtcErrorMetric.h" - -#include -#include - -namespace Etc -{ - class Block4x4; - - // abstract base class only for rgb/a encodings - class Block4x4Encoding - { - public: - - static const int ROWS = 4; - static const int COLUMNS = 4; - static const int PIXELS = ROWS * COLUMNS; - - typedef enum - { - MODE_UNKNOWN, - // - MODE_ETC1, - MODE_T, - MODE_H, - MODE_PLANAR, - MODES - } Mode; - - Block4x4Encoding(void); - virtual ~Block4x4Encoding(void) {} - - virtual void Encode(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric) = 0; - - virtual void Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount) = 0; - - // is is only called on S/RGBA format to copy alpha into decoded pixels of encoding - virtual void DecodeAlpha() { } - - // perform an iteration of the encoding - // the first iteration must generate a complete, valid (if poor) encoding - virtual void PerformIteration(float a_fEffort) = 0; - - // write output block - virtual void SetEncodingBits(void) = 0; - - // the count of the last iteration, can be useful in multipass encoding - inline uint16_t GetIterationCount(void) const - { - return m_uiEncodingIterations; - } - - //------------------- - - //void CalcBlockError(void); - //float CalcPixelError(const ColorFloatRGBA& a_frgbaDecodedColor, int uiPixel) const; - - inline float GetError(void) const - { - return m_fError; - } - - inline Mode GetMode(void) const - { - return m_mode; - } - - inline bool IsDone(void) const - { - return m_boolDone; - } - - void SetDoneIfPerfect(); - - - inline const 
ColorFloatRGBA& GetDecodedPixel(int uiPixel) const - { - return m_afrgbaDecodedColors[uiPixel]; - } - - // CalcPixelError is a major hotspot. Called in inner loops. - // calculate the error between the source pixel and the decoded pixel - // the error amount is base on the error metric - inline float CalcPixelError(const ColorFloatRGBA& encodedPixel, - int uiPixel) const - { - - const ColorFloatRGBA& sourcePixel = m_pafrgbaSource[uiPixel]; - float error = 0.0f; - - // don't use alpha in any calcs. This is only RGB error. - - switch(m_errormetric) - { - - case ErrorMetric::GRAY: - { - error = encodedPixel.fR - sourcePixel.fR; - error *= error; - - break; - } - - case ErrorMetric::REC709: - case ErrorMetric::NUMERIC: - { - float fDX = encodedPixel.fR - sourcePixel.fR; - float fDY = encodedPixel.fG - sourcePixel.fG; - float fDZ = encodedPixel.fB - sourcePixel.fB; - - error = fDX*fDX + fDY*fDY + fDZ*fDZ; - break; - } - - /* This slows down encoding 28s vs. 20s when not inlined, so stop using it - also the src isn't cached. - - case ErrorMetric::REC709: - { - //assert(a_fDecodedAlpha >= 0.0f); - - // YCbCr of source and encodedColor - // TODO: could cache sourcePixel values to move out of loops - float fLuma1 = sourcePixel.fR*0.2126f + sourcePixel.fG*0.7152f + sourcePixel.fB*0.0722f; - float fChromaR1 = (sourcePixel.fR - fLuma1) * (0.5f / (1.0f - 0.2126f)); - float fChromaB1 = (sourcePixel.fB - fLuma1) * (0.5f / (1.0f - 0.0722f)); - - float fLuma2 = encodedPixel.fR*0.2126f + encodedPixel.fG*0.7152f + encodedPixel.fB*0.0722f; - float fChromaR2 = (encodedPixel.fR - fLuma2) * (0.5f / (1.0f - 0.2126f)); - float fChromaB2 = (encodedPixel.fB - fLuma2) * (0.5f / (1.0f - 0.0722f)); - - float fDeltaL = fLuma1 - fLuma2; - float fDeltaCr = fChromaR1 - fChromaR2; - float fDeltaCb = fChromaB1 - fChromaB2; - - const float LUMA_WEIGHT = 3.0f; - const float CHROMA_RED_WEIGHT = 1.0f; - const float CHROMA_BLUE_WEIGHT = 1.0f; - - // Favor Luma accuracy over Chroma - error = LUMA_WEIGHT * fDeltaL*fDeltaL + - CHROMA_RED_WEIGHT * fDeltaCr*fDeltaCr + - CHROMA_BLUE_WEIGHT * fDeltaCb*fDeltaCb; - - break; - } - */ - - } - - return error; - } - - // CalcBlockError is a major hotspot. Called in inner loops. - // calculate the error for the block by summing the pixel errors - inline void CalcBlockError(void) - { - m_fError = 0.0f; - - if (m_pafrgbaSource) - { - for (int uiPixel = 0; uiPixel < (int)PIXELS; uiPixel++) - { - m_fError += CalcPixelError(m_afrgbaDecodedColors[uiPixel], uiPixel); - } - } - } - - protected: - void Init(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount); - - Block4x4 *m_pblockParent; - const ColorFloatRGBA *m_pafrgbaSource; - - ColorFloatRGBA m_afrgbaDecodedColors[PIXELS]; // decoded RGB components, ignore Alpha - float m_fError; // error for RGB relative to m_pafrgbaSource.rgb - - // intermediate encoding - Mode m_mode; - - unsigned int m_uiEncodingIterations; - bool m_boolDone; // all iterations have been done - ErrorMetric m_errormetric; - - private: - void Init(); - - }; - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "EtcColorFloatRGBA.h" + +#include "EtcErrorMetric.h" + +#include +#include + +namespace Etc +{ + class Block4x4; + + // abstract base class only for rgb/a encodings + class Block4x4Encoding + { + public: + + static const int ROWS = 4; + static const int COLUMNS = 4; + static const int PIXELS = ROWS * COLUMNS; + + typedef enum + { + MODE_UNKNOWN, + // + MODE_ETC1, + MODE_T, + MODE_H, + MODE_PLANAR, + MODES + } Mode; + + Block4x4Encoding(void); + virtual ~Block4x4Encoding(void) {} + + virtual void Encode(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, + ErrorMetric a_errormetric) = 0; + + virtual void Decode(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount) = 0; + + // is is only called on S/RGBA format to copy alpha into decoded pixels of encoding + virtual void DecodeAlpha() { } + + // perform an iteration of the encoding + // the first iteration must generate a complete, valid (if poor) encoding + virtual void PerformIteration(float a_fEffort) = 0; + + // write output block + virtual void SetEncodingBits(void) = 0; + + // the count of the last iteration, can be useful in multipass encoding + inline uint16_t GetIterationCount(void) const + { + return m_uiEncodingIterations; + } + + //------------------- + + //void CalcBlockError(void); + //float CalcPixelError(const ColorFloatRGBA& a_frgbaDecodedColor, int uiPixel) const; + + inline float GetError(void) const + { + return m_fError; + } + + inline Mode GetMode(void) const + { + return m_mode; + } + + inline bool IsDone(void) const + { + return m_boolDone; + } + + void SetDoneIfPerfect(); + + + inline const ColorFloatRGBA& GetDecodedPixel(int uiPixel) const + { + return m_afrgbaDecodedColors[uiPixel]; + } + + // CalcPixelError is a major hotspot. Called in inner loops. + // calculate the error between the source pixel and the decoded pixel + // the error amount is base on the error metric + inline float CalcPixelError(const ColorFloatRGBA& encodedPixel, + int uiPixel) const + { + + const ColorFloatRGBA& sourcePixel = m_pafrgbaSource[uiPixel]; + float error = 0.0f; + + // don't use alpha in any calcs. This is only RGB error. + + switch(m_errormetric) + { + + case ErrorMetric::GRAY: + { + error = encodedPixel.fR - sourcePixel.fR; + error *= error; + + break; + } + + case ErrorMetric::REC709: + case ErrorMetric::NUMERIC: + { + float fDX = encodedPixel.fR - sourcePixel.fR; + float fDY = encodedPixel.fG - sourcePixel.fG; + float fDZ = encodedPixel.fB - sourcePixel.fB; + + error = fDX*fDX + fDY*fDY + fDZ*fDZ; + break; + } + + /* This slows down encoding 28s vs. 20s when not inlined, so stop using it + also the src isn't cached. 
+ + case ErrorMetric::REC709: + { + //assert(a_fDecodedAlpha >= 0.0f); + + // YCbCr of source and encodedColor + // TODO: could cache sourcePixel values to move out of loops + float fLuma1 = sourcePixel.fR*0.2126f + sourcePixel.fG*0.7152f + sourcePixel.fB*0.0722f; + float fChromaR1 = (sourcePixel.fR - fLuma1) * (0.5f / (1.0f - 0.2126f)); + float fChromaB1 = (sourcePixel.fB - fLuma1) * (0.5f / (1.0f - 0.0722f)); + + float fLuma2 = encodedPixel.fR*0.2126f + encodedPixel.fG*0.7152f + encodedPixel.fB*0.0722f; + float fChromaR2 = (encodedPixel.fR - fLuma2) * (0.5f / (1.0f - 0.2126f)); + float fChromaB2 = (encodedPixel.fB - fLuma2) * (0.5f / (1.0f - 0.0722f)); + + float fDeltaL = fLuma1 - fLuma2; + float fDeltaCr = fChromaR1 - fChromaR2; + float fDeltaCb = fChromaB1 - fChromaB2; + + const float LUMA_WEIGHT = 3.0f; + const float CHROMA_RED_WEIGHT = 1.0f; + const float CHROMA_BLUE_WEIGHT = 1.0f; + + // Favor Luma accuracy over Chroma + error = LUMA_WEIGHT * fDeltaL*fDeltaL + + CHROMA_RED_WEIGHT * fDeltaCr*fDeltaCr + + CHROMA_BLUE_WEIGHT * fDeltaCb*fDeltaCb; + + break; + } + */ + + } + + return error; + } + + // CalcBlockError is a major hotspot. Called in inner loops. + // calculate the error for the block by summing the pixel errors + inline void CalcBlockError(void) + { + m_fError = 0.0f; + + if (m_pafrgbaSource) + { + for (int uiPixel = 0; uiPixel < (int)PIXELS; uiPixel++) + { + m_fError += CalcPixelError(m_afrgbaDecodedColors[uiPixel], uiPixel); + } + } + } + + protected: + void Init(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount); + + Block4x4 *m_pblockParent; + const ColorFloatRGBA *m_pafrgbaSource; + + ColorFloatRGBA m_afrgbaDecodedColors[PIXELS]; // decoded RGB components, ignore Alpha + float m_fError; // error for RGB relative to m_pafrgbaSource.rgb + + // intermediate encoding + Mode m_mode; + + unsigned int m_uiEncodingIterations; + bool m_boolDone; // all iterations have been done + ErrorMetric m_errormetric; + + private: + void Init(); + + }; + +} // namespace Etc diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_R11.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_R11.cpp index cb37505c..957967ba 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_R11.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_R11.cpp @@ -1,544 +1,544 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_R11.cpp - -Block4x4Encoding_R11 is the encoder to use when targetting file format R11 and SR11 (signed R11). - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_R11.h" - -#include "EtcBlock4x4EncodingBits.h" -//#include "EtcBlock4x4.h" - -#include -#include -#include -#include -#include -//#include - -namespace Etc -{ - template - T clamp(T value, T mn, T mx) { - return (value <= mn) ? mn : ((value >= mx) ? 
mx : value); - } - - const int MODIFIER_TABLE_ENTRYS = 16; - const int SELECTOR_BITS = 3; - const int SELECTORS = 1 << SELECTOR_BITS; - - // modifier values to use for R11, SR11, RG11 and SRG11 - const int8_t s_modifierTable8[MODIFIER_TABLE_ENTRYS][SELECTORS] - { - { -3, -6, -9, -15, 2, 5, 8, 14 }, - { -3, -7, -10, -13, 2, 6, 9, 12 }, - { -2, -5, -8, -13, 1, 4, 7, 12 }, - { -2, -4, -6, -13, 1, 3, 5, 12 }, - - { -3, -6, -8, -12, 2, 5, 7, 11 }, - { -3, -7, -9, -11, 2, 6, 8, 10 }, - { -4, -7, -8, -11, 3, 6, 7, 10 }, - { -3, -5, -8, -11, 2, 4, 7, 10 }, - - { -2, -6, -8, -10, 1, 5, 7, 9 }, - { -2, -5, -8, -10, 1, 4, 7, 9 }, - { -2, -4, -8, -10, 1, 3, 7, 9 }, - { -2, -5, -7, -10, 1, 4, 6, 9 }, - - { -3, -4, -7, -10, 2, 3, 6, 9 }, - { -1, -2, -3, -10, 0, 1, 2, 9 }, - { -4, -6, -8, -9, 3, 5, 7, 8 }, - { -3, -5, -7, -9, 2, 4, 6, 8 } - }; - - // this is simplified for interation - // stripped down, since it's one of the hotspots of encoding - inline int DecodePixelRedInt(int baseMul8Plus4, int multiplier, int modifier) - { - int pixel = baseMul8Plus4 + modifier * multiplier; - - // see here - // https://www.khronos.org/registry/DataFormat/specs/1.1/dataformat.1.1.html - -// if (multiplier > 0) -// { -// //fPixel = (a_fBase * 8 + 4) + 8 * fModifier * a_fMultiplier; -// pixel = baseMul8Plus4 + 8 * modifier * multiplier; -// } -// else -// { -// //fPixel = (a_fBase * 8 + 4) + fModifier; -// pixel = baseMul8Plus4 + modifier; -// } - - // just to debug over range pixels -// if (pixel < 0 || pixel > 2047) -// { -// int bp = 0; -// bp = bp; -// } - - // modifier and multiplier can push base outside valid range, but hw clamps - pixel = clamp(pixel, 0, 2047); - return pixel; - } - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_R11::Block4x4Encoding_R11(void) - { - m_pencodingbitsR11 = nullptr; - } - - Block4x4Encoding_R11::~Block4x4Encoding_R11(void) {} - - // ---------------------------------------------------------------------------------------------------- - void Block4x4Encoding_R11::Encode( - const float *sourcePixels, - uint8_t *encodingBits, - bool isSnorm - ) - { - int numSourceChannels = 4; // advance by 4 floats - - int fMinRed = 2047; - int fMaxRed = 0; - - // assumption of unorm float data for sourcePixels here - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int fRed = clamp((int)roundf(2047.0f * sourcePixels[numSourceChannels * uiPixel]), 0, 2047); - - if (fRed < fMinRed) - { - fMinRed = fRed; - } - if (fRed > fMaxRed) - { - fMaxRed = fRed; - } - - m_srcPixels[uiPixel] = fRed; - } - - m_redMin = fMinRed; - m_redMax = fMaxRed; - - // now setup for iteration - m_uiEncodingIterations = 0; - m_fError = FLT_MAX; - m_isDone = false; - m_isSnorm = isSnorm; - - m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)encodingBits; - } - - // ---------------------------------------------------------------------------------------------------- - void Block4x4Encoding_R11::Decode( - uint8_t *encodingBits, - const float *sourcePixels, - bool isSnorm, - uint16_t lastIterationCount - ) - { - m_isDone = (lastIterationCount & 0x80) != 0; // done high bit - - if (m_isDone) - { - m_pencodingbitsR11 = nullptr; // skip decode/encode on partially done block - m_fError = 0.0f; - return; - } - - m_uiEncodingIterations = lastIterationCount; - - // everything is re-established from the encoded block and iteration count - // since we already have to allocate the block storage, an iteration count per block is only additional - // also 
encoders are now across all blocks, so could just allocate one block per thread and iterate until - // done and skip the priority system. - // - // Note: don't call this on done blocks and then iterate, or iteration count will advance - // m_isDone is set to false in the Encode. Priority queue should ignore done blocks already. - - m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)encodingBits; - m_isSnorm = isSnorm; - - if (m_isSnorm) - { - m_redBase = (int8_t)m_pencodingbitsR11->data.base + 128; - } - else - { - m_redBase = (uint8_t)m_pencodingbitsR11->data.base; - } - - m_redMultiplier = m_pencodingbitsR11->data.multiplier; - m_redModifierTableIndex = m_pencodingbitsR11->data.table; - - uint64_t selectorBits = 0; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors0 << (uint64_t)40; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors1 << (uint64_t)32; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors2 << (uint64_t)24; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors3 << (uint64_t)16; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors4 << (uint64_t)8; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors5; - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - uint64_t uiShift = 45 - (3 * uiPixel); - m_redSelectors[uiPixel] = (selectorBits >> uiShift) & (uint64_t)(SELECTORS - 1); - } - - // call this to continue encoding later iterations - Encode(sourcePixels, encodingBits, isSnorm); - - // recompute the block error by decoding each pixel - // could save out error to SortedBlock avoid needing to compute all this - // but would need to store r and g error separately. - int blockError = 0; - - int baseForDecode = m_redBase * 8 + 4; - int multiplierForDecode = (m_redMultiplier == 0) ? 1 : (8 * m_redMultiplier); - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int modifier = s_modifierTable8[m_redModifierTableIndex][m_redSelectors[uiPixel]]; - - int decodedPixelData = DecodePixelRedInt(baseForDecode, multiplierForDecode, modifier); - - // add up the error - int error = decodedPixelData - m_srcPixels[uiPixel]; - blockError += error * error; - } - - m_fError = (float)blockError; - } - - void Block4x4Encoding_R11::DecodeOnly( - const uint8_t *encodingBits, - float *decodedPixels, - bool isSnorm) - { - m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)encodingBits; - m_isSnorm = isSnorm; - - if (m_isSnorm) - { - m_redBase = (int8_t)m_pencodingbitsR11->data.base + 128; - } - else - { - m_redBase = (uint8_t)m_pencodingbitsR11->data.base; - } - - m_redMultiplier = m_pencodingbitsR11->data.multiplier; - m_redModifierTableIndex = m_pencodingbitsR11->data.table; - - uint64_t selectorBits = 0; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors0 << (uint64_t)40; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors1 << (uint64_t)32; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors2 << (uint64_t)24; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors3 << (uint64_t)16; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors4 << (uint64_t)8; - selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors5; - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - uint64_t uiShift = 45 - (3 * uiPixel); - m_redSelectors[uiPixel] = (selectorBits >> uiShift) & (uint64_t)(SELECTORS - 1); - } - - // now extract the pixels from the block values above - int numChannels = 4; - - int baseForDecode = m_redBase * 8 + 4; - int multiplierForDecode = (m_redMultiplier == 0) ? 
1 : (8 * m_redMultiplier); - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int modifier = s_modifierTable8[m_redModifierTableIndex][m_redSelectors[uiPixel]]; - - int decodedPixelData = DecodePixelRedInt(baseForDecode, multiplierForDecode, modifier); - - decodedPixels[uiPixel * numChannels] = decodedPixelData / 2047.0f; - } - } - - // ---------------------------------------------------------------------------------------------------- - - // 16 pixels x 1 unit squared out of 2047 - const float kErrorTolerance = 16 * 1 * 1; - - void Block4x4Encoding_R11::PerformIteration(float a_fEffort) - { - if (m_pencodingbitsR11 == nullptr) - { - return; - } - - if (m_isDone) - { - return; - } - - switch (m_uiEncodingIterations) - { - case 0: - CalculateR11(8, 0, 0); - break; - - case 1: - CalculateR11(8, 2, 1); - if (a_fEffort <= 24.5f) // TODO: decouple effort from this, this is more of an iteration quality - { - m_isDone = true; - } - break; - - case 2: - CalculateR11(8, 12, 1); - if (a_fEffort <= 49.5f) - { - m_isDone = true; - } - break; - - case 3: - CalculateR11(7, 6, 1); - break; - - case 4: - CalculateR11(6, 3, 1); - break; - - case 5: - CalculateR11(5, 1, 0); - m_isDone = true; - break; - } - - // advance to next iteration - if (!m_isDone) - { - if (m_fError < kErrorTolerance) - { - m_isDone = true; - } - else - { - m_uiEncodingIterations++; - } - } - } - - // ---------------------------------------------------------------------------------------------------- - - // find the best combination of base color, multiplier and selectors - void Block4x4Encoding_R11::CalculateR11(unsigned int a_uiSelectorsUsed, - int a_fBaseRadius, int a_fMultiplierRadius) - { - // maps from virtual (monotonic) selector to ETC selector - static const uint8_t auiVirtualSelectorMap[8] = {3, 2, 1, 0, 4, 5, 6, 7}; - - // don't search any extra radius if range is 0 - // TODO: there's probably an instant "done" case here without any iteration - int fRedRange = (m_redMax - m_redMin); - - if (fRedRange == 0) - { - a_fBaseRadius = 0; - a_fMultiplierRadius = 0; - } - - // 16 x 8 x 3 x 16 x 16 x 8 iterations = 786K iteraatins / block worst case - - // try each modifier table entry - // 16 of these - for (int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++) - { - // up to 8 of these - for (int uiMinVirtualSelector = 0; - uiMinVirtualSelector <= (int)(8 - a_uiSelectorsUsed); - uiMinVirtualSelector++) - { - int uiMaxVirtualSelector = uiMinVirtualSelector + a_uiSelectorsUsed - 1; - - int uiMinSelector = auiVirtualSelectorMap[uiMinVirtualSelector]; - int uiMaxSelector = auiVirtualSelectorMap[uiMaxVirtualSelector]; - - int fTableEntryCenter = -s_modifierTable8[uiTableEntry][uiMinSelector]; - - int fTableEntryRange = s_modifierTable8[uiTableEntry][uiMaxSelector] - - s_modifierTable8[uiTableEntry][uiMinSelector]; - - float fCenterRatio = fTableEntryCenter / (float)fTableEntryRange; - float fCenter = m_redMin + fCenterRatio * fRedRange; - int fCenterInt = (int)roundf((255.0f/2047.0f) * fCenter); - - // base of 0 to 255 maps to 0 to 2047 - // search a radius of values off center of range - int fMinBase = fCenterInt - a_fBaseRadius; - int fMaxBase = fCenterInt + a_fBaseRadius; - if (fMinBase < 0) - { - fMinBase = 0; - } - if (fMaxBase > 255) - { - fMaxBase = 255; - } - - // 255 / up to 29 - int fRangeMultiplier = (int)roundf((fRedRange * (255.0 / 2047.0f)) / fTableEntryRange); - - int fMinMultiplier = clamp(fRangeMultiplier - a_fMultiplierRadius, 0, 15); // yes, 0 - int fMaxMultiplier = 
clamp(fRangeMultiplier + a_fMultiplierRadius, 1, 15); - - // find best selector for each pixel - uint8_t bestSelectors[PIXELS]; - int bestRedError[PIXELS]; - - // only for debug - //int bestPixelRed[PIXELS]; - - // up to 3 of these - for (int fBase = fMinBase; fBase <= fMaxBase; fBase++) - { - int baseForDecode = fBase * 8 + 4; - - // up to 16 of these - for (int fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier++) - { - int multiplierForDecode = (fMultiplier == 0) ? 1 : (8 * fMultiplier); - - // 16 of these - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int bestPixelError = 2047 * 2047; - - // 8 of these - for (int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - int modifier = s_modifierTable8[uiTableEntry][uiSelector]; - - int fPixelRed = DecodePixelRedInt(baseForDecode, multiplierForDecode, modifier); - - int error = fPixelRed - (int)m_srcPixels[uiPixel]; - error *= error; - - // this is guaranteed to pick one selector for every pixel - // the one with the lowest error. - if (error < bestPixelError) - { - bestPixelError = error; - bestRedError[uiPixel] = error; - bestSelectors[uiPixel] = uiSelector; - - //bestPixelRed[uiPixel] = fPixelRed; - } - } - } - - // accumulate all best pixel error into block error total - int blockError = 0; - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - blockError += bestRedError[uiPixel]; - } - - // pick that if it's the smallest error - if (m_fError > (float)blockError) - { - m_fError = (float)blockError; - - if (m_isSnorm) - { - m_redBase = fBase - 128; - } - else - { - m_redBase = fBase; - } - m_redMultiplier = fMultiplier; - m_redModifierTableIndex = uiTableEntry; - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_redSelectors[uiPixel] = bestSelectors[uiPixel]; - - // nothing looks at this data, but useful to compare to source - //m_decodedPixels[uiPixel] = bestPixelRed[uiPixel]; // / 2047.0f; - } - - - // compare to tolerance, since reaching 0 is difficult in float - if (m_fError <= kErrorTolerance) - { - return; - } - } - } - } - - } - } - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_R11::SetEncodingBits(void) - { - // skip encode if block is already done - if (m_pencodingbitsR11 == nullptr) - { - return; - } - - if (m_isSnorm) - { - m_pencodingbitsR11->data.base = (int8_t)m_redBase; - } - else - { - m_pencodingbitsR11->data.base = (uint8_t)m_redBase; - } - m_pencodingbitsR11->data.table = m_redModifierTableIndex; - m_pencodingbitsR11->data.multiplier = m_redMultiplier; - - uint64_t selectorBits = 0; - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - uint64_t uiShift = 45 - (3 * uiPixel); - selectorBits |= ((uint64_t)m_redSelectors[uiPixel]) << uiShift; - } - - m_pencodingbitsR11->data.selectors0 = uint32_t(selectorBits >> (uint64_t)40); - m_pencodingbitsR11->data.selectors1 = uint32_t(selectorBits >> (uint64_t)32); - m_pencodingbitsR11->data.selectors2 = uint32_t(selectorBits >> (uint64_t)24); - m_pencodingbitsR11->data.selectors3 = uint32_t(selectorBits >> (uint64_t)16); - m_pencodingbitsR11->data.selectors4 = uint32_t(selectorBits >> (uint64_t)8); - m_pencodingbitsR11->data.selectors5 = uint32_t(selectorBits); - } - - // ---------------------------------------------------------------------------------------------------- - // -} +/* + * Copyright 2015 The Etc2Comp Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcBlock4x4Encoding_R11.cpp + +Block4x4Encoding_R11 is the encoder to use when targetting file format R11 and SR11 (signed R11). + +*/ + +#include "EtcConfig.h" +#include "EtcBlock4x4Encoding_R11.h" + +#include "EtcBlock4x4EncodingBits.h" +//#include "EtcBlock4x4.h" + +#include +#include +#include +#include +#include +//#include + +namespace Etc +{ + template + T clamp(T value, T mn, T mx) { + return (value <= mn) ? mn : ((value >= mx) ? mx : value); + } + + const int MODIFIER_TABLE_ENTRYS = 16; + const int SELECTOR_BITS = 3; + const int SELECTORS = 1 << SELECTOR_BITS; + + // modifier values to use for R11, SR11, RG11 and SRG11 + const int8_t s_modifierTable8[MODIFIER_TABLE_ENTRYS][SELECTORS] + { + { -3, -6, -9, -15, 2, 5, 8, 14 }, + { -3, -7, -10, -13, 2, 6, 9, 12 }, + { -2, -5, -8, -13, 1, 4, 7, 12 }, + { -2, -4, -6, -13, 1, 3, 5, 12 }, + + { -3, -6, -8, -12, 2, 5, 7, 11 }, + { -3, -7, -9, -11, 2, 6, 8, 10 }, + { -4, -7, -8, -11, 3, 6, 7, 10 }, + { -3, -5, -8, -11, 2, 4, 7, 10 }, + + { -2, -6, -8, -10, 1, 5, 7, 9 }, + { -2, -5, -8, -10, 1, 4, 7, 9 }, + { -2, -4, -8, -10, 1, 3, 7, 9 }, + { -2, -5, -7, -10, 1, 4, 6, 9 }, + + { -3, -4, -7, -10, 2, 3, 6, 9 }, + { -1, -2, -3, -10, 0, 1, 2, 9 }, + { -4, -6, -8, -9, 3, 5, 7, 8 }, + { -3, -5, -7, -9, 2, 4, 6, 8 } + }; + + // this is simplified for interation + // stripped down, since it's one of the hotspots of encoding + inline int DecodePixelRedInt(int baseMul8Plus4, int multiplier, int modifier) + { + int pixel = baseMul8Plus4 + modifier * multiplier; + + // see here + // https://www.khronos.org/registry/DataFormat/specs/1.1/dataformat.1.1.html + +// if (multiplier > 0) +// { +// //fPixel = (a_fBase * 8 + 4) + 8 * fModifier * a_fMultiplier; +// pixel = baseMul8Plus4 + 8 * modifier * multiplier; +// } +// else +// { +// //fPixel = (a_fBase * 8 + 4) + fModifier; +// pixel = baseMul8Plus4 + modifier; +// } + + // just to debug over range pixels +// if (pixel < 0 || pixel > 2047) +// { +// int bp = 0; +// bp = bp; +// } + + // modifier and multiplier can push base outside valid range, but hw clamps + pixel = clamp(pixel, 0, 2047); + return pixel; + } + + // ---------------------------------------------------------------------------------------------------- + // + Block4x4Encoding_R11::Block4x4Encoding_R11(void) + { + m_pencodingbitsR11 = nullptr; + } + + Block4x4Encoding_R11::~Block4x4Encoding_R11(void) {} + + // ---------------------------------------------------------------------------------------------------- + void Block4x4Encoding_R11::Encode( + const float *sourcePixels, + uint8_t *encodingBits, + bool isSnorm + ) + { + int numSourceChannels = 4; // advance by 4 floats + + int fMinRed = 2047; + int fMaxRed = 0; + + // assumption of unorm float data for sourcePixels here + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int fRed = clamp((int)roundf(2047.0f * sourcePixels[numSourceChannels * uiPixel]), 0, 2047); + + if (fRed < fMinRed) + { + fMinRed = fRed; + } + if 
(fRed > fMaxRed) + { + fMaxRed = fRed; + } + + m_srcPixels[uiPixel] = fRed; + } + + m_redMin = fMinRed; + m_redMax = fMaxRed; + + // now setup for iteration + m_uiEncodingIterations = 0; + m_fError = FLT_MAX; + m_isDone = false; + m_isSnorm = isSnorm; + + m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)encodingBits; + } + + // ---------------------------------------------------------------------------------------------------- + void Block4x4Encoding_R11::Decode( + uint8_t *encodingBits, + const float *sourcePixels, + bool isSnorm, + uint16_t lastIterationCount + ) + { + m_isDone = (lastIterationCount & 0x80) != 0; // done high bit + + if (m_isDone) + { + m_pencodingbitsR11 = nullptr; // skip decode/encode on partially done block + m_fError = 0.0f; + return; + } + + m_uiEncodingIterations = lastIterationCount; + + // everything is re-established from the encoded block and iteration count + // since we already have to allocate the block storage, an iteration count per block is only additional + // also encoders are now across all blocks, so could just allocate one block per thread and iterate until + // done and skip the priority system. + // + // Note: don't call this on done blocks and then iterate, or iteration count will advance + // m_isDone is set to false in the Encode. Priority queue should ignore done blocks already. + + m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)encodingBits; + m_isSnorm = isSnorm; + + if (m_isSnorm) + { + m_redBase = (int8_t)m_pencodingbitsR11->data.base + 128; + } + else + { + m_redBase = (uint8_t)m_pencodingbitsR11->data.base; + } + + m_redMultiplier = m_pencodingbitsR11->data.multiplier; + m_redModifierTableIndex = m_pencodingbitsR11->data.table; + + uint64_t selectorBits = 0; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors0 << (uint64_t)40; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors1 << (uint64_t)32; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors2 << (uint64_t)24; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors3 << (uint64_t)16; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors4 << (uint64_t)8; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors5; + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + uint64_t uiShift = 45 - (3 * uiPixel); + m_redSelectors[uiPixel] = (selectorBits >> uiShift) & (uint64_t)(SELECTORS - 1); + } + + // call this to continue encoding later iterations + Encode(sourcePixels, encodingBits, isSnorm); + + // recompute the block error by decoding each pixel + // could save out error to SortedBlock avoid needing to compute all this + // but would need to store r and g error separately. + int blockError = 0; + + int baseForDecode = m_redBase * 8 + 4; + int multiplierForDecode = (m_redMultiplier == 0) ? 
1 : (8 * m_redMultiplier); + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int modifier = s_modifierTable8[m_redModifierTableIndex][m_redSelectors[uiPixel]]; + + int decodedPixelData = DecodePixelRedInt(baseForDecode, multiplierForDecode, modifier); + + // add up the error + int error = decodedPixelData - m_srcPixels[uiPixel]; + blockError += error * error; + } + + m_fError = (float)blockError; + } + + void Block4x4Encoding_R11::DecodeOnly( + const uint8_t *encodingBits, + float *decodedPixels, + bool isSnorm) + { + m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)encodingBits; + m_isSnorm = isSnorm; + + if (m_isSnorm) + { + m_redBase = (int8_t)m_pencodingbitsR11->data.base + 128; + } + else + { + m_redBase = (uint8_t)m_pencodingbitsR11->data.base; + } + + m_redMultiplier = m_pencodingbitsR11->data.multiplier; + m_redModifierTableIndex = m_pencodingbitsR11->data.table; + + uint64_t selectorBits = 0; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors0 << (uint64_t)40; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors1 << (uint64_t)32; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors2 << (uint64_t)24; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors3 << (uint64_t)16; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors4 << (uint64_t)8; + selectorBits |= (uint64_t)m_pencodingbitsR11->data.selectors5; + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + uint64_t uiShift = 45 - (3 * uiPixel); + m_redSelectors[uiPixel] = (selectorBits >> uiShift) & (uint64_t)(SELECTORS - 1); + } + + // now extract the pixels from the block values above + int numChannels = 4; + + int baseForDecode = m_redBase * 8 + 4; + int multiplierForDecode = (m_redMultiplier == 0) ? 1 : (8 * m_redMultiplier); + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int modifier = s_modifierTable8[m_redModifierTableIndex][m_redSelectors[uiPixel]]; + + int decodedPixelData = DecodePixelRedInt(baseForDecode, multiplierForDecode, modifier); + + decodedPixels[uiPixel * numChannels] = decodedPixelData / 2047.0f; + } + } + + // ---------------------------------------------------------------------------------------------------- + + // 16 pixels x 1 unit squared out of 2047 + const float kErrorTolerance = 16 * 1 * 1; + + void Block4x4Encoding_R11::PerformIteration(float a_fEffort) + { + if (m_pencodingbitsR11 == nullptr) + { + return; + } + + if (m_isDone) + { + return; + } + + switch (m_uiEncodingIterations) + { + case 0: + CalculateR11(8, 0, 0); + break; + + case 1: + CalculateR11(8, 2, 1); + if (a_fEffort <= 24.5f) // TODO: decouple effort from this, this is more of an iteration quality + { + m_isDone = true; + } + break; + + case 2: + CalculateR11(8, 12, 1); + if (a_fEffort <= 49.5f) + { + m_isDone = true; + } + break; + + case 3: + CalculateR11(7, 6, 1); + break; + + case 4: + CalculateR11(6, 3, 1); + break; + + case 5: + CalculateR11(5, 1, 0); + m_isDone = true; + break; + } + + // advance to next iteration + if (!m_isDone) + { + if (m_fError < kErrorTolerance) + { + m_isDone = true; + } + else + { + m_uiEncodingIterations++; + } + } + } + + // ---------------------------------------------------------------------------------------------------- + + // find the best combination of base color, multiplier and selectors + void Block4x4Encoding_R11::CalculateR11(unsigned int a_uiSelectorsUsed, + int a_fBaseRadius, int a_fMultiplierRadius) + { + // maps from virtual (monotonic) selector to ETC selector + static const uint8_t auiVirtualSelectorMap[8] 
= {3, 2, 1, 0, 4, 5, 6, 7}; + + // don't search any extra radius if range is 0 + // TODO: there's probably an instant "done" case here without any iteration + int fRedRange = (m_redMax - m_redMin); + + if (fRedRange == 0) + { + a_fBaseRadius = 0; + a_fMultiplierRadius = 0; + } + + // 16 x 8 x 3 x 16 x 16 x 8 iterations = 786K iteraatins / block worst case + + // try each modifier table entry + // 16 of these + for (int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++) + { + // up to 8 of these + for (int uiMinVirtualSelector = 0; + uiMinVirtualSelector <= (int)(8 - a_uiSelectorsUsed); + uiMinVirtualSelector++) + { + int uiMaxVirtualSelector = uiMinVirtualSelector + a_uiSelectorsUsed - 1; + + int uiMinSelector = auiVirtualSelectorMap[uiMinVirtualSelector]; + int uiMaxSelector = auiVirtualSelectorMap[uiMaxVirtualSelector]; + + int fTableEntryCenter = -s_modifierTable8[uiTableEntry][uiMinSelector]; + + int fTableEntryRange = s_modifierTable8[uiTableEntry][uiMaxSelector] - + s_modifierTable8[uiTableEntry][uiMinSelector]; + + float fCenterRatio = fTableEntryCenter / (float)fTableEntryRange; + float fCenter = m_redMin + fCenterRatio * fRedRange; + int fCenterInt = (int)roundf((255.0f/2047.0f) * fCenter); + + // base of 0 to 255 maps to 0 to 2047 + // search a radius of values off center of range + int fMinBase = fCenterInt - a_fBaseRadius; + int fMaxBase = fCenterInt + a_fBaseRadius; + if (fMinBase < 0) + { + fMinBase = 0; + } + if (fMaxBase > 255) + { + fMaxBase = 255; + } + + // 255 / up to 29 + int fRangeMultiplier = (int)roundf((fRedRange * (255.0 / 2047.0f)) / fTableEntryRange); + + int fMinMultiplier = clamp(fRangeMultiplier - a_fMultiplierRadius, 0, 15); // yes, 0 + int fMaxMultiplier = clamp(fRangeMultiplier + a_fMultiplierRadius, 1, 15); + + // find best selector for each pixel + uint8_t bestSelectors[PIXELS]; + int bestRedError[PIXELS]; + + // only for debug + //int bestPixelRed[PIXELS]; + + // up to 3 of these + for (int fBase = fMinBase; fBase <= fMaxBase; fBase++) + { + int baseForDecode = fBase * 8 + 4; + + // up to 16 of these + for (int fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier++) + { + int multiplierForDecode = (fMultiplier == 0) ? 1 : (8 * fMultiplier); + + // 16 of these + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int bestPixelError = 2047 * 2047; + + // 8 of these + for (int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) + { + int modifier = s_modifierTable8[uiTableEntry][uiSelector]; + + int fPixelRed = DecodePixelRedInt(baseForDecode, multiplierForDecode, modifier); + + int error = fPixelRed - (int)m_srcPixels[uiPixel]; + error *= error; + + // this is guaranteed to pick one selector for every pixel + // the one with the lowest error. 
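// Illustrative sketch (not part of this patch): the candidate search above in
// isolation. For one (base, multiplier, table-row) candidate it decodes every
// selector with the same EAC math as DecodePixelRedInt -- base*8+4 plus
// modifier*multiplier, where a multiplier of 0 acts as 1 and hardware clamps
// to 0..2047 -- and keeps the lowest-error selector per pixel. Function and
// parameter names here are hypothetical.
#include <climits>
#include <cstdint>

static inline int decodeR11Texel(int baseMul8Plus4, int multiplierForDecode, int modifier)
{
    int pixel = baseMul8Plus4 + modifier * multiplierForDecode;
    return (pixel < 0) ? 0 : (pixel > 2047) ? 2047 : pixel; // hw clamps to the 11-bit range
}

// Returns the summed squared error for this candidate and writes the chosen
// 3-bit selector for each of the 16 pixels.
static int scoreCandidate(const int16_t srcPixels[16], const int8_t modifierRow[8],
                          int base, int multiplier, uint8_t outSelectors[16])
{
    int baseForDecode = base * 8 + 4;
    int multiplierForDecode = (multiplier == 0) ? 1 : (8 * multiplier);

    int blockError = 0;
    for (int p = 0; p < 16; ++p)
    {
        int bestError = INT_MAX;
        for (int s = 0; s < 8; ++s)
        {
            int err = decodeR11Texel(baseForDecode, multiplierForDecode, modifierRow[s]) - srcPixels[p];
            err *= err;
            if (err < bestError)
            {
                bestError = err;
                outSelectors[p] = (uint8_t)s;
            }
        }
        blockError += bestError;
    }
    return blockError;
}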
+ if (error < bestPixelError) + { + bestPixelError = error; + bestRedError[uiPixel] = error; + bestSelectors[uiPixel] = uiSelector; + + //bestPixelRed[uiPixel] = fPixelRed; + } + } + } + + // accumulate all best pixel error into block error total + int blockError = 0; + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + blockError += bestRedError[uiPixel]; + } + + // pick that if it's the smallest error + if (m_fError > (float)blockError) + { + m_fError = (float)blockError; + + if (m_isSnorm) + { + m_redBase = fBase - 128; + } + else + { + m_redBase = fBase; + } + m_redMultiplier = fMultiplier; + m_redModifierTableIndex = uiTableEntry; + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_redSelectors[uiPixel] = bestSelectors[uiPixel]; + + // nothing looks at this data, but useful to compare to source + //m_decodedPixels[uiPixel] = bestPixelRed[uiPixel]; // / 2047.0f; + } + + + // compare to tolerance, since reaching 0 is difficult in float + if (m_fError <= kErrorTolerance) + { + return; + } + } + } + } + + } + } + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state + // + void Block4x4Encoding_R11::SetEncodingBits(void) + { + // skip encode if block is already done + if (m_pencodingbitsR11 == nullptr) + { + return; + } + + if (m_isSnorm) + { + m_pencodingbitsR11->data.base = (int8_t)m_redBase; + } + else + { + m_pencodingbitsR11->data.base = (uint8_t)m_redBase; + } + m_pencodingbitsR11->data.table = m_redModifierTableIndex; + m_pencodingbitsR11->data.multiplier = m_redMultiplier; + + uint64_t selectorBits = 0; + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + uint64_t uiShift = 45 - (3 * uiPixel); + selectorBits |= ((uint64_t)m_redSelectors[uiPixel]) << uiShift; + } + + m_pencodingbitsR11->data.selectors0 = uint32_t(selectorBits >> (uint64_t)40); + m_pencodingbitsR11->data.selectors1 = uint32_t(selectorBits >> (uint64_t)32); + m_pencodingbitsR11->data.selectors2 = uint32_t(selectorBits >> (uint64_t)24); + m_pencodingbitsR11->data.selectors3 = uint32_t(selectorBits >> (uint64_t)16); + m_pencodingbitsR11->data.selectors4 = uint32_t(selectorBits >> (uint64_t)8); + m_pencodingbitsR11->data.selectors5 = uint32_t(selectorBits); + } + + // ---------------------------------------------------------------------------------------------------- + // +} diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_R11.h b/libkram/etc2comp/EtcBlock4x4Encoding_R11.h index 5c175d9e..31c1a21c 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_R11.h +++ b/libkram/etc2comp/EtcBlock4x4Encoding_R11.h @@ -1,136 +1,136 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -//#include "EtcBlock4x4Encoding_RGB8.h" - -namespace Etc -{ - class Block4x4EncodingBits_R11; - - // ################################################################################ - // Block4x4Encoding_R11 - // ################################################################################ - - // Simpler interface for R11 and RG11 without all the code/data from Block4x4. - class IBlockEncoding - { - public: - virtual ~IBlockEncoding() {} - - // setup block for encoding iteration, isDone() true when finished - virtual void Encode( - const float *sourcePixels,uint8_t *encodingBits, bool isSnorm) = 0; - - // this is for decoding a block in multipass - virtual void Decode( - uint8_t *encodingBits, const float *sourcePixels, bool isSnorm, - uint16_t lastIterationCount - ) = 0; - - // for decoding a block for display or conversion - virtual void DecodeOnly( - const uint8_t *encodingBits, float *decodedPixels, bool isSnorm) = 0; - - // iterate to reduce the error - virtual void PerformIteration(float a_fEffort) = 0; - - // write out block - virtual void SetEncodingBits(void) = 0; - - // when error is zero, or effort level also limits iteration - virtual bool IsDone() const = 0; - - virtual uint16_t GetIterationCount() const = 0; - - virtual float GetError() const = 0; - }; - - class Block4x4Encoding_R11 : public IBlockEncoding - { - public: - - Block4x4Encoding_R11(void); - virtual ~Block4x4Encoding_R11(void); - - // setup block for encoding iteration, isDone() true when finished - virtual void Encode( - const float *sourcePixels, uint8_t *encodingBits, bool isSnorm) override; - - // this is for decoding a block in multipass - virtual void Decode( - uint8_t *encodingBits, const float *sourcePixels, bool isSnorm, - uint16_t lastIterationCount) override; - - // for decoding a block for display or conversion - virtual void DecodeOnly( - const uint8_t *encodingBits, - float *decodedPixels, - bool isSnorm) override; - - virtual void PerformIteration(float a_fEffort) override; - - virtual void SetEncodingBits(void) override; - - virtual bool IsDone() const override { return m_isDone; } - - // done bit embedded into high bit of each 8-bit count - virtual uint16_t GetIterationCount() const override - { - uint16_t count = m_uiEncodingIterations; - if (m_isDone) - { - count |= 0x80; // done high bit - } - return count; - } - - virtual float GetError() const override { return m_fError; } - - private: - void CalculateR11(unsigned int a_uiSelectorsUsed, - int a_fBaseRadius, int a_fMultiplierRadius); - - Block4x4EncodingBits_R11 *m_pencodingbitsR11; - - //float m_fRedBlockError; - - static const int PIXELS = 16; // 4 * 4 - - // adding data for block reuse (only set on first iteration) - int16_t m_srcPixels[PIXELS]; - int16_t m_redMin; - int16_t m_redMax; - - // this can all be encoded/decoded from the EAC block - int16_t m_redBase; - int16_t m_redMultiplier; - uint8_t m_redSelectors[PIXELS]; - uint8_t m_redModifierTableIndex; - - bool m_isDone; - bool m_isSnorm; // shifts fBase by 128 - - // this is only data needed to reiterate, can decode and build up rest - uint8_t m_uiEncodingIterations; - float m_fError; // 22-bits + 4-bits = 26 bits - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +//#include "EtcBlock4x4Encoding_RGB8.h" + +namespace Etc +{ + class Block4x4EncodingBits_R11; + + // ################################################################################ + // Block4x4Encoding_R11 + // ################################################################################ + + // Simpler interface for R11 and RG11 without all the code/data from Block4x4. + class IBlockEncoding + { + public: + virtual ~IBlockEncoding() {} + + // setup block for encoding iteration, isDone() true when finished + virtual void Encode( + const float *sourcePixels,uint8_t *encodingBits, bool isSnorm) = 0; + + // this is for decoding a block in multipass + virtual void Decode( + uint8_t *encodingBits, const float *sourcePixels, bool isSnorm, + uint16_t lastIterationCount + ) = 0; + + // for decoding a block for display or conversion + virtual void DecodeOnly( + const uint8_t *encodingBits, float *decodedPixels, bool isSnorm) = 0; + + // iterate to reduce the error + virtual void PerformIteration(float a_fEffort) = 0; + + // write out block + virtual void SetEncodingBits(void) = 0; + + // when error is zero, or effort level also limits iteration + virtual bool IsDone() const = 0; + + virtual uint16_t GetIterationCount() const = 0; + + virtual float GetError() const = 0; + }; + + class Block4x4Encoding_R11 : public IBlockEncoding + { + public: + + Block4x4Encoding_R11(void); + virtual ~Block4x4Encoding_R11(void); + + // setup block for encoding iteration, isDone() true when finished + virtual void Encode( + const float *sourcePixels, uint8_t *encodingBits, bool isSnorm) override; + + // this is for decoding a block in multipass + virtual void Decode( + uint8_t *encodingBits, const float *sourcePixels, bool isSnorm, + uint16_t lastIterationCount) override; + + // for decoding a block for display or conversion + virtual void DecodeOnly( + const uint8_t *encodingBits, + float *decodedPixels, + bool isSnorm) override; + + virtual void PerformIteration(float a_fEffort) override; + + virtual void SetEncodingBits(void) override; + + virtual bool IsDone() const override { return m_isDone; } + + // done bit embedded into high bit of each 8-bit count + virtual uint16_t GetIterationCount() const override + { + uint16_t count = m_uiEncodingIterations; + if (m_isDone) + { + count |= 0x80; // done high bit + } + return count; + } + + virtual float GetError() const override { return m_fError; } + + private: + void CalculateR11(unsigned int a_uiSelectorsUsed, + int a_fBaseRadius, int a_fMultiplierRadius); + + Block4x4EncodingBits_R11 *m_pencodingbitsR11; + + //float m_fRedBlockError; + + static const int PIXELS = 16; // 4 * 4 + + // adding data for block reuse (only set on first iteration) + int16_t m_srcPixels[PIXELS]; + int16_t m_redMin; + int16_t m_redMax; + + // this can all be encoded/decoded from the EAC block + int16_t m_redBase; + int16_t m_redMultiplier; + uint8_t m_redSelectors[PIXELS]; + uint8_t m_redModifierTableIndex; + + bool m_isDone; + bool m_isSnorm; // shifts fBase by 128 + + // this is only data needed to reiterate, can decode and build up rest + 
uint8_t m_uiEncodingIterations; + float m_fError; // 22-bits + 4-bits = 26 bits + }; + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RG11.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_RG11.cpp index 960f083a..ee2e8569 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RG11.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RG11.cpp @@ -1,68 +1,68 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RG11.cpp - -Block4x4Encoding_RG11 is the encoder to use when targetting file format RG11 and SRG11 (signed RG11). - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RG11.h" - -namespace Etc -{ - Block4x4Encoding_RG11::Block4x4Encoding_RG11(void) - { - } - - Block4x4Encoding_RG11::~Block4x4Encoding_RG11(void) {} - - void Block4x4Encoding_RG11::Encode( - const float *sourcePixels, uint8_t *encodingBits, bool isSnorm) - { - m_red.Encode(sourcePixels + 0, encodingBits, isSnorm); - m_green.Encode(sourcePixels + 1, encodingBits + 8, isSnorm); - } - - void Block4x4Encoding_RG11::Decode( - unsigned char *encodingBits, const float *sourcePixels, bool isSnorm, - uint16_t lastIteration) - { - m_red.Decode(encodingBits, sourcePixels, isSnorm, (lastIteration >> 0) & 0xFF); - m_green.Decode(encodingBits + 8, sourcePixels + 1, isSnorm, (lastIteration >> 8) & 0xFF); - } - - void Block4x4Encoding_RG11::DecodeOnly( - const uint8_t *encodingBits, float *decodedPixels, bool isSnorm) - { - m_red.DecodeOnly(encodingBits, decodedPixels, isSnorm); - m_green.DecodeOnly(encodingBits + 8, decodedPixels + 1, isSnorm); - } - - void Block4x4Encoding_RG11::PerformIteration(float a_fEffort) - { - m_red.PerformIteration(a_fEffort); - m_green.PerformIteration(a_fEffort); - } - - void Block4x4Encoding_RG11::SetEncodingBits(void) - { - m_red.SetEncodingBits(); - m_green.SetEncodingBits(); - } -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcBlock4x4Encoding_RG11.cpp + +Block4x4Encoding_RG11 is the encoder to use when targetting file format RG11 and SRG11 (signed RG11). 
+ +*/ + +#include "EtcConfig.h" +#include "EtcBlock4x4Encoding_RG11.h" + +namespace Etc +{ + Block4x4Encoding_RG11::Block4x4Encoding_RG11(void) + { + } + + Block4x4Encoding_RG11::~Block4x4Encoding_RG11(void) {} + + void Block4x4Encoding_RG11::Encode( + const float *sourcePixels, uint8_t *encodingBits, bool isSnorm) + { + m_red.Encode(sourcePixels + 0, encodingBits, isSnorm); + m_green.Encode(sourcePixels + 1, encodingBits + 8, isSnorm); + } + + void Block4x4Encoding_RG11::Decode( + unsigned char *encodingBits, const float *sourcePixels, bool isSnorm, + uint16_t lastIteration) + { + m_red.Decode(encodingBits, sourcePixels, isSnorm, (lastIteration >> 0) & 0xFF); + m_green.Decode(encodingBits + 8, sourcePixels + 1, isSnorm, (lastIteration >> 8) & 0xFF); + } + + void Block4x4Encoding_RG11::DecodeOnly( + const uint8_t *encodingBits, float *decodedPixels, bool isSnorm) + { + m_red.DecodeOnly(encodingBits, decodedPixels, isSnorm); + m_green.DecodeOnly(encodingBits + 8, decodedPixels + 1, isSnorm); + } + + void Block4x4Encoding_RG11::PerformIteration(float a_fEffort) + { + m_red.PerformIteration(a_fEffort); + m_green.PerformIteration(a_fEffort); + } + + void Block4x4Encoding_RG11::SetEncodingBits(void) + { + m_red.SetEncodingBits(); + m_green.SetEncodingBits(); + } +} diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RG11.h b/libkram/etc2comp/EtcBlock4x4Encoding_RG11.h index 71ed4b9a..eca31b0d 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RG11.h +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RG11.h @@ -1,66 +1,66 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -//#include "EtcBlock4x4Encoding_RGB8.h" -#include "EtcBlock4x4Encoding_R11.h" - -namespace Etc -{ - class Block4x4EncodingBits_RG11; - - // ################################################################################ - // Block4x4Encoding_RG11 - // ################################################################################ - - class Block4x4Encoding_RG11 : public IBlockEncoding - { - public: - - Block4x4Encoding_RG11(void); - virtual ~Block4x4Encoding_RG11(void); - - virtual void Encode( - const float *sourcePixels, uint8_t *encodingBits, bool isSnorm) override; - - virtual void Decode( - uint8_t *encodingBits, const float *sourcePixels, bool isSnorm, uint16_t lastIteration) override; - - virtual void DecodeOnly( - const uint8_t *encodingBits, float *decodedPixels, bool isSnorm) override; - - virtual void PerformIteration(float a_fEffort) override; - - virtual void SetEncodingBits() override; - - virtual bool IsDone() const override { return m_red.IsDone() && m_green.IsDone(); } - - // done bit embedded into high bit of each 8-bit count - // because r and g can be done independently, and with multipass need to skip iteration, though decode/re-encode will occur - virtual uint16_t GetIterationCount() const override { return m_red.GetIterationCount() + (m_green.GetIterationCount() << 8); } - - virtual float GetError() const override { return m_red.GetError() + m_green.GetError(); } - - private: - Block4x4Encoding_R11 m_red; - Block4x4Encoding_R11 m_green; - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +//#include "EtcBlock4x4Encoding_RGB8.h" +#include "EtcBlock4x4Encoding_R11.h" + +namespace Etc +{ + class Block4x4EncodingBits_RG11; + + // ################################################################################ + // Block4x4Encoding_RG11 + // ################################################################################ + + class Block4x4Encoding_RG11 : public IBlockEncoding + { + public: + + Block4x4Encoding_RG11(void); + virtual ~Block4x4Encoding_RG11(void); + + virtual void Encode( + const float *sourcePixels, uint8_t *encodingBits, bool isSnorm) override; + + virtual void Decode( + uint8_t *encodingBits, const float *sourcePixels, bool isSnorm, uint16_t lastIteration) override; + + virtual void DecodeOnly( + const uint8_t *encodingBits, float *decodedPixels, bool isSnorm) override; + + virtual void PerformIteration(float a_fEffort) override; + + virtual void SetEncodingBits() override; + + virtual bool IsDone() const override { return m_red.IsDone() && m_green.IsDone(); } + + // done bit embedded into high bit of each 8-bit count + // because r and g can be done independently, and with multipass need to skip iteration, though decode/re-encode will occur + virtual uint16_t GetIterationCount() const override { return m_red.GetIterationCount() + (m_green.GetIterationCount() << 8); } + + virtual float GetError() const override { return m_red.GetError() + m_green.GetError(); } + + private: + Block4x4Encoding_R11 m_red; + Block4x4Encoding_R11 m_green; + }; + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp index fe593a26..3d6786cf 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp @@ -1,1801 +1,1801 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RGB8.cpp - -Block4x4Encoding_RGB8 is the encoder to use for the ETC2 extensions when targetting file format RGB8. -This encoder is also used for the ETC2 subset of file format RGBA8. - -Block4x4Encoding_ETC1 encodes the ETC1 subset of RGB8. - -*/ - -// TODO: add isGray opimizations where rgb are iterated once for a single radius -// instead of as individual channels. 
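// Illustrative sketch (not part of this patch): the TryT/TryH loops later in
// this file skip candidates whose r, g and b differ once a block is known to
// be gray, so the three channel radii collapse to one, which is what the TODO
// above is pointing at. One simple way to detect such a block from the float
// source, assuming 4 floats per pixel; the function name and epsilon are
// hypothetical.
#include <cmath>

static bool isGrayBlock(const float* sourcePixels, int pixelCount = 16, float eps = 1.0f / 255.0f)
{
    for (int p = 0; p < pixelCount; ++p)
    {
        const float* rgba = sourcePixels + 4 * p;
        if (std::fabs(rgba[0] - rgba[1]) > eps || std::fabs(rgba[0] - rgba[2]) > eps)
        {
            return false; // found a pixel with meaningful chroma
        }
    }
    return true;
}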
- -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RGB8.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" -#include "EtcMath.h" - -#include -#include -#include -#include -#include - -namespace Etc -{ - float Block4x4Encoding_RGB8::s_afTHDistanceTable[TH_DISTANCES] = - { - 3.0f / 255.0f, - 6.0f / 255.0f, - 11.0f / 255.0f, - 16.0f / 255.0f, - 23.0f / 255.0f, - 32.0f / 255.0f, - 41.0f / 255.0f, - 64.0f / 255.0f - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RGB8::Block4x4Encoding_RGB8(void) - { - - m_pencodingbitsRGB8 = nullptr; - - } - - Block4x4Encoding_RGB8::~Block4x4Encoding_RGB8(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RGB8::Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount ) - { - - // handle ETC1 modes - Block4x4Encoding_ETC1::Decode(a_pblockParent, - a_paucEncodingBits, a_pafrgbaSource,a_errormetric, iterationCount); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - // detect if there is a T, H or Planar mode present - if (m_pencodingbitsRGB8->differential.diff) - { - int iRed1 = (int)m_pencodingbitsRGB8->differential.red1; - int iDRed2 = m_pencodingbitsRGB8->differential.dred2; - int iRed2 = iRed1 + iDRed2; - - int iGreen1 = (int)m_pencodingbitsRGB8->differential.green1; - int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2; - int iGreen2 = iGreen1 + iDGreen2; - - int iBlue1 = (int)m_pencodingbitsRGB8->differential.blue1; - int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2; - int iBlue2 = iBlue1 + iDBlue2; - - if (iRed2 < 0 || iRed2 > 31) - { - InitFromEncodingBits_T(); - } - else if (iGreen2 < 0 || iGreen2 > 31) - { - InitFromEncodingBits_H(); - } - else if (iBlue2 < 0 || iBlue2 > 31) - { - InitFromEncodingBits_Planar(); - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if T mode is detected - // - void Block4x4Encoding_RGB8::InitFromEncodingBits_T(void) - { - - m_mode = MODE_T; - - unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) + - m_pencodingbitsRGB8->t.red1b); - unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1; - unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1; - - unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2; - unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2; - unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db; - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_T(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the 
encoding bits of a previous encoding if H mode is detected - // - void Block4x4Encoding_RGB8::InitFromEncodingBits_H(void) - { - - m_mode = MODE_H; - - unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1; - unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) + - m_pencodingbitsRGB8->h.green1b); - unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) + - (m_pencodingbitsRGB8->h.blue1b << 1) + - m_pencodingbitsRGB8->h.blue1c); - - unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2; - unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) + - m_pencodingbitsRGB8->h.green2b); - unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - // used to determine the LSB of the CW - unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1); - unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1); - if (uiRGB1 >= uiRGB2) - { - m_uiCW1++; - } - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_H(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if Planar mode is detected - // - void Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(void) - { - - m_mode = MODE_PLANAR; - - unsigned char ucOriginRed = m_pencodingbitsRGB8->planar.originRed; - unsigned char ucOriginGreen = (unsigned char)((m_pencodingbitsRGB8->planar.originGreen1 << 6) + - m_pencodingbitsRGB8->planar.originGreen2); - unsigned char ucOriginBlue = (unsigned char)((m_pencodingbitsRGB8->planar.originBlue1 << 5) + - (m_pencodingbitsRGB8->planar.originBlue2 << 3) + - (m_pencodingbitsRGB8->planar.originBlue3 << 1) + - m_pencodingbitsRGB8->planar.originBlue4); - - unsigned char ucHorizRed = (unsigned char)((m_pencodingbitsRGB8->planar.horizRed1 << 1) + - m_pencodingbitsRGB8->planar.horizRed2); - unsigned char ucHorizGreen = m_pencodingbitsRGB8->planar.horizGreen; - unsigned char ucHorizBlue = (unsigned char)((m_pencodingbitsRGB8->planar.horizBlue1 << 5) + - m_pencodingbitsRGB8->planar.horizBlue2); - - unsigned char ucVertRed = (unsigned char)((m_pencodingbitsRGB8->planar.vertRed1 << 3) + - m_pencodingbitsRGB8->planar.vertRed2); - unsigned char ucVertGreen = (unsigned char)((m_pencodingbitsRGB8->planar.vertGreen1 << 2) + - m_pencodingbitsRGB8->planar.vertGreen2); - unsigned char ucVertBlue = m_pencodingbitsRGB8->planar.vertBlue; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromR6G7B6(ucOriginRed, ucOriginGreen, ucOriginBlue); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromR6G7B6(ucHorizRed, ucHorizGreen, ucHorizBlue); - m_frgbaColor3 = ColorFloatRGBA::ConvertFromR6G7B6(ucVertRed, ucVertGreen, ucVertBlue); - - DecodePixels_Planar(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGB8::PerformIteration(float a_fEffort) - { - 
assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - Block4x4Encoding_ETC1::PerformFirstIteration(); - if (m_boolDone) - { - break; - } - - TryPlanar(0); - SetDoneIfPerfect(); - if (m_boolDone) - { - break; - } - - TryTAndH(0); - break; - - case 1: - Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - Block4x4Encoding_ETC1::TryIndividual(m_boolMostLikelyFlip, 1); - break; - - case 3: - Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 4: - Block4x4Encoding_ETC1::TryIndividual(!m_boolMostLikelyFlip, 1); - break; - - case 5: - TryPlanar(1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 6: - TryTAndH(1); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 7: - Block4x4Encoding_ETC1::TryDegenerates1(); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 8: - Block4x4Encoding_ETC1::TryDegenerates2(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 9: - Block4x4Encoding_ETC1::TryDegenerates3(); - if (a_fEffort <= 89.5f) - { - m_boolDone = true; - } - break; - - case 10: - Block4x4Encoding_ETC1::TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in Planar mode - // save this encoding if it improves the error - // - void Block4x4Encoding_RGB8::TryPlanar(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - encodingTry.CalculatePlanarCornerColors(); - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (a_uiRadius > 0) - { - encodingTry.TwiddlePlanar(); - } - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode or H mode - // save this encoding if it improves the error - // - void Block4x4Encoding_RGB8::TryTAndH(unsigned int a_uiRadius) - { - - CalculateBaseColorsForTAndH(); - - TryT(a_uiRadius); - - if (!IsDone()) - { - TryH(a_uiRadius); - } - } - - // ---------------------------------------------------------------------------------------------------- - // calculate original values for base colors - // store them in m_frgbaOriginalColor1 and m_frgbaOriginalColor2 - // - void Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(void) - { - - //bool boolRGBX = m_pblockParent->GetImageSource()->GetErrorMetric() == ErrorMetric::RGBX; - - ColorFloatRGBA frgbaBlockAverage = (m_frgbaSourceAverageLeft + m_frgbaSourceAverageRight) * 0.5f; - - // find pixel farthest from average gray line - unsigned int uiFarthestPixel = 0; - float fFarthestGrayDistance2 = 0.0f; - //unsigned int uiTransparentPixels = 0; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // don't count transparent -// if 
(m_pafrgbaSource[uiPixel].fA == 0.0f && !boolRGBX) -// { -// uiTransparentPixels++; -// } -// else - { - float fGrayDistance2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], frgbaBlockAverage); - - if (fGrayDistance2 > fFarthestGrayDistance2) - { - uiFarthestPixel = uiPixel; - fFarthestGrayDistance2 = fGrayDistance2; - } - } - } - // a transparent block should not reach this method - //assert(uiTransparentPixels < PIXELS); - - // set the original base colors to: - // half way to the farthest pixel and - // the mirror color on the other side of the average - ColorFloatRGBA frgbaOffset = (m_pafrgbaSource[uiFarthestPixel] - frgbaBlockAverage) * 0.5f; - m_frgbaOriginalColor1_TAndH = (frgbaBlockAverage + frgbaOffset).QuantizeR4G4B4(); - m_frgbaOriginalColor2_TAndH = (frgbaBlockAverage - frgbaOffset).ClampRGB().QuantizeR4G4B4(); // the "other side" might be out of range - - // move base colors to find best fit - for (unsigned int uiIteration = 0; uiIteration < 10; uiIteration++) - { - // find the center of pixels closest to each color - float fPixelsCloserToColor1 = 0.0f; - ColorFloatRGBA frgbSumPixelsCloserToColor1; - float fPixelsCloserToColor2 = 0.0f; - ColorFloatRGBA frgbSumPixelsCloserToColor2; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // don't count transparent pixels - // Can't do this, use premul to weight the colors before they are encoded - float alpha = 1.0f; // m_pafrgbaSource[uiPixel].fA; -// if (alpha == 0.0f) -// { -// continue; -// } - - float fGrayDistance2ToColor1 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor1_TAndH); - float fGrayDistance2ToColor2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor2_TAndH); - - ColorFloatRGBA frgbaAlphaWeightedSource = m_pafrgbaSource[uiPixel] * alpha; - frgbaAlphaWeightedSource.fA = 1.0f; - - if (fGrayDistance2ToColor1 <= fGrayDistance2ToColor2) - { - fPixelsCloserToColor1 += alpha; - frgbSumPixelsCloserToColor1 = frgbSumPixelsCloserToColor1 + frgbaAlphaWeightedSource; - } - else - { - fPixelsCloserToColor2 += alpha; - frgbSumPixelsCloserToColor2 = frgbSumPixelsCloserToColor2 + frgbaAlphaWeightedSource; - } - } - if (fPixelsCloserToColor1 == 0.0f || fPixelsCloserToColor2 == 0.0f) - { - break; - } - - // this doesn't scale alpha - ColorFloatRGBA frgbAvgColor1Pixels = (frgbSumPixelsCloserToColor1 * (1.0f / fPixelsCloserToColor1)).QuantizeR4G4B4(); - ColorFloatRGBA frgbAvgColor2Pixels = (frgbSumPixelsCloserToColor2 * (1.0f / fPixelsCloserToColor2)).QuantizeR4G4B4(); - - frgbAvgColor1Pixels.fA = 1.0f; - frgbAvgColor2Pixels.fA = 1.0f; - - if (frgbAvgColor1Pixels.fR == m_frgbaOriginalColor1_TAndH.fR && - frgbAvgColor1Pixels.fG == m_frgbaOriginalColor1_TAndH.fG && - frgbAvgColor1Pixels.fB == m_frgbaOriginalColor1_TAndH.fB && - - frgbAvgColor2Pixels.fR == m_frgbaOriginalColor2_TAndH.fR && - frgbAvgColor2Pixels.fG == m_frgbaOriginalColor2_TAndH.fG && - frgbAvgColor2Pixels.fB == m_frgbaOriginalColor2_TAndH.fB) - { - break; - } - - m_frgbaOriginalColor1_TAndH = frgbAvgColor1Pixels; - m_frgbaOriginalColor2_TAndH = frgbAvgColor2Pixels; - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode - // save this encoding if it improves the error - // - // since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently - // better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower - // - void Block4x4Encoding_RGB8::TryT(unsigned int 
a_uiRadius) - { - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_T; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - - if (iMaxRed1 > 15) - { - iMaxRed1 = 15; - } - if (iMaxGreen1 > 15) - { - iMaxGreen1 = 15; - } - if (iMaxBlue1 > 15) - { - iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - - if (iMaxRed2 > 15) - { - iMaxRed2 = 15; - } - if (iMaxGreen2 > 15) - { - iMaxGreen2 = 15; - } - if (iMaxBlue2 > 15) - { - iMaxBlue2 = 15; - } - - bool isGray = m_errormetric == GRAY || !m_pblockParent->HasColorPixels(); - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor2_TAndH - // twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector - // - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - if (isGray && (iRed2 != iGreen2 || iRed2 != iBlue2)) - { - continue; - } - - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - } - else - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH; - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - SetDoneIfPerfect(); - if (IsDone()) - { - return; - } - } 
- } - } - } - } - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - if (isGray && (iRed1 != iGreen1 || iRed1 != iBlue1)) - { - continue; - } - - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - } - else - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - SetDoneIfPerfect(); - if (IsDone()) - { - return; - } - } - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryT - // called on an encodingTry - // - void Block4x4Encoding_RGB8::TryT_BestSelectorCombination(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = m_frgbaColor1; - afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = m_frgbaColor2; - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - // try each selector - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], - uiPixel); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (fBlockError < m_fError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode - // save this encoding if it improves the error - // - // since all pixels 
use the distance table, color1 and color2 can NOT be twiddled independently - // TWIDDLE_RADIUS of 2 is WAY too slow - // - void Block4x4Encoding_RGB8::TryH(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_H; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - if (iMaxRed1 > 15) - { - iMaxRed1 = 15; - } - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - if (iMaxGreen1 > 15) - { - iMaxGreen1 = 15; - } - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - if (iMaxBlue1 > 15) - { - iMaxBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - if (iMaxRed2 > 15) - { - iMaxRed2 = 15; - } - - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - if (iMaxGreen2 > 15) - { - iMaxGreen2 = 15; - } - - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - if (iMaxBlue2 > 15) - { - iMaxBlue2 = 15; - } - - bool isGray = m_errormetric == GRAY || !m_pblockParent->HasColorPixels(); - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - // gray only iterates red - if (isGray && (iRed1 != iGreen1 || iRed1 != iBlue1)) - { - continue; - } - - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - SetDoneIfPerfect(); - if (IsDone()) - { - return; - } - } - } - } - } - - // twiddle m_frgbaOriginalColor2_TAndH - for (int iRed2 
= iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - // gray only iterates red - if (isGray && (iRed2 != iGreen2 || iRed2 != iBlue2)) - { - continue; - } - - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - SetDoneIfPerfect(); - if (IsDone()) - { - return; - } - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryH - // called on an encodingTry - // - void Block4x4Encoding_RGB8::TryH_BestSelectorCombination(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB(); - afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = (m_frgbaColor2 + fDistance).ClampRGB(); - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - // try each selector - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], - uiPixel); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (m_fError > fBlockError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // use linear regression to find the best fit for colors along the edges of the 4x4 block - // - void Block4x4Encoding_RGB8::CalculatePlanarCornerColors(void) - { - ColorFloatRGBA afrgbaRegression[MAX_PLANAR_REGRESSION_SIZE]; - ColorFloatRGBA frgbaSlope; - 
ColorFloatRGBA frgbaOffset; - - // top edge - afrgbaRegression[0] = m_pafrgbaSource[0]; - afrgbaRegression[1] = m_pafrgbaSource[4]; - afrgbaRegression[2] = m_pafrgbaSource[8]; - afrgbaRegression[3] = m_pafrgbaSource[12]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor1 = frgbaOffset; - m_frgbaColor2 = (frgbaSlope * 4.0f) + frgbaOffset; - - // left edge - afrgbaRegression[0] = m_pafrgbaSource[0]; - afrgbaRegression[1] = m_pafrgbaSource[1]; - afrgbaRegression[2] = m_pafrgbaSource[2]; - afrgbaRegression[3] = m_pafrgbaSource[3]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor1 = (m_frgbaColor1 + frgbaOffset) * 0.5f; // average with top edge - m_frgbaColor3 = (frgbaSlope * 4.0f) + frgbaOffset; - - // right edge - afrgbaRegression[0] = m_pafrgbaSource[12]; - afrgbaRegression[1] = m_pafrgbaSource[13]; - afrgbaRegression[2] = m_pafrgbaSource[14]; - afrgbaRegression[3] = m_pafrgbaSource[15]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor2 = (m_frgbaColor2 + frgbaOffset) * 0.5f; // average with top edge - - // bottom edge - afrgbaRegression[0] = m_pafrgbaSource[3]; - afrgbaRegression[1] = m_pafrgbaSource[7]; - afrgbaRegression[2] = m_pafrgbaSource[11]; - afrgbaRegression[3] = m_pafrgbaSource[15]; - ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); - m_frgbaColor3 = (m_frgbaColor3 + frgbaOffset) * 0.5f; // average with left edge - - // quantize corner colors to 6/7/6 - m_frgbaColor1 = m_frgbaColor1.QuantizeR6G7B6(); - m_frgbaColor2 = m_frgbaColor2.QuantizeR6G7B6(); - m_frgbaColor3 = m_frgbaColor3.QuantizeR6G7B6(); - - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing R, G and B independently - // - // R, G and B decoding and errors are independent, so R, G and B twiddles can be independent - // - // return true if improvement - // - bool Block4x4Encoding_RGB8::TwiddlePlanar(void) - { - bool boolImprovement = false; - bool isGray = m_errormetric == GRAY || !m_pblockParent->HasColorPixels(); - - while (TwiddlePlanarR()) - { - boolImprovement = true; - } - - if (!isGray) { - while (TwiddlePlanarG()) - { - boolImprovement = true; - } - - while (TwiddlePlanarB()) - { - boolImprovement = true; - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing R - // - bool Block4x4Encoding_RGB8::TwiddlePlanarR() - { - bool boolImprovement = false; - - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - int iOriginRed = encodingTry.m_frgbaColor1.IntRed(63.0f); - int iHorizRed = encodingTry.m_frgbaColor2.IntRed(63.0f); - int iVertRed = encodingTry.m_frgbaColor3.IntRed(63.0f); - - for (int iTryOriginRed = iOriginRed - 1; iTryOriginRed <= iOriginRed + 1; iTryOriginRed++) - { - // check for out of range - if (iTryOriginRed < 0 || iTryOriginRed > 63) - { - continue; - } - - encodingTry.m_frgbaColor1.fR = ((iTryOriginRed << 2) + (iTryOriginRed >> 4)) / 255.0f; - - for (int iTryHorizRed = iHorizRed - 1; iTryHorizRed <= iHorizRed + 1; iTryHorizRed++) - { - // check for out of range - if (iTryHorizRed < 0 || iTryHorizRed > 63) - { - continue; - } - - encodingTry.m_frgbaColor2.fR = ((iTryHorizRed << 2) + (iTryHorizRed >> 4)) / 255.0f; 
- - for (int iTryVertRed = iVertRed - 1; iTryVertRed <= iVertRed + 1; iTryVertRed++) - { - // check for out of range - if (iTryVertRed < 0 || iTryVertRed > 63) - { - continue; - } - - // don't bother with null twiddle - if (iTryOriginRed == iOriginRed && iTryHorizRed == iHorizRed && iTryVertRed == iVertRed) - { - continue; - } - - encodingTry.m_frgbaColor3.fR = ((iTryVertRed << 2) + (iTryVertRed >> 4)) / 255.0f; - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - boolImprovement = true; - } - } - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing G - // - bool Block4x4Encoding_RGB8::TwiddlePlanarG() - { - bool boolImprovement = false; - - Block4x4Encoding_RGB8 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - int iOriginGreen = encodingTry.m_frgbaColor1.IntGreen(127.0f); - int iHorizGreen = encodingTry.m_frgbaColor2.IntGreen(127.0f); - int iVertGreen = encodingTry.m_frgbaColor3.IntGreen(127.0f); - - for (int iTryOriginGreen = iOriginGreen - 1; iTryOriginGreen <= iOriginGreen + 1; iTryOriginGreen++) - { - // check for out of range - if (iTryOriginGreen < 0 || iTryOriginGreen > 127) - { - continue; - } - - encodingTry.m_frgbaColor1.fG = ((iTryOriginGreen << 1) + (iTryOriginGreen >> 6)) / 255.0f; - - for (int iTryHorizGreen = iHorizGreen - 1; iTryHorizGreen <= iHorizGreen + 1; iTryHorizGreen++) - { - // check for out of range - if (iTryHorizGreen < 0 || iTryHorizGreen > 127) - { - continue; - } - - encodingTry.m_frgbaColor2.fG = ((iTryHorizGreen << 1) + (iTryHorizGreen >> 6)) / 255.0f; - - for (int iTryVertGreen = iVertGreen - 1; iTryVertGreen <= iVertGreen + 1; iTryVertGreen++) - { - // check for out of range - if (iTryVertGreen < 0 || iTryVertGreen > 127) - { - continue; - } - - // don't bother with null twiddle - if (iTryOriginGreen == iOriginGreen && - iTryHorizGreen == iHorizGreen && - iTryVertGreen == iVertGreen) - { - continue; - } - - encodingTry.m_frgbaColor3.fG = ((iTryVertGreen << 1) + (iTryVertGreen >> 6)) / 255.0f; - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - boolImprovement = true; - } - } - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // try different corner colors by slightly changing B - // - bool Block4x4Encoding_RGB8::TwiddlePlanarB() - { - bool boolImprovement = false; - - Block4x4Encoding_RGB8 encodingTry = *this; 
- - // init "try" - { - encodingTry.m_mode = MODE_PLANAR; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - } - - int iOriginBlue = encodingTry.m_frgbaColor1.IntBlue(63.0f); - int iHorizBlue = encodingTry.m_frgbaColor2.IntBlue(63.0f); - int iVertBlue = encodingTry.m_frgbaColor3.IntBlue(63.0f); - - for (int iTryOriginBlue = iOriginBlue - 1; iTryOriginBlue <= iOriginBlue + 1; iTryOriginBlue++) - { - // check for out of range - if (iTryOriginBlue < 0 || iTryOriginBlue > 63) - { - continue; - } - - encodingTry.m_frgbaColor1.fB = ((iTryOriginBlue << 2) + (iTryOriginBlue >> 4)) / 255.0f; - - for (int iTryHorizBlue = iHorizBlue - 1; iTryHorizBlue <= iHorizBlue + 1; iTryHorizBlue++) - { - // check for out of range - if (iTryHorizBlue < 0 || iTryHorizBlue > 63) - { - continue; - } - - encodingTry.m_frgbaColor2.fB = ((iTryHorizBlue << 2) + (iTryHorizBlue >> 4)) / 255.0f; - - for (int iTryVertBlue = iVertBlue - 1; iTryVertBlue <= iVertBlue + 1; iTryVertBlue++) - { - // check for out of range - if (iTryVertBlue < 0 || iTryVertBlue > 63) - { - continue; - } - - // don't bother with null twiddle - if (iTryOriginBlue == iOriginBlue && iTryHorizBlue == iHorizBlue && iTryVertBlue == iVertBlue) - { - continue; - } - - encodingTry.m_frgbaColor3.fB = ((iTryVertBlue << 2) + (iTryVertBlue >> 4)) / 255.0f; - - encodingTry.DecodePixels_Planar(); - - encodingTry.CalcBlockError(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_PLANAR; - m_boolDiff = true; - m_boolFlip = false; - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_frgbaColor3 = encodingTry.m_frgbaColor3; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - - boolImprovement = true; - } - } - } - } - - return boolImprovement; - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGB8::SetEncodingBits(void) - { - - switch (m_mode) - { - case MODE_ETC1: - Block4x4Encoding_ETC1::SetEncodingBits(); - break; - - case MODE_T: - SetEncodingBits_T(); - break; - - case MODE_H: - SetEncodingBits_H(); - break; - - case MODE_PLANAR: - SetEncodingBits_Planar(); - break; - - default: - assert(false); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state for T mode - // - void Block4x4Encoding_RGB8::SetEncodingBits_T(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_T); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2; - m_pencodingbitsRGB8->t.red1b = uiRed1; - m_pencodingbitsRGB8->t.green1 = uiGreen1; - m_pencodingbitsRGB8->t.blue1 = uiBlue1; - - m_pencodingbitsRGB8->t.red2 = uiRed2; - m_pencodingbitsRGB8->t.green2 = uiGreen2; - m_pencodingbitsRGB8->t.blue2 = uiBlue2; - - m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1; - m_pencodingbitsRGB8->t.db = 
m_uiCW1; - - m_pencodingbitsRGB8->t.diff = 1; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - if (iRed2 >= 4) - { - m_pencodingbitsRGB8->t.detect1 = 7; - m_pencodingbitsRGB8->t.detect2 = 0; - } - else - { - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - - // make sure red overflows - assert(iRed2 < 0 || iRed2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state for H mode - // - // colors and selectors may need to swap in order to generate lsb of distance index - // - void Block4x4Encoding_RGB8::SetEncodingBits_H(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_H); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; - unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; - - bool boolOddDistance = m_uiCW1 & 1; - bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance; - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.red1 = uiRed2; - m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen2; - m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue2; - - m_pencodingbitsRGB8->h.red2 = uiRed1; - m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen1; - m_pencodingbitsRGB8->h.blue2 = uiBlue1; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - else - { - m_pencodingbitsRGB8->h.red1 = uiRed1; - m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen1; - m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue1; - - m_pencodingbitsRGB8->h.red2 = uiRed2; - m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen2; - m_pencodingbitsRGB8->h.blue2 = uiBlue2; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - - m_pencodingbitsRGB8->h.diff = 1; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF; - } - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->h.detect1 = 0; - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - if (iRed2 < 0 || iRed2 > 
31) - { - m_pencodingbitsRGB8->h.detect1 = 1; - } - if (iGreen2 >= 4) - { - m_pencodingbitsRGB8->h.detect2 = 7; - m_pencodingbitsRGB8->h.detect3 = 0; - } - else - { - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - - // make sure red doesn't overflow and green does - assert(iRed2 >= 0 && iRed2 <= 31); - assert(iGreen2 < 0 || iGreen2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state for Planar mode - // - void Block4x4Encoding_RGB8::SetEncodingBits_Planar(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_PLANAR); - assert(m_boolDiff == true); - - unsigned int uiOriginRed = (unsigned int)m_frgbaColor1.IntRed(63.0f); - unsigned int uiOriginGreen = (unsigned int)m_frgbaColor1.IntGreen(127.0f); - unsigned int uiOriginBlue = (unsigned int)m_frgbaColor1.IntBlue(63.0f); - - unsigned int uiHorizRed = (unsigned int)m_frgbaColor2.IntRed(63.0f); - unsigned int uiHorizGreen = (unsigned int)m_frgbaColor2.IntGreen(127.0f); - unsigned int uiHorizBlue = (unsigned int)m_frgbaColor2.IntBlue(63.0f); - - unsigned int uiVertRed = (unsigned int)m_frgbaColor3.IntRed(63.0f); - unsigned int uiVertGreen = (unsigned int)m_frgbaColor3.IntGreen(127.0f); - unsigned int uiVertBlue = (unsigned int)m_frgbaColor3.IntBlue(63.0f); - - m_pencodingbitsRGB8->planar.originRed = uiOriginRed; - m_pencodingbitsRGB8->planar.originGreen1 = uiOriginGreen >> 6; - m_pencodingbitsRGB8->planar.originGreen2 = uiOriginGreen; - m_pencodingbitsRGB8->planar.originBlue1 = uiOriginBlue >> 5; - m_pencodingbitsRGB8->planar.originBlue2 = uiOriginBlue >> 3; - m_pencodingbitsRGB8->planar.originBlue3 = uiOriginBlue >> 1; - m_pencodingbitsRGB8->planar.originBlue4 = uiOriginBlue; - - m_pencodingbitsRGB8->planar.horizRed1 = uiHorizRed >> 1; - m_pencodingbitsRGB8->planar.horizRed2 = uiHorizRed; - m_pencodingbitsRGB8->planar.horizGreen = uiHorizGreen; - m_pencodingbitsRGB8->planar.horizBlue1 = uiHorizBlue >> 5; - m_pencodingbitsRGB8->planar.horizBlue2 = uiHorizBlue; - - m_pencodingbitsRGB8->planar.vertRed1 = uiVertRed >> 3; - m_pencodingbitsRGB8->planar.vertRed2 = uiVertRed; - m_pencodingbitsRGB8->planar.vertGreen1 = uiVertGreen >> 2; - m_pencodingbitsRGB8->planar.vertGreen2 = uiVertGreen; - m_pencodingbitsRGB8->planar.vertBlue = uiVertBlue; - - m_pencodingbitsRGB8->planar.diff = 1; - - // create valid RG differentials and an invalid B differential to trigger planar mode - m_pencodingbitsRGB8->planar.detect1 = 0; - m_pencodingbitsRGB8->planar.detect2 = 0; - m_pencodingbitsRGB8->planar.detect3 = 0; - m_pencodingbitsRGB8->planar.detect4 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - int iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2; - if (iRed2 < 0 || iRed2 > 31) - { - m_pencodingbitsRGB8->planar.detect1 = 1; - } - if (iGreen2 < 0 || iGreen2 > 31) - { - m_pencodingbitsRGB8->planar.detect2 = 1; - } - if (iBlue2 >= 4) - { - m_pencodingbitsRGB8->planar.detect3 = 7; - m_pencodingbitsRGB8->planar.detect4 = 0; - } - else - 
{ - m_pencodingbitsRGB8->planar.detect3 = 0; - m_pencodingbitsRGB8->planar.detect4 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2; - - // make sure red and green don't overflow and blue does - assert(iRed2 >= 0 && iRed2 <= 31); - assert(iGreen2 >= 0 && iGreen2 <= 31); - assert(iBlue2 < 0 || iBlue2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state for T mode - // - void Block4x4Encoding_RGB8::DecodePixels_T(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1; - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - break; - - case 2: - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2; - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - break; - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state for H mode - // - void Block4x4Encoding_RGB8::DecodePixels_H(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB(); - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB(); - break; - - case 2: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - break; - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the decoded colors and decoded alpha based on the encoding state for Planar mode - // - void Block4x4Encoding_RGB8::DecodePixels_Planar(void) - { - - int iRO = m_frgbaColor1.IntRed(255.0f); - int iGO = m_frgbaColor1.IntGreen(255.0f); - int iBO = m_frgbaColor1.IntBlue(255.0f); - - int iRH = m_frgbaColor2.IntRed(255.0f); - int iGH = m_frgbaColor2.IntGreen(255.0f); - int iBH = m_frgbaColor2.IntBlue(255.0f); - - int iRV = m_frgbaColor3.IntRed(255.0f); - int iGV = m_frgbaColor3.IntGreen(255.0f); - int iBV = m_frgbaColor3.IntBlue(255.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int iX = (int)(uiPixel >> 2); - int iY = (int)(uiPixel & 3); - - int iR = (iX*(iRH - iRO) + iY*(iRV - iRO) + 4*iRO + 2) >> 2; - int iG = (iX*(iGH - iGO) + iY*(iGV - iGO) + 4*iGO + 2) >> 2; - int iB = (iX*(iBH - iBO) + iY*(iBV - iBO) + 4*iBO + 2) >> 2; - - ColorFloatRGBA frgba; - frgba.fR = (float)iR / 255.0f; - frgba.fG = (float)iG / 255.0f; - frgba.fB = (float)iB / 255.0f; - frgba.fA = 1.0f; - - m_afrgbaDecodedColors[uiPixel] = frgba.ClampRGB(); - } - - } - - // 
----------------------------------------------------------------------------------------------------
-    // perform a linear regression for the a_uiPixels in a_pafrgbaPixels[]
-    //
-    // output the closest color line using a_pfrgbaSlope and a_pfrgbaOffset
-    //
-    void Block4x4Encoding_RGB8::ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels,
-        ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset)
-    {
-        typedef struct
-        {
-            float f[4];
-        } Float4;
-
-        Float4 *paf4Pixels = (Float4 *)(a_pafrgbaPixels);
-        Float4 *pf4Slope = (Float4 *)(a_pfrgbaSlope);
-        Float4 *pf4Offset = (Float4 *)(a_pfrgbaOffset);
-
-        float afX[MAX_PLANAR_REGRESSION_SIZE];
-        float afY[MAX_PLANAR_REGRESSION_SIZE];
-
-        // handle r, g and b separately. don't bother with a
-        for (unsigned int uiComponent = 0; uiComponent < 3; uiComponent++)
-        {
-            for (unsigned int uiPixel = 0; uiPixel < a_uiPixels; uiPixel++)
-            {
-                afX[uiPixel] = (float)uiPixel;
-                afY[uiPixel] = paf4Pixels[uiPixel].f[uiComponent];
-
-            }
-            Etc::Regression(afX, afY, a_uiPixels,
-                &(pf4Slope->f[uiComponent]), &(pf4Offset->f[uiComponent]));
-        }
-
-    }
-
-    // ----------------------------------------------------------------------------------------------------
-    //
-}
+/*
+ * Copyright 2015 The Etc2Comp Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+EtcBlock4x4Encoding_RGB8.cpp
+
+Block4x4Encoding_RGB8 is the encoder to use for the ETC2 extensions when targeting file format RGB8.
+This encoder is also used for the ETC2 subset of file format RGBA8.
+
+Block4x4Encoding_ETC1 encodes the ETC1 subset of RGB8.
+
+*/
+
+// TODO: add isGray optimizations where rgb is iterated once for a single radius
+// instead of as individual channels.
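As a rough, standalone sketch of the idea in that TODO (assuming that for a gray-only block the best base colors also have r == g == b, so the per-channel search loops can collapse into a single gray loop; ForEachGrayBase and its visit callback are illustrative names, not part of this patch):

#include <algorithm>

// Illustrative only: walk one gray axis instead of three nested channel loops.
template <typename Visit>
static void ForEachGrayBase(int baseGray, int radius, Visit visit)
{
    int lo = std::max(baseGray - radius, 0);
    int hi = std::min(baseGray + radius, 15); // RGB4 component range used by T/H mode
    for (int gray = lo; gray <= hi; gray++)
    {
        visit(gray, gray, gray); // r = g = b
    }
}

The TryT/TryH loops later in this file get the same effect today by skipping non-gray combinations with an isGray continue, which still visits the full three-dimensional range.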
+
+#include "EtcConfig.h"
+#include "EtcBlock4x4Encoding_RGB8.h"
+
+#include "EtcBlock4x4EncodingBits.h"
+#include "EtcBlock4x4.h"
+#include "EtcMath.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <float.h>
+#include <limits>
+
+namespace Etc
+{
+    float Block4x4Encoding_RGB8::s_afTHDistanceTable[TH_DISTANCES] =
+    {
+        3.0f / 255.0f,
+        6.0f / 255.0f,
+        11.0f / 255.0f,
+        16.0f / 255.0f,
+        23.0f / 255.0f,
+        32.0f / 255.0f,
+        41.0f / 255.0f,
+        64.0f / 255.0f
+    };
+
+    // ----------------------------------------------------------------------------------------------------
+    //
+    Block4x4Encoding_RGB8::Block4x4Encoding_RGB8(void)
+    {
+
+        m_pencodingbitsRGB8 = nullptr;
+
+    }
+
+    Block4x4Encoding_RGB8::~Block4x4Encoding_RGB8(void) {}
+    // ----------------------------------------------------------------------------------------------------
+    // initialization from the encoding bits of a previous encoding
+    // a_pblockParent points to the block associated with this encoding
+    // a_errormetric is used to choose the best encoding
+    // a_pafrgbaSource points to a 4x4 block subset of the source image
+    // a_paucEncodingBits points to the final encoding bits of a previous encoding
+    //
+    void Block4x4Encoding_RGB8::Decode(Block4x4 *a_pblockParent,
+        unsigned char *a_paucEncodingBits,
+        const ColorFloatRGBA *a_pafrgbaSource,
+        ErrorMetric a_errormetric,
+        uint16_t iterationCount )
+    {
+
+        // handle ETC1 modes
+        Block4x4Encoding_ETC1::Decode(a_pblockParent,
+            a_paucEncodingBits, a_pafrgbaSource,a_errormetric, iterationCount);
+
+        m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits;
+
+        // detect if there is a T, H or Planar mode present
+        if (m_pencodingbitsRGB8->differential.diff)
+        {
+            int iRed1 = (int)m_pencodingbitsRGB8->differential.red1;
+            int iDRed2 = m_pencodingbitsRGB8->differential.dred2;
+            int iRed2 = iRed1 + iDRed2;
+
+            int iGreen1 = (int)m_pencodingbitsRGB8->differential.green1;
+            int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2;
+            int iGreen2 = iGreen1 + iDGreen2;
+
+            int iBlue1 = (int)m_pencodingbitsRGB8->differential.blue1;
+            int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2;
+            int iBlue2 = iBlue1 + iDBlue2;
+
+            if (iRed2 < 0 || iRed2 > 31)
+            {
+                InitFromEncodingBits_T();
+            }
+            else if (iGreen2 < 0 || iGreen2 > 31)
+            {
+                InitFromEncodingBits_H();
+            }
+            else if (iBlue2 < 0 || iBlue2 > 31)
+            {
+                InitFromEncodingBits_Planar();
+            }
+        }
+
+    }
+
+    // ----------------------------------------------------------------------------------------------------
+    // initialization from the encoding bits of a previous encoding if T mode is detected
+    //
+    void Block4x4Encoding_RGB8::InitFromEncodingBits_T(void)
+    {
+
+        m_mode = MODE_T;
+
+        unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) +
+            m_pencodingbitsRGB8->t.red1b);
+        unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1;
+        unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1;
+
+        unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2;
+        unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2;
+        unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2;
+
+        m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1);
+        m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2);
+
+        m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db;
+
+        Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors();
+
+        DecodePixels_T();
+
+        CalcBlockError();
+
+    }
+
+    // ----------------------------------------------------------------------------------------------------
+    // initialization from the
encoding bits of a previous encoding if H mode is detected + // + void Block4x4Encoding_RGB8::InitFromEncodingBits_H(void) + { + + m_mode = MODE_H; + + unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1; + unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) + + m_pencodingbitsRGB8->h.green1b); + unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) + + (m_pencodingbitsRGB8->h.blue1b << 1) + + m_pencodingbitsRGB8->h.blue1c); + + unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2; + unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) + + m_pencodingbitsRGB8->h.green2b); + unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2; + + m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); + m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); + + // used to determine the LSB of the CW + unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1); + unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2); + + m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1); + if (uiRGB1 >= uiRGB2) + { + m_uiCW1++; + } + + Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); + + DecodePixels_H(); + + CalcBlockError(); + + } + + // ---------------------------------------------------------------------------------------------------- + // initialization from the encoding bits of a previous encoding if Planar mode is detected + // + void Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(void) + { + + m_mode = MODE_PLANAR; + + unsigned char ucOriginRed = m_pencodingbitsRGB8->planar.originRed; + unsigned char ucOriginGreen = (unsigned char)((m_pencodingbitsRGB8->planar.originGreen1 << 6) + + m_pencodingbitsRGB8->planar.originGreen2); + unsigned char ucOriginBlue = (unsigned char)((m_pencodingbitsRGB8->planar.originBlue1 << 5) + + (m_pencodingbitsRGB8->planar.originBlue2 << 3) + + (m_pencodingbitsRGB8->planar.originBlue3 << 1) + + m_pencodingbitsRGB8->planar.originBlue4); + + unsigned char ucHorizRed = (unsigned char)((m_pencodingbitsRGB8->planar.horizRed1 << 1) + + m_pencodingbitsRGB8->planar.horizRed2); + unsigned char ucHorizGreen = m_pencodingbitsRGB8->planar.horizGreen; + unsigned char ucHorizBlue = (unsigned char)((m_pencodingbitsRGB8->planar.horizBlue1 << 5) + + m_pencodingbitsRGB8->planar.horizBlue2); + + unsigned char ucVertRed = (unsigned char)((m_pencodingbitsRGB8->planar.vertRed1 << 3) + + m_pencodingbitsRGB8->planar.vertRed2); + unsigned char ucVertGreen = (unsigned char)((m_pencodingbitsRGB8->planar.vertGreen1 << 2) + + m_pencodingbitsRGB8->planar.vertGreen2); + unsigned char ucVertBlue = m_pencodingbitsRGB8->planar.vertBlue; + + m_frgbaColor1 = ColorFloatRGBA::ConvertFromR6G7B6(ucOriginRed, ucOriginGreen, ucOriginBlue); + m_frgbaColor2 = ColorFloatRGBA::ConvertFromR6G7B6(ucHorizRed, ucHorizGreen, ucHorizBlue); + m_frgbaColor3 = ColorFloatRGBA::ConvertFromR6G7B6(ucVertRed, ucVertGreen, ucVertBlue); + + DecodePixels_Planar(); + + CalcBlockError(); + + } + + // ---------------------------------------------------------------------------------------------------- + // perform a single encoding iteration + // replace the encoding if a better encoding was found + // subsequent iterations generally take longer for each iteration + // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort + // + void Block4x4Encoding_RGB8::PerformIteration(float a_fEffort) + { + 
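        // Summary of the effort cutoffs in the switch below (assuming a_fEffort is the
        // encoder's 0..100 quality setting): iterations 0-4 run unless a perfect encoding
        // ends the block early; effort <= 49.5 stops after iteration 5 (TryPlanar radius 1),
        // <= 59.5 after iteration 6 (TryTAndH radius 1), <= 69.5 / 79.5 / 89.5 after
        // TryDegenerates1/2/3, and higher effort runs all 11 iterations.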
assert(!m_boolDone); + + switch (m_uiEncodingIterations) + { + case 0: + Block4x4Encoding_ETC1::PerformFirstIteration(); + if (m_boolDone) + { + break; + } + + TryPlanar(0); + SetDoneIfPerfect(); + if (m_boolDone) + { + break; + } + + TryTAndH(0); + break; + + case 1: + Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); + break; + + case 2: + Block4x4Encoding_ETC1::TryIndividual(m_boolMostLikelyFlip, 1); + break; + + case 3: + Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); + break; + + case 4: + Block4x4Encoding_ETC1::TryIndividual(!m_boolMostLikelyFlip, 1); + break; + + case 5: + TryPlanar(1); + if (a_fEffort <= 49.5f) + { + m_boolDone = true; + } + break; + + case 6: + TryTAndH(1); + if (a_fEffort <= 59.5f) + { + m_boolDone = true; + } + break; + + case 7: + Block4x4Encoding_ETC1::TryDegenerates1(); + if (a_fEffort <= 69.5f) + { + m_boolDone = true; + } + break; + + case 8: + Block4x4Encoding_ETC1::TryDegenerates2(); + if (a_fEffort <= 79.5f) + { + m_boolDone = true; + } + break; + + case 9: + Block4x4Encoding_ETC1::TryDegenerates3(); + if (a_fEffort <= 89.5f) + { + m_boolDone = true; + } + break; + + case 10: + Block4x4Encoding_ETC1::TryDegenerates4(); + m_boolDone = true; + break; + + default: + assert(0); + break; + } + + m_uiEncodingIterations++; + + SetDoneIfPerfect(); + } + + // ---------------------------------------------------------------------------------------------------- + // try encoding in Planar mode + // save this encoding if it improves the error + // + void Block4x4Encoding_RGB8::TryPlanar(unsigned int a_uiRadius) + { + Block4x4Encoding_RGB8 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_PLANAR; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + } + + encodingTry.CalculatePlanarCornerColors(); + + encodingTry.DecodePixels_Planar(); + + encodingTry.CalcBlockError(); + + if (a_uiRadius > 0) + { + encodingTry.TwiddlePlanar(); + } + + if (encodingTry.m_fError < m_fError) + { + m_mode = MODE_PLANAR; + m_boolDiff = true; + m_boolFlip = false; + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_frgbaColor3 = encodingTry.m_frgbaColor3; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + } + + } + + // ---------------------------------------------------------------------------------------------------- + // try encoding in T mode or H mode + // save this encoding if it improves the error + // + void Block4x4Encoding_RGB8::TryTAndH(unsigned int a_uiRadius) + { + + CalculateBaseColorsForTAndH(); + + TryT(a_uiRadius); + + if (!IsDone()) + { + TryH(a_uiRadius); + } + } + + // ---------------------------------------------------------------------------------------------------- + // calculate original values for base colors + // store them in m_frgbaOriginalColor1 and m_frgbaOriginalColor2 + // + void Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(void) + { + + //bool boolRGBX = m_pblockParent->GetImageSource()->GetErrorMetric() == ErrorMetric::RGBX; + + ColorFloatRGBA frgbaBlockAverage = (m_frgbaSourceAverageLeft + m_frgbaSourceAverageRight) * 0.5f; + + // find pixel farthest from average gray line + unsigned int uiFarthestPixel = 0; + float fFarthestGrayDistance2 = 0.0f; + //unsigned int uiTransparentPixels = 0; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + // don't count transparent +// if 
(m_pafrgbaSource[uiPixel].fA == 0.0f && !boolRGBX) +// { +// uiTransparentPixels++; +// } +// else + { + float fGrayDistance2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], frgbaBlockAverage); + + if (fGrayDistance2 > fFarthestGrayDistance2) + { + uiFarthestPixel = uiPixel; + fFarthestGrayDistance2 = fGrayDistance2; + } + } + } + // a transparent block should not reach this method + //assert(uiTransparentPixels < PIXELS); + + // set the original base colors to: + // half way to the farthest pixel and + // the mirror color on the other side of the average + ColorFloatRGBA frgbaOffset = (m_pafrgbaSource[uiFarthestPixel] - frgbaBlockAverage) * 0.5f; + m_frgbaOriginalColor1_TAndH = (frgbaBlockAverage + frgbaOffset).QuantizeR4G4B4(); + m_frgbaOriginalColor2_TAndH = (frgbaBlockAverage - frgbaOffset).ClampRGB().QuantizeR4G4B4(); // the "other side" might be out of range + + // move base colors to find best fit + for (unsigned int uiIteration = 0; uiIteration < 10; uiIteration++) + { + // find the center of pixels closest to each color + float fPixelsCloserToColor1 = 0.0f; + ColorFloatRGBA frgbSumPixelsCloserToColor1; + float fPixelsCloserToColor2 = 0.0f; + ColorFloatRGBA frgbSumPixelsCloserToColor2; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + // don't count transparent pixels + // Can't do this, use premul to weight the colors before they are encoded + float alpha = 1.0f; // m_pafrgbaSource[uiPixel].fA; +// if (alpha == 0.0f) +// { +// continue; +// } + + float fGrayDistance2ToColor1 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor1_TAndH); + float fGrayDistance2ToColor2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor2_TAndH); + + ColorFloatRGBA frgbaAlphaWeightedSource = m_pafrgbaSource[uiPixel] * alpha; + frgbaAlphaWeightedSource.fA = 1.0f; + + if (fGrayDistance2ToColor1 <= fGrayDistance2ToColor2) + { + fPixelsCloserToColor1 += alpha; + frgbSumPixelsCloserToColor1 = frgbSumPixelsCloserToColor1 + frgbaAlphaWeightedSource; + } + else + { + fPixelsCloserToColor2 += alpha; + frgbSumPixelsCloserToColor2 = frgbSumPixelsCloserToColor2 + frgbaAlphaWeightedSource; + } + } + if (fPixelsCloserToColor1 == 0.0f || fPixelsCloserToColor2 == 0.0f) + { + break; + } + + // this doesn't scale alpha + ColorFloatRGBA frgbAvgColor1Pixels = (frgbSumPixelsCloserToColor1 * (1.0f / fPixelsCloserToColor1)).QuantizeR4G4B4(); + ColorFloatRGBA frgbAvgColor2Pixels = (frgbSumPixelsCloserToColor2 * (1.0f / fPixelsCloserToColor2)).QuantizeR4G4B4(); + + frgbAvgColor1Pixels.fA = 1.0f; + frgbAvgColor2Pixels.fA = 1.0f; + + if (frgbAvgColor1Pixels.fR == m_frgbaOriginalColor1_TAndH.fR && + frgbAvgColor1Pixels.fG == m_frgbaOriginalColor1_TAndH.fG && + frgbAvgColor1Pixels.fB == m_frgbaOriginalColor1_TAndH.fB && + + frgbAvgColor2Pixels.fR == m_frgbaOriginalColor2_TAndH.fR && + frgbAvgColor2Pixels.fG == m_frgbaOriginalColor2_TAndH.fG && + frgbAvgColor2Pixels.fB == m_frgbaOriginalColor2_TAndH.fB) + { + break; + } + + m_frgbaOriginalColor1_TAndH = frgbAvgColor1Pixels; + m_frgbaOriginalColor2_TAndH = frgbAvgColor2Pixels; + } + + } + + // ---------------------------------------------------------------------------------------------------- + // try encoding in T mode + // save this encoding if it improves the error + // + // since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently + // better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower + // + void Block4x4Encoding_RGB8::TryT(unsigned int 
a_uiRadius) + { + Block4x4Encoding_RGB8 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_T; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + encodingTry.m_fError = FLT_MAX; + } + + int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); + int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); + int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); + + int iMinRed1 = iColor1Red - (int)a_uiRadius; + int iMinGreen1 = iColor1Green - (int)a_uiRadius; + int iMinBlue1 = iColor1Blue - (int)a_uiRadius; + + int iMaxRed1 = iColor1Red + (int)a_uiRadius; + int iMaxGreen1 = iColor1Green + (int)a_uiRadius; + int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; + + if (iMinRed1 < 0) + { + iMinRed1 = 0; + } + if (iMinGreen1 < 0) + { + iMinGreen1 = 0; + } + if (iMinBlue1 < 0) + { + iMinBlue1 = 0; + } + + if (iMaxRed1 > 15) + { + iMaxRed1 = 15; + } + if (iMaxGreen1 > 15) + { + iMaxGreen1 = 15; + } + if (iMaxBlue1 > 15) + { + iMaxBlue1 = 15; + } + + int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); + int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); + int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); + + int iMinRed2 = iColor2Red - (int)a_uiRadius; + int iMinGreen2 = iColor2Green - (int)a_uiRadius; + int iMinBlue2 = iColor2Blue - (int)a_uiRadius; + + int iMaxRed2 = iColor2Red + (int)a_uiRadius; + int iMaxGreen2 = iColor2Green + (int)a_uiRadius; + int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; + + if (iMinRed2 < 0) + { + iMinRed2 = 0; + } + if (iMinGreen2 < 0) + { + iMinGreen2 = 0; + } + if (iMinBlue2 < 0) + { + iMinBlue2 = 0; + } + + if (iMaxRed2 > 15) + { + iMaxRed2 = 15; + } + if (iMaxGreen2 > 15) + { + iMaxGreen2 = 15; + } + if (iMaxBlue2 > 15) + { + iMaxBlue2 = 15; + } + + bool isGray = m_errormetric == GRAY || !m_pblockParent->HasColorPixels(); + + for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) + { + encodingTry.m_uiCW1 = uiDistance; + + // twiddle m_frgbaOriginalColor2_TAndH + // twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector + // + for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) + { + for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) + { + for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) + { + if (isGray && (iRed2 != iGreen2 || iRed2 != iBlue2)) + { + continue; + } + + for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) + { + if (uiBaseColorSwaps == 0) + { + encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; + encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); + } + else + { + encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); + encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH; + } + + encodingTry.TryT_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + SetDoneIfPerfect(); + if (IsDone()) + { + return; + } + } 
+ } + } + } + } + + // twiddle m_frgbaOriginalColor1_TAndH + for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) + { + for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) + { + for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) + { + if (isGray && (iRed1 != iGreen1 || iRed1 != iBlue1)) + { + continue; + } + + for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) + { + if (uiBaseColorSwaps == 0) + { + encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); + encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; + } + else + { + encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH; + encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); + } + + encodingTry.TryT_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + SetDoneIfPerfect(); + if (IsDone()) + { + return; + } + } + } + } + } + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // find best selector combination for TryT + // called on an encodingTry + // + void Block4x4Encoding_RGB8::TryT_BestSelectorCombination(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + + unsigned int auiBestPixelSelectors[PIXELS]; + float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, + FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; + ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; + ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; + + assert(SELECTORS == 4); + afrgbaDecodedPixel[0] = m_frgbaColor1; + afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB(); + afrgbaDecodedPixel[2] = m_frgbaColor2; + afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); + + // try each selector + for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) + { + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + + float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], + uiPixel); + + if (fPixelError < afBestPixelErrors[uiPixel]) + { + afBestPixelErrors[uiPixel] = fPixelError; + auiBestPixelSelectors[uiPixel] = uiSelector; + afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; + } + } + } + + + // add up all of the pixel errors + float fBlockError = 0.0f; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + fBlockError += afBestPixelErrors[uiPixel]; + } + + if (fBlockError < m_fError) + { + m_fError = fBlockError; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; + } + } + + } + + // ---------------------------------------------------------------------------------------------------- + // try encoding in T mode + // save this encoding if it improves the error + // + // since all pixels 
use the distance table, color1 and color2 can NOT be twiddled independently + // TWIDDLE_RADIUS of 2 is WAY too slow + // + void Block4x4Encoding_RGB8::TryH(unsigned int a_uiRadius) + { + Block4x4Encoding_RGB8 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_H; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + encodingTry.m_fError = FLT_MAX; + } + + int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); + int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); + int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); + + int iMinRed1 = iColor1Red - (int)a_uiRadius; + int iMaxRed1 = iColor1Red + (int)a_uiRadius; + int iMinGreen1 = iColor1Green - (int)a_uiRadius; + int iMaxGreen1 = iColor1Green + (int)a_uiRadius; + int iMinBlue1 = iColor1Blue - (int)a_uiRadius; + int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; + + if (iMinRed1 < 0) + { + iMinRed1 = 0; + } + if (iMaxRed1 > 15) + { + iMaxRed1 = 15; + } + if (iMinGreen1 < 0) + { + iMinGreen1 = 0; + } + if (iMaxGreen1 > 15) + { + iMaxGreen1 = 15; + } + if (iMinBlue1 < 0) + { + iMinBlue1 = 0; + } + if (iMaxBlue1 > 15) + { + iMaxBlue1 = 15; + } + + int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); + int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); + int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); + + int iMinRed2 = iColor2Red - (int)a_uiRadius; + int iMaxRed2 = iColor2Red + (int)a_uiRadius; + int iMinGreen2 = iColor2Green - (int)a_uiRadius; + int iMaxGreen2 = iColor2Green + (int)a_uiRadius; + int iMinBlue2 = iColor2Blue - (int)a_uiRadius; + int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; + + if (iMinRed2 < 0) + { + iMinRed2 = 0; + } + if (iMaxRed2 > 15) + { + iMaxRed2 = 15; + } + + if (iMinGreen2 < 0) + { + iMinGreen2 = 0; + } + if (iMaxGreen2 > 15) + { + iMaxGreen2 = 15; + } + + if (iMinBlue2 < 0) + { + iMinBlue2 = 0; + } + if (iMaxBlue2 > 15) + { + iMaxBlue2 = 15; + } + + bool isGray = m_errormetric == GRAY || !m_pblockParent->HasColorPixels(); + + for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) + { + encodingTry.m_uiCW1 = uiDistance; + + // twiddle m_frgbaOriginalColor1_TAndH + for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) + { + for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) + { + for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) + { + // gray only iterates red + if (isGray && (iRed1 != iGreen1 || iRed1 != iBlue1)) + { + continue; + } + + encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); + encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; + + // if color1 == color2, H encoding issues can pop up, so abort + if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue) + { + continue; + } + + encodingTry.TryH_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + SetDoneIfPerfect(); + if (IsDone()) + { + return; + } + } + } + } + } + + // twiddle m_frgbaOriginalColor2_TAndH + for (int iRed2 
= iMinRed2; iRed2 <= iMaxRed2; iRed2++) + { + for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) + { + for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) + { + // gray only iterates red + if (isGray && (iRed2 != iGreen2 || iRed2 != iBlue2)) + { + continue; + } + + encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; + encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); + + // if color1 == color2, H encoding issues can pop up, so abort + if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue) + { + continue; + } + + encodingTry.TryH_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + SetDoneIfPerfect(); + if (IsDone()) + { + return; + } + } + } + } + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // find best selector combination for TryH + // called on an encodingTry + // + void Block4x4Encoding_RGB8::TryH_BestSelectorCombination(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + + unsigned int auiBestPixelSelectors[PIXELS]; + float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, + FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; + ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; + ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; + + assert(SELECTORS == 4); + afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB(); + afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB(); + afrgbaDecodedPixel[2] = (m_frgbaColor2 + fDistance).ClampRGB(); + afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); + + // try each selector + for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) + { + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + + float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], + uiPixel); + + if (fPixelError < afBestPixelErrors[uiPixel]) + { + afBestPixelErrors[uiPixel] = fPixelError; + auiBestPixelSelectors[uiPixel] = uiSelector; + afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; + } + } + } + + + // add up all of the pixel errors + float fBlockError = 0.0f; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + fBlockError += afBestPixelErrors[uiPixel]; + } + + if (m_fError > fBlockError) + { + m_fError = fBlockError; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; + } + } + + } + + // ---------------------------------------------------------------------------------------------------- + // use linear regression to find the best fit for colors along the edges of the 4x4 block + // + void Block4x4Encoding_RGB8::CalculatePlanarCornerColors(void) + { + ColorFloatRGBA afrgbaRegression[MAX_PLANAR_REGRESSION_SIZE]; + ColorFloatRGBA frgbaSlope; + 
ColorFloatRGBA frgbaOffset; + + // top edge + afrgbaRegression[0] = m_pafrgbaSource[0]; + afrgbaRegression[1] = m_pafrgbaSource[4]; + afrgbaRegression[2] = m_pafrgbaSource[8]; + afrgbaRegression[3] = m_pafrgbaSource[12]; + ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); + m_frgbaColor1 = frgbaOffset; + m_frgbaColor2 = (frgbaSlope * 4.0f) + frgbaOffset; + + // left edge + afrgbaRegression[0] = m_pafrgbaSource[0]; + afrgbaRegression[1] = m_pafrgbaSource[1]; + afrgbaRegression[2] = m_pafrgbaSource[2]; + afrgbaRegression[3] = m_pafrgbaSource[3]; + ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); + m_frgbaColor1 = (m_frgbaColor1 + frgbaOffset) * 0.5f; // average with top edge + m_frgbaColor3 = (frgbaSlope * 4.0f) + frgbaOffset; + + // right edge + afrgbaRegression[0] = m_pafrgbaSource[12]; + afrgbaRegression[1] = m_pafrgbaSource[13]; + afrgbaRegression[2] = m_pafrgbaSource[14]; + afrgbaRegression[3] = m_pafrgbaSource[15]; + ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); + m_frgbaColor2 = (m_frgbaColor2 + frgbaOffset) * 0.5f; // average with top edge + + // bottom edge + afrgbaRegression[0] = m_pafrgbaSource[3]; + afrgbaRegression[1] = m_pafrgbaSource[7]; + afrgbaRegression[2] = m_pafrgbaSource[11]; + afrgbaRegression[3] = m_pafrgbaSource[15]; + ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset); + m_frgbaColor3 = (m_frgbaColor3 + frgbaOffset) * 0.5f; // average with left edge + + // quantize corner colors to 6/7/6 + m_frgbaColor1 = m_frgbaColor1.QuantizeR6G7B6(); + m_frgbaColor2 = m_frgbaColor2.QuantizeR6G7B6(); + m_frgbaColor3 = m_frgbaColor3.QuantizeR6G7B6(); + + } + + // ---------------------------------------------------------------------------------------------------- + // try different corner colors by slightly changing R, G and B independently + // + // R, G and B decoding and errors are independent, so R, G and B twiddles can be independent + // + // return true if improvement + // + bool Block4x4Encoding_RGB8::TwiddlePlanar(void) + { + bool boolImprovement = false; + bool isGray = m_errormetric == GRAY || !m_pblockParent->HasColorPixels(); + + while (TwiddlePlanarR()) + { + boolImprovement = true; + } + + if (!isGray) { + while (TwiddlePlanarG()) + { + boolImprovement = true; + } + + while (TwiddlePlanarB()) + { + boolImprovement = true; + } + } + + return boolImprovement; + } + + // ---------------------------------------------------------------------------------------------------- + // try different corner colors by slightly changing R + // + bool Block4x4Encoding_RGB8::TwiddlePlanarR() + { + bool boolImprovement = false; + + Block4x4Encoding_RGB8 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_PLANAR; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + } + + int iOriginRed = encodingTry.m_frgbaColor1.IntRed(63.0f); + int iHorizRed = encodingTry.m_frgbaColor2.IntRed(63.0f); + int iVertRed = encodingTry.m_frgbaColor3.IntRed(63.0f); + + for (int iTryOriginRed = iOriginRed - 1; iTryOriginRed <= iOriginRed + 1; iTryOriginRed++) + { + // check for out of range + if (iTryOriginRed < 0 || iTryOriginRed > 63) + { + continue; + } + + encodingTry.m_frgbaColor1.fR = ((iTryOriginRed << 2) + (iTryOriginRed >> 4)) / 255.0f; + + for (int iTryHorizRed = iHorizRed - 1; iTryHorizRed <= iHorizRed + 1; iTryHorizRed++) + { + // check for out of range + if (iTryHorizRed < 0 || iTryHorizRed > 63) + { + continue; + } + + encodingTry.m_frgbaColor2.fR = ((iTryHorizRed << 2) + (iTryHorizRed >> 4)) / 255.0f; 
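			// The ((v << 2) + (v >> 4)) expression above is plain 6-to-8 bit replication:
			// the two high bits are copied into the low bits, so 0 maps to 0 and 63 maps
			// to 255 before normalizing to [0,1]. For example, 32 expands to
			// (32 << 2) + (32 >> 4) = 128 + 2 = 130. The 7-bit green channel in
			// TwiddlePlanarG uses the analogous ((v << 1) + (v >> 6)) expansion.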
+ + for (int iTryVertRed = iVertRed - 1; iTryVertRed <= iVertRed + 1; iTryVertRed++) + { + // check for out of range + if (iTryVertRed < 0 || iTryVertRed > 63) + { + continue; + } + + // don't bother with null twiddle + if (iTryOriginRed == iOriginRed && iTryHorizRed == iHorizRed && iTryVertRed == iVertRed) + { + continue; + } + + encodingTry.m_frgbaColor3.fR = ((iTryVertRed << 2) + (iTryVertRed >> 4)) / 255.0f; + + encodingTry.DecodePixels_Planar(); + + encodingTry.CalcBlockError(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = MODE_PLANAR; + m_boolDiff = true; + m_boolFlip = false; + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_frgbaColor3 = encodingTry.m_frgbaColor3; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + boolImprovement = true; + } + } + } + } + + return boolImprovement; + } + + // ---------------------------------------------------------------------------------------------------- + // try different corner colors by slightly changing G + // + bool Block4x4Encoding_RGB8::TwiddlePlanarG() + { + bool boolImprovement = false; + + Block4x4Encoding_RGB8 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_PLANAR; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + } + + int iOriginGreen = encodingTry.m_frgbaColor1.IntGreen(127.0f); + int iHorizGreen = encodingTry.m_frgbaColor2.IntGreen(127.0f); + int iVertGreen = encodingTry.m_frgbaColor3.IntGreen(127.0f); + + for (int iTryOriginGreen = iOriginGreen - 1; iTryOriginGreen <= iOriginGreen + 1; iTryOriginGreen++) + { + // check for out of range + if (iTryOriginGreen < 0 || iTryOriginGreen > 127) + { + continue; + } + + encodingTry.m_frgbaColor1.fG = ((iTryOriginGreen << 1) + (iTryOriginGreen >> 6)) / 255.0f; + + for (int iTryHorizGreen = iHorizGreen - 1; iTryHorizGreen <= iHorizGreen + 1; iTryHorizGreen++) + { + // check for out of range + if (iTryHorizGreen < 0 || iTryHorizGreen > 127) + { + continue; + } + + encodingTry.m_frgbaColor2.fG = ((iTryHorizGreen << 1) + (iTryHorizGreen >> 6)) / 255.0f; + + for (int iTryVertGreen = iVertGreen - 1; iTryVertGreen <= iVertGreen + 1; iTryVertGreen++) + { + // check for out of range + if (iTryVertGreen < 0 || iTryVertGreen > 127) + { + continue; + } + + // don't bother with null twiddle + if (iTryOriginGreen == iOriginGreen && + iTryHorizGreen == iHorizGreen && + iTryVertGreen == iVertGreen) + { + continue; + } + + encodingTry.m_frgbaColor3.fG = ((iTryVertGreen << 1) + (iTryVertGreen >> 6)) / 255.0f; + + encodingTry.DecodePixels_Planar(); + + encodingTry.CalcBlockError(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = MODE_PLANAR; + m_boolDiff = true; + m_boolFlip = false; + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_frgbaColor3 = encodingTry.m_frgbaColor3; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + boolImprovement = true; + } + } + } + } + + return boolImprovement; + } + + // ---------------------------------------------------------------------------------------------------- + // try different corner colors by slightly changing B + // + bool Block4x4Encoding_RGB8::TwiddlePlanarB() + { + bool boolImprovement = false; + + Block4x4Encoding_RGB8 encodingTry = *this; 
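		// As in TwiddlePlanarR/G above, this walks the +/-1 neighborhood (up to 27
		// combinations) of the quantized origin, horizontal and vertical blue values,
		// keeping any combination that lowers the block error; TwiddlePlanar() keeps
		// calling it until no further improvement is found.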
+ + // init "try" + { + encodingTry.m_mode = MODE_PLANAR; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + } + + int iOriginBlue = encodingTry.m_frgbaColor1.IntBlue(63.0f); + int iHorizBlue = encodingTry.m_frgbaColor2.IntBlue(63.0f); + int iVertBlue = encodingTry.m_frgbaColor3.IntBlue(63.0f); + + for (int iTryOriginBlue = iOriginBlue - 1; iTryOriginBlue <= iOriginBlue + 1; iTryOriginBlue++) + { + // check for out of range + if (iTryOriginBlue < 0 || iTryOriginBlue > 63) + { + continue; + } + + encodingTry.m_frgbaColor1.fB = ((iTryOriginBlue << 2) + (iTryOriginBlue >> 4)) / 255.0f; + + for (int iTryHorizBlue = iHorizBlue - 1; iTryHorizBlue <= iHorizBlue + 1; iTryHorizBlue++) + { + // check for out of range + if (iTryHorizBlue < 0 || iTryHorizBlue > 63) + { + continue; + } + + encodingTry.m_frgbaColor2.fB = ((iTryHorizBlue << 2) + (iTryHorizBlue >> 4)) / 255.0f; + + for (int iTryVertBlue = iVertBlue - 1; iTryVertBlue <= iVertBlue + 1; iTryVertBlue++) + { + // check for out of range + if (iTryVertBlue < 0 || iTryVertBlue > 63) + { + continue; + } + + // don't bother with null twiddle + if (iTryOriginBlue == iOriginBlue && iTryHorizBlue == iHorizBlue && iTryVertBlue == iVertBlue) + { + continue; + } + + encodingTry.m_frgbaColor3.fB = ((iTryVertBlue << 2) + (iTryVertBlue >> 4)) / 255.0f; + + encodingTry.DecodePixels_Planar(); + + encodingTry.CalcBlockError(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = MODE_PLANAR; + m_boolDiff = true; + m_boolFlip = false; + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_frgbaColor3 = encodingTry.m_frgbaColor3; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + + boolImprovement = true; + } + } + } + } + + return boolImprovement; + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state + // + void Block4x4Encoding_RGB8::SetEncodingBits(void) + { + + switch (m_mode) + { + case MODE_ETC1: + Block4x4Encoding_ETC1::SetEncodingBits(); + break; + + case MODE_T: + SetEncodingBits_T(); + break; + + case MODE_H: + SetEncodingBits_H(); + break; + + case MODE_PLANAR: + SetEncodingBits_Planar(); + break; + + default: + assert(false); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state for T mode + // + void Block4x4Encoding_RGB8::SetEncodingBits_T(void) + { + static const bool SANITY_CHECK = true; + + assert(m_mode == MODE_T); + assert(m_boolDiff == true); + + unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); + unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); + unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); + + unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); + unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); + unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); + + m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2; + m_pencodingbitsRGB8->t.red1b = uiRed1; + m_pencodingbitsRGB8->t.green1 = uiGreen1; + m_pencodingbitsRGB8->t.blue1 = uiBlue1; + + m_pencodingbitsRGB8->t.red2 = uiRed2; + m_pencodingbitsRGB8->t.green2 = uiGreen2; + m_pencodingbitsRGB8->t.blue2 = uiBlue2; + + m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1; + m_pencodingbitsRGB8->t.db = 
m_uiCW1; + + m_pencodingbitsRGB8->t.diff = 1; + + Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); + + // create an invalid R differential to trigger T mode + m_pencodingbitsRGB8->t.detect1 = 0; + m_pencodingbitsRGB8->t.detect2 = 0; + int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + if (iRed2 >= 4) + { + m_pencodingbitsRGB8->t.detect1 = 7; + m_pencodingbitsRGB8->t.detect2 = 0; + } + else + { + m_pencodingbitsRGB8->t.detect1 = 0; + m_pencodingbitsRGB8->t.detect2 = 1; + } + + if (SANITY_CHECK) + { + iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + + // make sure red overflows + assert(iRed2 < 0 || iRed2 > 31); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state for H mode + // + // colors and selectors may need to swap in order to generate lsb of distance index + // + void Block4x4Encoding_RGB8::SetEncodingBits_H(void) + { + static const bool SANITY_CHECK = true; + + assert(m_mode == MODE_H); + assert(m_boolDiff == true); + + unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); + unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); + unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); + + unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); + unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); + unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); + + unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; + unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; + + bool boolOddDistance = m_uiCW1 & 1; + bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance; + + if (boolSwapColors) + { + m_pencodingbitsRGB8->h.red1 = uiRed2; + m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1; + m_pencodingbitsRGB8->h.green1b = uiGreen2; + m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3; + m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1; + m_pencodingbitsRGB8->h.blue1c = uiBlue2; + + m_pencodingbitsRGB8->h.red2 = uiRed1; + m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1; + m_pencodingbitsRGB8->h.green2b = uiGreen1; + m_pencodingbitsRGB8->h.blue2 = uiBlue1; + + m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; + m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; + } + else + { + m_pencodingbitsRGB8->h.red1 = uiRed1; + m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1; + m_pencodingbitsRGB8->h.green1b = uiGreen1; + m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3; + m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1; + m_pencodingbitsRGB8->h.blue1c = uiBlue1; + + m_pencodingbitsRGB8->h.red2 = uiRed2; + m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1; + m_pencodingbitsRGB8->h.green2b = uiGreen2; + m_pencodingbitsRGB8->h.blue2 = uiBlue2; + + m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; + m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; + } + + m_pencodingbitsRGB8->h.diff = 1; + + Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); + + if (boolSwapColors) + { + m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF; + } + + // create an invalid R differential to trigger T mode + m_pencodingbitsRGB8->h.detect1 = 0; + m_pencodingbitsRGB8->h.detect2 = 0; + m_pencodingbitsRGB8->h.detect3 = 0; + int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; + if (iRed2 < 0 || iRed2 > 
31) + { + m_pencodingbitsRGB8->h.detect1 = 1; + } + if (iGreen2 >= 4) + { + m_pencodingbitsRGB8->h.detect2 = 7; + m_pencodingbitsRGB8->h.detect3 = 0; + } + else + { + m_pencodingbitsRGB8->h.detect2 = 0; + m_pencodingbitsRGB8->h.detect3 = 1; + } + + if (SANITY_CHECK) + { + iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; + + // make sure red doesn't overflow and green does + assert(iRed2 >= 0 && iRed2 <= 31); + assert(iGreen2 < 0 || iGreen2 > 31); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state for Planar mode + // + void Block4x4Encoding_RGB8::SetEncodingBits_Planar(void) + { + static const bool SANITY_CHECK = true; + + assert(m_mode == MODE_PLANAR); + assert(m_boolDiff == true); + + unsigned int uiOriginRed = (unsigned int)m_frgbaColor1.IntRed(63.0f); + unsigned int uiOriginGreen = (unsigned int)m_frgbaColor1.IntGreen(127.0f); + unsigned int uiOriginBlue = (unsigned int)m_frgbaColor1.IntBlue(63.0f); + + unsigned int uiHorizRed = (unsigned int)m_frgbaColor2.IntRed(63.0f); + unsigned int uiHorizGreen = (unsigned int)m_frgbaColor2.IntGreen(127.0f); + unsigned int uiHorizBlue = (unsigned int)m_frgbaColor2.IntBlue(63.0f); + + unsigned int uiVertRed = (unsigned int)m_frgbaColor3.IntRed(63.0f); + unsigned int uiVertGreen = (unsigned int)m_frgbaColor3.IntGreen(127.0f); + unsigned int uiVertBlue = (unsigned int)m_frgbaColor3.IntBlue(63.0f); + + m_pencodingbitsRGB8->planar.originRed = uiOriginRed; + m_pencodingbitsRGB8->planar.originGreen1 = uiOriginGreen >> 6; + m_pencodingbitsRGB8->planar.originGreen2 = uiOriginGreen; + m_pencodingbitsRGB8->planar.originBlue1 = uiOriginBlue >> 5; + m_pencodingbitsRGB8->planar.originBlue2 = uiOriginBlue >> 3; + m_pencodingbitsRGB8->planar.originBlue3 = uiOriginBlue >> 1; + m_pencodingbitsRGB8->planar.originBlue4 = uiOriginBlue; + + m_pencodingbitsRGB8->planar.horizRed1 = uiHorizRed >> 1; + m_pencodingbitsRGB8->planar.horizRed2 = uiHorizRed; + m_pencodingbitsRGB8->planar.horizGreen = uiHorizGreen; + m_pencodingbitsRGB8->planar.horizBlue1 = uiHorizBlue >> 5; + m_pencodingbitsRGB8->planar.horizBlue2 = uiHorizBlue; + + m_pencodingbitsRGB8->planar.vertRed1 = uiVertRed >> 3; + m_pencodingbitsRGB8->planar.vertRed2 = uiVertRed; + m_pencodingbitsRGB8->planar.vertGreen1 = uiVertGreen >> 2; + m_pencodingbitsRGB8->planar.vertGreen2 = uiVertGreen; + m_pencodingbitsRGB8->planar.vertBlue = uiVertBlue; + + m_pencodingbitsRGB8->planar.diff = 1; + + // create valid RG differentials and an invalid B differential to trigger planar mode + m_pencodingbitsRGB8->planar.detect1 = 0; + m_pencodingbitsRGB8->planar.detect2 = 0; + m_pencodingbitsRGB8->planar.detect3 = 0; + m_pencodingbitsRGB8->planar.detect4 = 0; + int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; + int iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2; + if (iRed2 < 0 || iRed2 > 31) + { + m_pencodingbitsRGB8->planar.detect1 = 1; + } + if (iGreen2 < 0 || iGreen2 > 31) + { + m_pencodingbitsRGB8->planar.detect2 = 1; + } + if (iBlue2 >= 4) + { + m_pencodingbitsRGB8->planar.detect3 = 7; + m_pencodingbitsRGB8->planar.detect4 = 0; + } + else + 
{ + m_pencodingbitsRGB8->planar.detect3 = 0; + m_pencodingbitsRGB8->planar.detect4 = 1; + } + + if (SANITY_CHECK) + { + iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; + iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2; + + // make sure red and green don't overflow and blue does + assert(iRed2 >= 0 && iRed2 <= 31); + assert(iGreen2 >= 0 && iGreen2 <= 31); + assert(iBlue2 < 0 || iBlue2 > 31); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the decoded colors and decoded alpha based on the encoding state for T mode + // + void Block4x4Encoding_RGB8::DecodePixels_T(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + switch (m_auiSelectors[uiPixel]) + { + case 0: + m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1; + break; + + case 1: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); + break; + + case 2: + m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2; + break; + + case 3: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); + break; + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the decoded colors and decoded alpha based on the encoding state for H mode + // + void Block4x4Encoding_RGB8::DecodePixels_H(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + switch (m_auiSelectors[uiPixel]) + { + case 0: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB(); + break; + + case 1: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB(); + break; + + case 2: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); + break; + + case 3: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); + break; + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the decoded colors and decoded alpha based on the encoding state for Planar mode + // + void Block4x4Encoding_RGB8::DecodePixels_Planar(void) + { + + int iRO = m_frgbaColor1.IntRed(255.0f); + int iGO = m_frgbaColor1.IntGreen(255.0f); + int iBO = m_frgbaColor1.IntBlue(255.0f); + + int iRH = m_frgbaColor2.IntRed(255.0f); + int iGH = m_frgbaColor2.IntGreen(255.0f); + int iBH = m_frgbaColor2.IntBlue(255.0f); + + int iRV = m_frgbaColor3.IntRed(255.0f); + int iGV = m_frgbaColor3.IntGreen(255.0f); + int iBV = m_frgbaColor3.IntBlue(255.0f); + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int iX = (int)(uiPixel >> 2); + int iY = (int)(uiPixel & 3); + + int iR = (iX*(iRH - iRO) + iY*(iRV - iRO) + 4*iRO + 2) >> 2; + int iG = (iX*(iGH - iGO) + iY*(iGV - iGO) + 4*iGO + 2) >> 2; + int iB = (iX*(iBH - iBO) + iY*(iBV - iBO) + 4*iBO + 2) >> 2; + + ColorFloatRGBA frgba; + frgba.fR = (float)iR / 255.0f; + frgba.fG = (float)iG / 255.0f; + frgba.fB = (float)iB / 255.0f; + frgba.fA = 1.0f; + + m_afrgbaDecodedColors[uiPixel] = frgba.ClampRGB(); + } + + } + + // 
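	// A note on DecodePixels_Planar above: each channel is interpolated across the
	// block as C(x,y) = (x*(CH - CO) + y*(CV - CO) + 4*CO + 2) >> 2, where CO, CH
	// and CV are the 8-bit origin, horizontal and vertical colors, and the +2 gives
	// round-to-nearest on the divide by 4. For example, with CO=100, CH=200, CV=100,
	// the pixel at (x=2, y=0) decodes to (2*100 + 0 + 4*100 + 2) >> 2 = 150.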
---------------------------------------------------------------------------------------------------- + // perform a linear regression for the a_uiPixels in a_pafrgbaPixels[] + // + // output the closest color line using a_pfrgbaSlope and a_pfrgbaOffset + // + void Block4x4Encoding_RGB8::ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels, + ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset) + { + typedef struct + { + float f[4]; + } Float4; + + Float4 *paf4Pixels = (Float4 *)(a_pafrgbaPixels); + Float4 *pf4Slope = (Float4 *)(a_pfrgbaSlope); + Float4 *pf4Offset = (Float4 *)(a_pfrgbaOffset); + + float afX[MAX_PLANAR_REGRESSION_SIZE]; + float afY[MAX_PLANAR_REGRESSION_SIZE]; + + // handle r, g and b separately. don't bother with a + for (unsigned int uiComponent = 0; uiComponent < 3; uiComponent++) + { + for (unsigned int uiPixel = 0; uiPixel < a_uiPixels; uiPixel++) + { + afX[uiPixel] = (float)uiPixel; + afY[uiPixel] = paf4Pixels[uiPixel].f[uiComponent]; + + } + Etc::Regression(afX, afY, a_uiPixels, + &(pf4Slope->f[uiComponent]), &(pf4Offset->f[uiComponent])); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // +} diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.h b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.h index c4d6c3e9..f49222c7 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.h +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.h @@ -1,97 +1,97 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcBlock4x4Encoding_ETC1.h" - -namespace Etc -{ - - class Block4x4Encoding_RGB8 : public Block4x4Encoding_ETC1 - { - public: - - Block4x4Encoding_RGB8(void); - virtual ~Block4x4Encoding_RGB8(void); - - virtual void Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - - ErrorMetric a_errormetric, - uint16_t iterationCount); - - virtual void PerformIteration(float a_fEffort); - - virtual void SetEncodingBits(void); - -// inline ColorFloatRGBA GetColor3(void) const -// { -// return m_frgbaColor3; -// } - - protected: - - static const unsigned int PLANAR_CORNER_COLORS = 3; - static const unsigned int MAX_PLANAR_REGRESSION_SIZE = 4; - static const unsigned int TH_DISTANCES = 8; - - static float s_afTHDistanceTable[TH_DISTANCES]; - - void TryPlanar(unsigned int a_uiRadius); - void TryTAndH(unsigned int a_uiRadius); - - void InitFromEncodingBits_Planar(void); - - ColorFloatRGBA m_frgbaColor3; // used for planar - - void SetEncodingBits_T(void); - void SetEncodingBits_H(void); - void SetEncodingBits_Planar(void); - - // state shared between iterations - ColorFloatRGBA m_frgbaOriginalColor1_TAndH; - ColorFloatRGBA m_frgbaOriginalColor2_TAndH; - - void CalculateBaseColorsForTAndH(void); - void TryT(unsigned int a_uiRadius); - void TryT_BestSelectorCombination(void); - void TryH(unsigned int a_uiRadius); - void TryH_BestSelectorCombination(void); - - protected: - - void InitFromEncodingBits_T(void); - void InitFromEncodingBits_H(void); - - void CalculatePlanarCornerColors(void); - - void ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels, - ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset); - - bool TwiddlePlanar(void); - bool TwiddlePlanarR(); - bool TwiddlePlanarG(); - bool TwiddlePlanarB(); - - void DecodePixels_T(void); - void DecodePixels_H(void); - void DecodePixels_Planar(void); - - }; - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "EtcBlock4x4Encoding_ETC1.h" + +namespace Etc +{ + + class Block4x4Encoding_RGB8 : public Block4x4Encoding_ETC1 + { + public: + + Block4x4Encoding_RGB8(void); + virtual ~Block4x4Encoding_RGB8(void); + + virtual void Decode(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + + ErrorMetric a_errormetric, + uint16_t iterationCount); + + virtual void PerformIteration(float a_fEffort); + + virtual void SetEncodingBits(void); + +// inline ColorFloatRGBA GetColor3(void) const +// { +// return m_frgbaColor3; +// } + + protected: + + static const unsigned int PLANAR_CORNER_COLORS = 3; + static const unsigned int MAX_PLANAR_REGRESSION_SIZE = 4; + static const unsigned int TH_DISTANCES = 8; + + static float s_afTHDistanceTable[TH_DISTANCES]; + + void TryPlanar(unsigned int a_uiRadius); + void TryTAndH(unsigned int a_uiRadius); + + void InitFromEncodingBits_Planar(void); + + ColorFloatRGBA m_frgbaColor3; // used for planar + + void SetEncodingBits_T(void); + void SetEncodingBits_H(void); + void SetEncodingBits_Planar(void); + + // state shared between iterations + ColorFloatRGBA m_frgbaOriginalColor1_TAndH; + ColorFloatRGBA m_frgbaOriginalColor2_TAndH; + + void CalculateBaseColorsForTAndH(void); + void TryT(unsigned int a_uiRadius); + void TryT_BestSelectorCombination(void); + void TryH(unsigned int a_uiRadius); + void TryH_BestSelectorCombination(void); + + protected: + + void InitFromEncodingBits_T(void); + void InitFromEncodingBits_H(void); + + void CalculatePlanarCornerColors(void); + + void ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels, + ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset); + + bool TwiddlePlanar(void); + bool TwiddlePlanarR(); + bool TwiddlePlanarG(); + bool TwiddlePlanarB(); + + void DecodePixels_T(void); + void DecodePixels_H(void); + void DecodePixels_Planar(void); + + }; + +} // namespace Etc diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp index f6d70264..41dfc0a4 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp @@ -1,1829 +1,1829 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RGB8A1.cpp contains: - Block4x4Encoding_RGB8A1 - Block4x4Encoding_RGB8A1_Opaque - Block4x4Encoding_RGB8A1_Transparent - -These encoders are used when targetting file format RGB8A1. 
- -Block4x4Encoding_RGB8A1_Opaque is used when all pixels in the 4x4 block are opaque -Block4x4Encoding_RGB8A1_Transparent is used when all pixels in the 4x4 block are transparent -Block4x4Encoding_RGB8A1 is used when there is a mixture of alphas in the 4x4 block - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RGB8A1.h" - -#include "EtcBlock4x4.h" -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4Encoding_RGB8.h" - -#include -#include -#include - -namespace Etc -{ - - // #################################################################################################### - // Block4x4Encoding_RGB8A1 - // #################################################################################################### - - float Block4x4Encoding_RGB8A1::s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS] = - { - { 0.0f / 255.0f, 8.0f / 255.0f, 0.0f / 255.0f, -8.0f / 255.0f }, - { 0.0f / 255.0f, 17.0f / 255.0f, 0.0f / 255.0f, -17.0f / 255.0f }, - { 0.0f / 255.0f, 29.0f / 255.0f, 0.0f / 255.0f, -29.0f / 255.0f }, - { 0.0f / 255.0f, 42.0f / 255.0f, 0.0f / 255.0f, -42.0f / 255.0f }, - { 0.0f / 255.0f, 60.0f / 255.0f, 0.0f / 255.0f, -60.0f / 255.0f }, - { 0.0f / 255.0f, 80.0f / 255.0f, 0.0f / 255.0f, -80.0f / 255.0f }, - { 0.0f / 255.0f, 106.0f / 255.0f, 0.0f / 255.0f, -106.0f / 255.0f }, - { 0.0f / 255.0f, 183.0f / 255.0f, 0.0f / 255.0f, -183.0f / 255.0f } - }; - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RGB8A1::Block4x4Encoding_RGB8A1(void) - { - m_pencodingbitsRGB8 = nullptr; - m_boolOpaque = false; - m_boolTransparent = false; - m_boolPunchThroughPixels = true; - - } - Block4x4Encoding_RGB8A1::~Block4x4Encoding_RGB8A1(void) {} - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_RGB8A1::Encode(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric) - { - - Block4x4Encoding_RGB8::Encode(a_pblockParent, - a_pafrgbaSource, - a_paucEncodingBits, - a_errormetric); - - m_boolOpaque = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE; - m_boolTransparent = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT; - m_boolPunchThroughPixels = a_pblockParent->HasPunchThroughPixels(); - -// for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) -// { -// if (m_pafrgbaSource[uiPixel].fA >= 0.5f) -// { -// m_afDecodedAlphas[uiPixel] = 1.0f; -// } -// else -// { -// m_afDecodedAlphas[uiPixel] = 0.0f; -// } -// } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RGB8A1::Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric 
a_errormetric, - uint16_t iterationCount) - { - - - InitFromEncodingBits_ETC1(a_pblockParent, - a_paucEncodingBits, - a_pafrgbaSource, - a_errormetric, iterationCount); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - // detect if there is a T, H or Planar mode present - int iRed1 = m_pencodingbitsRGB8->differential.red1; - int iDRed2 = m_pencodingbitsRGB8->differential.dred2; - int iRed2 = iRed1 + iDRed2; - - int iGreen1 = m_pencodingbitsRGB8->differential.green1; - int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2; - int iGreen2 = iGreen1 + iDGreen2; - - int iBlue1 = m_pencodingbitsRGB8->differential.blue1; - int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2; - int iBlue2 = iBlue1 + iDBlue2; - - if (iRed2 < 0 || iRed2 > 31) - { - InitFromEncodingBits_T(); - } - else if (iGreen2 < 0 || iGreen2 > 31) - { - InitFromEncodingBits_H(); - } - else if (iBlue2 < 0 || iBlue2 > 31) - { - Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding assuming the encoding is an ETC1 mode. - // if it isn't an ETC1 mode, this will be overwritten later - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, uint16_t iterationCount) - { - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource, - a_errormetric, iterationCount); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; - - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = m_pencodingbitsRGB8->differential.flip; - m_boolOpaque = m_pencodingbitsRGB8->differential.diff; - - int iR2 = m_pencodingbitsRGB8->differential.red1 + m_pencodingbitsRGB8->differential.dred2; - int iG2 = m_pencodingbitsRGB8->differential.green1 + m_pencodingbitsRGB8->differential.dgreen2; - int iB2 = m_pencodingbitsRGB8->differential.blue1 + m_pencodingbitsRGB8->differential.dblue2; - - if (iR2 < 0) - { - iR2 = 0; - } - else if (iR2 > 31) - { - iR2 = 31; - } - - if (iG2 < 0) - { - iG2 = 0; - } - else if (iG2 > 31) - { - iG2 = 31; - } - - if (iB2 < 0) - { - iB2 = 0; - } - else if (iB2 > 31) - { - iB2 = 31; - } - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5(m_pencodingbitsRGB8->differential.red1, m_pencodingbitsRGB8->differential.green1, m_pencodingbitsRGB8->differential.blue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iR2, (unsigned char)iG2, (unsigned char)iB2); - - m_uiCW1 = m_pencodingbitsRGB8->differential.cw1; - m_uiCW2 = m_pencodingbitsRGB8->differential.cw2; - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - Decode_ETC1(); - - CalcBlockError(); - - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if T mode is detected - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits_T(void) - { - m_mode = MODE_T; - - unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) + - m_pencodingbitsRGB8->t.red1b); - unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1; - unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1; - - unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2; - unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2; - unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2; - - m_frgbaColor1 = 
ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db; - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_T(); - - CalcBlockError(); - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding if H mode is detected - // - void Block4x4Encoding_RGB8A1::InitFromEncodingBits_H(void) - { - m_mode = MODE_H; - - unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1; - unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) + - m_pencodingbitsRGB8->h.green1b); - unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) + - (m_pencodingbitsRGB8->h.blue1b << 1) + - m_pencodingbitsRGB8->h.blue1c); - - unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2; - unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) + - m_pencodingbitsRGB8->h.green2b); - unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2; - - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); - - // used to determine the LSB of the CW - unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1); - unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2); - - m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1); - if (uiRGB1 >= uiRGB2) - { - m_uiCW1++; - } - - Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); - - DecodePixels_H(); - - CalcBlockError(); - } - - // ---------------------------------------------------------------------------------------------------- - // for ETC1 modes, set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_RGB8A1::Decode_ETC1(void) - { - - const unsigned int *pauiPixelOrder = m_boolFlip ? s_auiPixelOrderFlip1 : s_auiPixelOrderFlip0; - - for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS; uiPixelOrder++) - { - ColorFloatRGBA *pfrgbaCenter = uiPixelOrder < 8 ? &m_frgbaColor1 : &m_frgbaColor2; - unsigned int uiCW = uiPixelOrder < 8 ? 
m_uiCW1 : m_uiCW2; - - unsigned int uiPixel = pauiPixelOrder[uiPixelOrder]; - - float fDelta; - if (m_boolOpaque) - fDelta = Block4x4Encoding_ETC1::s_aafCwTable[uiCW][m_auiSelectors[uiPixel]]; - else - fDelta = s_aafCwOpaqueUnsetTable[uiCW][m_auiSelectors[uiPixel]]; - - if (m_boolOpaque == false && m_auiSelectors[uiPixel] == TRANSPARENT_SELECTOR) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - //m_afDecodedAlphas[uiPixel] = 0.0f; - } - else - { - m_afrgbaDecodedColors[uiPixel] = (*pfrgbaCenter + fDelta).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - } - - // TODO: this isn't setting alpha - } - - } - - // ---------------------------------------------------------------------------------------------------- - // for T mode, set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_RGB8A1::DecodePixels_T(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1; - //m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 2: - if (m_boolOpaque == false) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - //m_afDecodedAlphas[uiPixel] = 0.0f; - } - else - { - m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2; - //m_afDecodedAlphas[uiPixel] = 1.0f; - } - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - break; - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // for H mode, set the decoded colors and decoded alpha based on the encoding state - // - void Block4x4Encoding_RGB8A1::DecodePixels_H(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - switch (m_auiSelectors[uiPixel]) - { - case 0: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 1: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - break; - - case 2: - if (m_boolOpaque == false) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - //m_afDecodedAlphas[uiPixel] = 0.0f; - } - else - { - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - } - break; - - case 3: - m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); - //m_afDecodedAlphas[uiPixel] = 1.0f; - break; - } - - } - - } - - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - // RGB8A1 can't use individual mode - // RGB8A1 with transparent pixels can't use planar mode - // - void Block4x4Encoding_RGB8A1::PerformIteration(float a_fEffort) - { - if (m_pblockParent->GetSourceAlphaMix() == 
Block4x4::SourceAlphaMix::OPAQUE) - { - PerformIterationOpaque(a_fEffort); - return; - } - else if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT) - { - PerformIterationTransparent(a_fEffort); - return; - } - - assert(!m_boolOpaque); - assert(!m_boolTransparent); - assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - PerformFirstIteration(); - break; - - case 1: - TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - if (a_fEffort <= 39.5f) - { - m_boolDone = true; - } - break; - - case 3: - Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(); - TryT(1); - TryH(1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 4: - TryDegenerates1(); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 5: - TryDegenerates2(); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 6: - TryDegenerates3(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 7: - TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - - SetDoneIfPerfect(); - - } - - // ---------------------------------------------------------------------------------------------------- - // find best initial encoding to ensure block has a valid encoding - // - void Block4x4Encoding_RGB8A1::PerformFirstIteration(void) - { - Block4x4Encoding_ETC1::CalculateMostLikelyFlip(); - - m_fError = FLT_MAX; - - TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - - } - - // ---------------------------------------------------------------------------------------------------- - // mostly copied from ETC1 - // differences: - // Block4x4Encoding_RGB8A1 encodingTry = *this; - // - void Block4x4Encoding_RGB8A1::TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2) - { - - ColorFloatRGBA frgbaColor1; - ColorFloatRGBA frgbaColor2; - - const unsigned int *pauiPixelMapping1; - const unsigned int *pauiPixelMapping2; - - if (a_boolFlip) - { - frgbaColor1 = m_frgbaSourceAverageTop; - frgbaColor2 = m_frgbaSourceAverageBottom; - - pauiPixelMapping1 = s_auiTopPixelMapping; - pauiPixelMapping2 = s_auiBottomPixelMapping; - } - else - { - frgbaColor1 = m_frgbaSourceAverageLeft; - frgbaColor2 = m_frgbaSourceAverageRight; - - pauiPixelMapping1 = s_auiLeftPixelMapping; - pauiPixelMapping2 = s_auiRightPixelMapping; - } - - DifferentialTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, - a_uiRadius, a_iGrayOffset1, a_iGrayOffset2); - - Block4x4Encoding_RGB8A1 encodingTry = *this; - encodingTry.m_boolFlip = a_boolFlip; - - encodingTry.TryDifferentialHalf(&trys.m_half1); - encodingTry.TryDifferentialHalf(&trys.m_half2); - - // find best halves that are within differential range - DifferentialTrys::Try *ptryBest1 = nullptr; - DifferentialTrys::Try *ptryBest2 = nullptr; - encodingTry.m_fError = FLT_MAX; - - // see if the best of each half are in differential range - int iDRed = trys.m_half2.m_ptryBest->m_iRed - trys.m_half1.m_ptryBest->m_iRed; - int iDGreen = trys.m_half2.m_ptryBest->m_iGreen - trys.m_half1.m_ptryBest->m_iGreen; - int iDBlue = trys.m_half2.m_ptryBest->m_iBlue - trys.m_half1.m_ptryBest->m_iBlue; - if (iDRed >= -4 && iDRed <= 3 && iDGreen >= -4 && iDGreen <= 3 && iDBlue >= -4 && iDBlue <= 3) - { - ptryBest1 = 
trys.m_half1.m_ptryBest; - ptryBest2 = trys.m_half2.m_ptryBest; - encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError; - } - else - { - // else, find the next best halves that are in differential range - for (DifferentialTrys::Try *ptry1 = &trys.m_half1.m_atry[0]; - ptry1 < &trys.m_half1.m_atry[trys.m_half1.m_uiTrys]; - ptry1++) - { - for (DifferentialTrys::Try *ptry2 = &trys.m_half2.m_atry[0]; - ptry2 < &trys.m_half2.m_atry[trys.m_half2.m_uiTrys]; - ptry2++) - { - iDRed = ptry2->m_iRed - ptry1->m_iRed; - bool boolValidRedDelta = iDRed <= 3 && iDRed >= -4; - iDGreen = ptry2->m_iGreen - ptry1->m_iGreen; - bool boolValidGreenDelta = iDGreen <= 3 && iDGreen >= -4; - iDBlue = ptry2->m_iBlue - ptry1->m_iBlue; - bool boolValidBlueDelta = iDBlue <= 3 && iDBlue >= -4; - - if (boolValidRedDelta && boolValidGreenDelta && boolValidBlueDelta) - { - float fError = ptry1->m_fError + ptry2->m_fError; - - if (fError < encodingTry.m_fError) - { - encodingTry.m_fError = fError; - - ptryBest1 = ptry1; - ptryBest2 = ptry2; - } - } - - } - } - assert(encodingTry.m_fError < FLT_MAX); - assert(ptryBest1 != nullptr); - assert(ptryBest2 != nullptr); - } - - if (encodingTry.m_fError < m_fError) - { - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = encodingTry.m_boolFlip; - m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue); - m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue); - m_uiCW1 = ptryBest1->m_uiCW; - m_uiCW2 = ptryBest2->m_uiCW; - - m_fError = 0.0f; - for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++) - { - unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder]; - unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder]; - - unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder]; - unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder]; - - m_auiSelectors[uiPixel1] = uiSelector1; - m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder]; - - float alpha1 = 1.0; - float alpha2 = 1.0; - - if (uiSelector1 == TRANSPARENT_SELECTOR) - { - m_afrgbaDecodedColors[uiPixel1] = ColorFloatRGBA(); - //m_afDecodedAlphas[uiPixel1] = 0.0f; - alpha1 = 0.0; - } - else - { - float fDeltaRGB1 = s_aafCwOpaqueUnsetTable[m_uiCW1][uiSelector1]; - m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB(); - //m_afDecodedAlphas[uiPixel1] = 1.0f; - } - - if (uiSelector2 == TRANSPARENT_SELECTOR) - { - m_afrgbaDecodedColors[uiPixel2] = ColorFloatRGBA(); - //m_afDecodedAlphas[uiPixel2] = 0.0f; - alpha2 = 0.0; - } - else - { - float fDeltaRGB2 = s_aafCwOpaqueUnsetTable[m_uiCW2][uiSelector2]; - m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB(); - //m_afDecodedAlphas[uiPixel2] = 1.0f; - } - - float fDeltaA1 =alpha1 - m_pafrgbaSource[uiPixel1].fA; - m_fError += fDeltaA1 * fDeltaA1; - float fDeltaA2 = alpha2 - m_pafrgbaSource[uiPixel2].fA; - m_fError += fDeltaA2 * fDeltaA2; - } - - m_fError1 = ptryBest1->m_fError; - m_fError2 = ptryBest2->m_fError; - m_boolSeverelyBentDifferentialColors = trys.m_boolSeverelyBentColors; - m_fError = m_fError1 + m_fError2; - - // sanity check - { - int iRed1 = m_frgbaColor1.IntRed(31.0f); - int iGreen1 = m_frgbaColor1.IntGreen(31.0f); - int iBlue1 = m_frgbaColor1.IntBlue(31.0f); - - int iRed2 = m_frgbaColor2.IntRed(31.0f); - int iGreen2 = 
m_frgbaColor2.IntGreen(31.0f); - int iBlue2 = m_frgbaColor2.IntBlue(31.0f); - - iDRed = iRed2 - iRed1; - iDGreen = iGreen2 - iGreen1; - iDBlue = iBlue2 - iBlue1; - - assert(iDRed >= -4 && iDRed < 4); - assert(iDGreen >= -4 && iDGreen < 4); - assert(iDBlue >= -4 && iDBlue < 4); - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // mostly copied from ETC1 - // differences: - // uses s_aafCwOpaqueUnsetTable - // color for selector set to 0,0,0,0 - // - void Block4x4Encoding_RGB8A1::TryDifferentialHalf(DifferentialTrys::Half *a_phalf) - { - - a_phalf->m_ptryBest = nullptr; - float fBestTryError = FLT_MAX; - - a_phalf->m_uiTrys = 0; - for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius; - iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius; - iRed++) - { - assert(iRed >= 0 && iRed <= 31); - - for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius; - iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius; - iGreen++) - { - assert(iGreen >= 0 && iGreen <= 31); - - for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius; - iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius; - iBlue++) - { - assert(iBlue >= 0 && iBlue <= 31); - - DifferentialTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys]; - assert(ptry < &a_phalf->m_atry[DifferentialTrys::Half::MAX_TRYS]); - - ptry->m_iRed = iRed; - ptry->m_iGreen = iGreen; - ptry->m_iBlue = iBlue; - ptry->m_fError = FLT_MAX; - ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue); - - // try each CW - for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++) - { - unsigned int auiPixelSelectors[PIXELS / 2]; - ColorFloatRGBA afrgbaDecodedColors[PIXELS / 2]; - float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - - // pre-compute decoded pixels for each selector - ColorFloatRGBA afrgbaSelectors[SELECTORS]; - assert(SELECTORS == 4); - afrgbaSelectors[0] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][0]).ClampRGB(); - afrgbaSelectors[1] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][1]).ClampRGB(); - afrgbaSelectors[2] = ColorFloatRGBA(); - afrgbaSelectors[3] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][3]).ClampRGB(); - - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - int srcPixelIndex = a_phalf->m_pauiPixelMapping[uiPixel]; - const ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[srcPixelIndex]; - ColorFloatRGBA frgbaDecodedPixel; - - for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) - { - if (pfrgbaSourcePixel->fA < 0.5f) - { - uiSelector = TRANSPARENT_SELECTOR; - } - else if (uiSelector == TRANSPARENT_SELECTOR) - { - continue; - } - - frgbaDecodedPixel = afrgbaSelectors[uiSelector]; - - float fPixelError; - - fPixelError = CalcPixelError(frgbaDecodedPixel, srcPixelIndex); - - if (fPixelError < afPixelErrors[uiPixel]) - { - auiPixelSelectors[uiPixel] = uiSelector; - afrgbaDecodedColors[uiPixel] = frgbaDecodedPixel; - afPixelErrors[uiPixel] = fPixelError; - } - - if (uiSelector == TRANSPARENT_SELECTOR) - { - break; - } - } - } - - // add up all pixel errors - float fCWError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - fCWError += afPixelErrors[uiPixel]; - } - - // if best CW so far - if (fCWError < ptry->m_fError) - { - ptry->m_uiCW = uiCW; - for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) - { - ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel]; - } - 
ptry->m_fError = fCWError; - } - - } - - if (ptry->m_fError < fBestTryError) - { - a_phalf->m_ptryBest = ptry; - fBestTryError = ptry->m_fError; - } - - assert(ptry->m_fError < FLT_MAX); - - a_phalf->m_uiTrys++; - } - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in T mode - // save this encoding if it improves the error - // - // since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently - // better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower - // - void Block4x4Encoding_RGB8A1::TryT(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8A1 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_T; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { - iMinRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { - iMinGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { - iMinBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { - iMinRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { - iMinGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { - iMinBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor2_TAndH - // twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector - // - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - } - else - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH; - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = 
encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - } - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) - { - if (uiBaseColorSwaps == 0) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - } - else - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - } - - encodingTry.TryT_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryT - // called on an encodingTry - // - void Block4x4Encoding_RGB8A1::TryT_BestSelectorCombination(void) - { - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = m_frgbaColor1; - afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = ColorFloatRGBA(); - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - // try each selector - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiMinSelector = 0; - unsigned int uiMaxSelector = SELECTORS - 1; - - if (m_pafrgbaSource[uiPixel].fA < 0.5f) - { - uiMinSelector = 2; - uiMaxSelector = 2; - } - - for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++) - { - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], uiPixel); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; 
uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (m_fError > fBlockError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try encoding in H mode - // save this encoding if it improves the error - // - // since all pixels use the distance table, color1 and color2 can NOT be twiddled independently - // TWIDDLE_RADIUS of 2 is WAY too slow - // - void Block4x4Encoding_RGB8A1::TryH(unsigned int a_uiRadius) - { - Block4x4Encoding_RGB8A1 encodingTry = *this; - - // init "try" - { - encodingTry.m_mode = MODE_H; - encodingTry.m_boolDiff = true; - encodingTry.m_boolFlip = false; - encodingTry.m_fError = FLT_MAX; - } - - int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); - int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); - int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); - - int iMinRed1 = iColor1Red - (int)a_uiRadius; - if (iMinRed1 < 0) - { - iMinRed1 = 0; - } - int iMaxRed1 = iColor1Red + (int)a_uiRadius; - if (iMaxRed1 > 15) - { - iMinRed1 = 15; - } - - int iMinGreen1 = iColor1Green - (int)a_uiRadius; - if (iMinGreen1 < 0) - { - iMinGreen1 = 0; - } - int iMaxGreen1 = iColor1Green + (int)a_uiRadius; - if (iMaxGreen1 > 15) - { - iMinGreen1 = 15; - } - - int iMinBlue1 = iColor1Blue - (int)a_uiRadius; - if (iMinBlue1 < 0) - { - iMinBlue1 = 0; - } - int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; - if (iMaxBlue1 > 15) - { - iMinBlue1 = 15; - } - - int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); - int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); - int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); - - int iMinRed2 = iColor2Red - (int)a_uiRadius; - if (iMinRed2 < 0) - { - iMinRed2 = 0; - } - int iMaxRed2 = iColor2Red + (int)a_uiRadius; - if (iMaxRed2 > 15) - { - iMinRed2 = 15; - } - - int iMinGreen2 = iColor2Green - (int)a_uiRadius; - if (iMinGreen2 < 0) - { - iMinGreen2 = 0; - } - int iMaxGreen2 = iColor2Green + (int)a_uiRadius; - if (iMaxGreen2 > 15) - { - iMinGreen2 = 15; - } - - int iMinBlue2 = iColor2Blue - (int)a_uiRadius; - if (iMinBlue2 < 0) - { - iMinBlue2 = 0; - } - int iMaxBlue2 = iColor2Blue + (int)a_uiRadius; - if (iMaxBlue2 > 15) - { - iMinBlue2 = 15; - } - - for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) - { - encodingTry.m_uiCW1 = uiDistance; - - // twiddle m_frgbaOriginalColor1_TAndH - for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) - { - for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) - { - for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) - { - encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); - encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for 
(unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - - // twiddle m_frgbaOriginalColor2_TAndH - for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) - { - for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) - { - for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) - { - encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; - encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); - - // if color1 == color2, H encoding issues can pop up, so abort - if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue) - { - continue; - } - - encodingTry.TryH_BestSelectorCombination(); - - if (encodingTry.m_fError < m_fError) - { - m_mode = encodingTry.m_mode; - m_boolDiff = encodingTry.m_boolDiff; - m_boolFlip = encodingTry.m_boolFlip; - - m_frgbaColor1 = encodingTry.m_frgbaColor1; - m_frgbaColor2 = encodingTry.m_frgbaColor2; - m_uiCW1 = encodingTry.m_uiCW1; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; - } - - m_fError = encodingTry.m_fError; - } - } - } - } - - } - - } - - // ---------------------------------------------------------------------------------------------------- - // find best selector combination for TryH - // called on an encodingTry - // - void Block4x4Encoding_RGB8A1::TryH_BestSelectorCombination(void) - { - - // abort if colors and CW will pose an encoding problem - { - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(255.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(255.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(255.0f); - unsigned int uiColorValue1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(255.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(255.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(255.0f); - unsigned int uiColorValue2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; - - unsigned int uiCWLsb = m_uiCW1 & 1; - - if ((uiColorValue1 >= (uiColorValue2 & uiCWLsb)) == 0 || - (uiColorValue1 < (uiColorValue2 & uiCWLsb)) == 1) - { - return; - } - } - - float fDistance = s_afTHDistanceTable[m_uiCW1]; - - unsigned int auiBestPixelSelectors[PIXELS]; - float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, - FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; - ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; - ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; - - assert(SELECTORS == 4); - afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB(); - afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB(); - afrgbaDecodedPixel[2] = ColorFloatRGBA();; - afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); - - - // try each selector - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - unsigned int uiMinSelector = 0; - unsigned int uiMaxSelector = SELECTORS - 1; - - if (m_pafrgbaSource[uiPixel].fA < 0.5f) - { - uiMinSelector = 2; - uiMaxSelector = 2; - } - - for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++) - 
{ - float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], uiPixel); - - if (fPixelError < afBestPixelErrors[uiPixel]) - { - afBestPixelErrors[uiPixel] = fPixelError; - auiBestPixelSelectors[uiPixel] = uiSelector; - afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; - } - } - } - - - // add up all of the pixel errors - float fBlockError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestPixelErrors[uiPixel]; - } - - if (m_fError > fBlockError) - { - m_fError = fBlockError; - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; - m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; - } - } - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 1 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates1(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -2, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 2, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 0, 2); - TryDifferential(m_boolMostLikelyFlip, 1, 0, -2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 2 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates2(void) - { - - TryDifferential(!m_boolMostLikelyFlip, 1, -2, 0); - TryDifferential(!m_boolMostLikelyFlip, 1, 2, 0); - TryDifferential(!m_boolMostLikelyFlip, 1, 0, 2); - TryDifferential(!m_boolMostLikelyFlip, 1, 0, -2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 3 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates3(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -2, -2); - TryDifferential(m_boolMostLikelyFlip, 1, -2, 2); - TryDifferential(m_boolMostLikelyFlip, 1, 2, -2); - TryDifferential(m_boolMostLikelyFlip, 1, 2, 2); - - } - - // ---------------------------------------------------------------------------------------------------- - // try version 4 of the degenerate search - // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings - // each subsequent version of the degenerate search uses more basecolor movement and is less likely to - // be successfull - // - void Block4x4Encoding_RGB8A1::TryDegenerates4(void) - { - - TryDifferential(m_boolMostLikelyFlip, 1, -4, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 4, 0); - TryDifferential(m_boolMostLikelyFlip, 1, 0, 4); - TryDifferential(m_boolMostLikelyFlip, 1, 0, -4); - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - 
void Block4x4Encoding_RGB8A1::SetEncodingBits(void) - { - switch (m_mode) - { - case MODE_ETC1: - SetEncodingBits_ETC1(); - break; - - case MODE_T: - SetEncodingBits_T(); - break; - - case MODE_H: - SetEncodingBits_H(); - break; - - case MODE_PLANAR: - Block4x4Encoding_RGB8::SetEncodingBits_Planar(); - break; - - default: - assert(false); - } - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state if ETC1 mode - // - void Block4x4Encoding_RGB8A1::SetEncodingBits_ETC1(void) - { - - // there is no individual mode in RGB8A1 - assert(m_boolDiff); - - int iRed1 = m_frgbaColor1.IntRed(31.0f); - int iGreen1 = m_frgbaColor1.IntGreen(31.0f); - int iBlue1 = m_frgbaColor1.IntBlue(31.0f); - - int iRed2 = m_frgbaColor2.IntRed(31.0f); - int iGreen2 = m_frgbaColor2.IntGreen(31.0f); - int iBlue2 = m_frgbaColor2.IntBlue(31.0f); - - int iDRed2 = iRed2 - iRed1; - int iDGreen2 = iGreen2 - iGreen1; - int iDBlue2 = iBlue2 - iBlue1; - - assert(iDRed2 >= -4 && iDRed2 < 4); - assert(iDGreen2 >= -4 && iDGreen2 < 4); - assert(iDBlue2 >= -4 && iDBlue2 < 4); - - m_pencodingbitsRGB8->differential.red1 = iRed1; - m_pencodingbitsRGB8->differential.green1 = iGreen1; - m_pencodingbitsRGB8->differential.blue1 = iBlue1; - - m_pencodingbitsRGB8->differential.dred2 = iDRed2; - m_pencodingbitsRGB8->differential.dgreen2 = iDGreen2; - m_pencodingbitsRGB8->differential.dblue2 = iDBlue2; - - m_pencodingbitsRGB8->individual.cw1 = m_uiCW1; - m_pencodingbitsRGB8->individual.cw2 = m_uiCW2; - - SetEncodingBits_Selectors(); - - // in RGB8A1 encoding bits, opaque replaces differential - m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; - - m_pencodingbitsRGB8->individual.flip = m_boolFlip; - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state if T mode - // - void Block4x4Encoding_RGB8A1::SetEncodingBits_T(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_T); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2; - m_pencodingbitsRGB8->t.red1b = uiRed1; - m_pencodingbitsRGB8->t.green1 = uiGreen1; - m_pencodingbitsRGB8->t.blue1 = uiBlue1; - - m_pencodingbitsRGB8->t.red2 = uiRed2; - m_pencodingbitsRGB8->t.green2 = uiGreen2; - m_pencodingbitsRGB8->t.blue2 = uiBlue2; - - m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1; - m_pencodingbitsRGB8->t.db = m_uiCW1; - - // in RGB8A1 encoding bits, opaque replaces differential - m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - if (iRed2 >= 4) - { - m_pencodingbitsRGB8->t.detect1 = 7; - m_pencodingbitsRGB8->t.detect2 = 0; - } - else - { - m_pencodingbitsRGB8->t.detect1 = 0; - m_pencodingbitsRGB8->t.detect2 = 
1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - - // make sure red overflows - assert(iRed2 < 0 || iRed2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state if H mode - // - // colors and selectors may need to swap in order to generate lsb of distance index - // - void Block4x4Encoding_RGB8A1::SetEncodingBits_H(void) - { - static const bool SANITY_CHECK = true; - - assert(m_mode == MODE_H); - assert(m_boolDiff == true); - - unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); - unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); - unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); - - unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); - unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); - unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); - - unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; - unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; - - bool boolOddDistance = m_uiCW1 & 1; - bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance; - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.red1 = uiRed2; - m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen2; - m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue2; - - m_pencodingbitsRGB8->h.red2 = uiRed1; - m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen1; - m_pencodingbitsRGB8->h.blue2 = uiBlue1; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - else - { - m_pencodingbitsRGB8->h.red1 = uiRed1; - m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1; - m_pencodingbitsRGB8->h.green1b = uiGreen1; - m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3; - m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1; - m_pencodingbitsRGB8->h.blue1c = uiBlue1; - - m_pencodingbitsRGB8->h.red2 = uiRed2; - m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1; - m_pencodingbitsRGB8->h.green2b = uiGreen2; - m_pencodingbitsRGB8->h.blue2 = uiBlue2; - - m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; - m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; - } - - // in RGB8A1 encoding bits, opaque replaces differential - m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; - - Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); - - if (boolSwapColors) - { - m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF; - } - - // create an invalid R differential to trigger T mode - m_pencodingbitsRGB8->h.detect1 = 0; - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 0; - int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; - if (iRed2 < 0 || iRed2 > 31) - { - m_pencodingbitsRGB8->h.detect1 = 1; - } - if (iGreen2 >= 4) - { - m_pencodingbitsRGB8->h.detect2 = 7; - m_pencodingbitsRGB8->h.detect3 = 0; - } - else - { - m_pencodingbitsRGB8->h.detect2 = 0; - m_pencodingbitsRGB8->h.detect3 = 1; - } - - if (SANITY_CHECK) - { - iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; - iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + 
(int)m_pencodingbitsRGB8->differential.dgreen2; - - // make sure red doesn't overflow and green does - assert(iRed2 >= 0 && iRed2 <= 31); - assert(iGreen2 < 0 || iGreen2 > 31); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGB8A1::PerformIterationOpaque(float a_fEffort) - { - assert(!m_boolPunchThroughPixels); - assert(!m_boolTransparent); - assert(!m_boolDone); - - switch (m_uiEncodingIterations) - { - case 0: - PerformFirstIterationOpaque(); - break; - - case 1: - Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 2: - Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); - break; - - case 3: - Block4x4Encoding_RGB8::TryPlanar(1); - break; - - case 4: - Block4x4Encoding_RGB8::TryTAndH(1); - if (a_fEffort <= 49.5f) - { - m_boolDone = true; - } - break; - - case 5: - Block4x4Encoding_ETC1::TryDegenerates1(); - if (a_fEffort <= 59.5f) - { - m_boolDone = true; - } - break; - - case 6: - Block4x4Encoding_ETC1::TryDegenerates2(); - if (a_fEffort <= 69.5f) - { - m_boolDone = true; - } - break; - - case 7: - Block4x4Encoding_ETC1::TryDegenerates3(); - if (a_fEffort <= 79.5f) - { - m_boolDone = true; - } - break; - - case 8: - Block4x4Encoding_ETC1::TryDegenerates4(); - m_boolDone = true; - break; - - default: - assert(0); - break; - } - - m_uiEncodingIterations++; - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // find best initial encoding to ensure block has a valid encoding - // - void Block4x4Encoding_RGB8A1::PerformFirstIterationOpaque(void) - { - - // set decoded alphas - // calculate alpha error - m_fError = 0.0f; - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // m_afDecodedAlphas[uiPixel] = 1.0f; - - float fDeltaA = 1.0f - m_pafrgbaSource[uiPixel].fA; - m_fError += fDeltaA * fDeltaA; - } - - CalculateMostLikelyFlip(); - - m_fError = FLT_MAX; - - Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - Block4x4Encoding_RGB8::TryPlanar(0); - SetDoneIfPerfect(); - if (m_boolDone) - { - return; - } - Block4x4Encoding_RGB8::TryTAndH(0); - SetDoneIfPerfect(); - } - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - void Block4x4Encoding_RGB8A1::PerformIterationTransparent(float ) - { - assert(!m_boolOpaque); - assert(m_boolTransparent); - assert(!m_boolDone); - assert(m_uiEncodingIterations == 0); - - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = false; - - m_uiCW1 = 0; - m_uiCW2 = 0; - - m_frgbaColor1 = ColorFloatRGBA(); - m_frgbaColor2 = ColorFloatRGBA(); - - for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiSelectors[uiPixel] = 
TRANSPARENT_SELECTOR; - - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); - //m_afDecodedAlphas[uiPixel] = 0.0f; - } - - CalcBlockError(); - - m_boolDone = true; - m_uiEncodingIterations++; - - } - - // ---------------------------------------------------------------------------------------------------- - // -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcBlock4x4Encoding_RGB8A1.cpp contains: + Block4x4Encoding_RGB8A1 + Block4x4Encoding_RGB8A1_Opaque + Block4x4Encoding_RGB8A1_Transparent + +These encoders are used when targetting file format RGB8A1. + +Block4x4Encoding_RGB8A1_Opaque is used when all pixels in the 4x4 block are opaque +Block4x4Encoding_RGB8A1_Transparent is used when all pixels in the 4x4 block are transparent +Block4x4Encoding_RGB8A1 is used when there is a mixture of alphas in the 4x4 block + +*/ + +#include "EtcConfig.h" +#include "EtcBlock4x4Encoding_RGB8A1.h" + +#include "EtcBlock4x4.h" +#include "EtcBlock4x4EncodingBits.h" +#include "EtcBlock4x4Encoding_RGB8.h" + +#include +#include +#include + +namespace Etc +{ + + // #################################################################################################### + // Block4x4Encoding_RGB8A1 + // #################################################################################################### + + float Block4x4Encoding_RGB8A1::s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS] = + { + { 0.0f / 255.0f, 8.0f / 255.0f, 0.0f / 255.0f, -8.0f / 255.0f }, + { 0.0f / 255.0f, 17.0f / 255.0f, 0.0f / 255.0f, -17.0f / 255.0f }, + { 0.0f / 255.0f, 29.0f / 255.0f, 0.0f / 255.0f, -29.0f / 255.0f }, + { 0.0f / 255.0f, 42.0f / 255.0f, 0.0f / 255.0f, -42.0f / 255.0f }, + { 0.0f / 255.0f, 60.0f / 255.0f, 0.0f / 255.0f, -60.0f / 255.0f }, + { 0.0f / 255.0f, 80.0f / 255.0f, 0.0f / 255.0f, -80.0f / 255.0f }, + { 0.0f / 255.0f, 106.0f / 255.0f, 0.0f / 255.0f, -106.0f / 255.0f }, + { 0.0f / 255.0f, 183.0f / 255.0f, 0.0f / 255.0f, -183.0f / 255.0f } + }; + + // ---------------------------------------------------------------------------------------------------- + // + Block4x4Encoding_RGB8A1::Block4x4Encoding_RGB8A1(void) + { + m_pencodingbitsRGB8 = nullptr; + m_boolOpaque = false; + m_boolTransparent = false; + m_boolPunchThroughPixels = true; + + } + Block4x4Encoding_RGB8A1::~Block4x4Encoding_RGB8A1(void) {} + // ---------------------------------------------------------------------------------------------------- + // initialization prior to encoding + // a_pblockParent points to the block associated with this encoding + // a_errormetric is used to choose the best encoding + // a_pafrgbaSource points to a 4x4 block subset of the source image + // a_paucEncodingBits points to the final encoding bits + // + void Block4x4Encoding_RGB8A1::Encode(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, + ErrorMetric a_errormetric) + { + + Block4x4Encoding_RGB8::Encode(a_pblockParent, + a_pafrgbaSource, + a_paucEncodingBits, + 
a_errormetric); + + m_boolOpaque = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE; + m_boolTransparent = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT; + m_boolPunchThroughPixels = a_pblockParent->HasPunchThroughPixels(); + +// for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) +// { +// if (m_pafrgbaSource[uiPixel].fA >= 0.5f) +// { +// m_afDecodedAlphas[uiPixel] = 1.0f; +// } +// else +// { +// m_afDecodedAlphas[uiPixel] = 0.0f; +// } +// } + + } + + // ---------------------------------------------------------------------------------------------------- + // initialization from the encoding bits of a previous encoding + // a_pblockParent points to the block associated with this encoding + // a_errormetric is used to choose the best encoding + // a_pafrgbaSource points to a 4x4 block subset of the source image + // a_paucEncodingBits points to the final encoding bits of a previous encoding + // + void Block4x4Encoding_RGB8A1::Decode(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount) + { + + + InitFromEncodingBits_ETC1(a_pblockParent, + a_paucEncodingBits, + a_pafrgbaSource, + a_errormetric, iterationCount); + + m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; + + // detect if there is a T, H or Planar mode present + int iRed1 = m_pencodingbitsRGB8->differential.red1; + int iDRed2 = m_pencodingbitsRGB8->differential.dred2; + int iRed2 = iRed1 + iDRed2; + + int iGreen1 = m_pencodingbitsRGB8->differential.green1; + int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2; + int iGreen2 = iGreen1 + iDGreen2; + + int iBlue1 = m_pencodingbitsRGB8->differential.blue1; + int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2; + int iBlue2 = iBlue1 + iDBlue2; + + if (iRed2 < 0 || iRed2 > 31) + { + InitFromEncodingBits_T(); + } + else if (iGreen2 < 0 || iGreen2 > 31) + { + InitFromEncodingBits_H(); + } + else if (iBlue2 < 0 || iBlue2 > 31) + { + Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // initialization from the encoding bits of a previous encoding assuming the encoding is an ETC1 mode. 
+ // if it isn't an ETC1 mode, this will be overwritten later + // + void Block4x4Encoding_RGB8A1::InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, uint16_t iterationCount) + { + Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource, + a_errormetric, iterationCount); + + m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits; + + m_mode = MODE_ETC1; + m_boolDiff = true; + m_boolFlip = m_pencodingbitsRGB8->differential.flip; + m_boolOpaque = m_pencodingbitsRGB8->differential.diff; + + int iR2 = m_pencodingbitsRGB8->differential.red1 + m_pencodingbitsRGB8->differential.dred2; + int iG2 = m_pencodingbitsRGB8->differential.green1 + m_pencodingbitsRGB8->differential.dgreen2; + int iB2 = m_pencodingbitsRGB8->differential.blue1 + m_pencodingbitsRGB8->differential.dblue2; + + if (iR2 < 0) + { + iR2 = 0; + } + else if (iR2 > 31) + { + iR2 = 31; + } + + if (iG2 < 0) + { + iG2 = 0; + } + else if (iG2 > 31) + { + iG2 = 31; + } + + if (iB2 < 0) + { + iB2 = 0; + } + else if (iB2 > 31) + { + iB2 = 31; + } + + m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5(m_pencodingbitsRGB8->differential.red1, m_pencodingbitsRGB8->differential.green1, m_pencodingbitsRGB8->differential.blue1); + m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iR2, (unsigned char)iG2, (unsigned char)iB2); + + m_uiCW1 = m_pencodingbitsRGB8->differential.cw1; + m_uiCW2 = m_pencodingbitsRGB8->differential.cw2; + + Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); + + Decode_ETC1(); + + CalcBlockError(); + + } + + // ---------------------------------------------------------------------------------------------------- + // initialization from the encoding bits of a previous encoding if T mode is detected + // + void Block4x4Encoding_RGB8A1::InitFromEncodingBits_T(void) + { + m_mode = MODE_T; + + unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) + + m_pencodingbitsRGB8->t.red1b); + unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1; + unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1; + + unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2; + unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2; + unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2; + + m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); + m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); + + m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db; + + Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); + + DecodePixels_T(); + + CalcBlockError(); + } + + // ---------------------------------------------------------------------------------------------------- + // initialization from the encoding bits of a previous encoding if H mode is detected + // + void Block4x4Encoding_RGB8A1::InitFromEncodingBits_H(void) + { + m_mode = MODE_H; + + unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1; + unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) + + m_pencodingbitsRGB8->h.green1b); + unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) + + (m_pencodingbitsRGB8->h.blue1b << 1) + + m_pencodingbitsRGB8->h.blue1c); + + unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2; + unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) + + m_pencodingbitsRGB8->h.green2b); + unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2; + + m_frgbaColor1 = 
ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1); + m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2); + + // used to determine the LSB of the CW + unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1); + unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2); + + m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1); + if (uiRGB1 >= uiRGB2) + { + m_uiCW1++; + } + + Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(); + + DecodePixels_H(); + + CalcBlockError(); + } + + // ---------------------------------------------------------------------------------------------------- + // for ETC1 modes, set the decoded colors and decoded alpha based on the encoding state + // + void Block4x4Encoding_RGB8A1::Decode_ETC1(void) + { + + const unsigned int *pauiPixelOrder = m_boolFlip ? s_auiPixelOrderFlip1 : s_auiPixelOrderFlip0; + + for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS; uiPixelOrder++) + { + ColorFloatRGBA *pfrgbaCenter = uiPixelOrder < 8 ? &m_frgbaColor1 : &m_frgbaColor2; + unsigned int uiCW = uiPixelOrder < 8 ? m_uiCW1 : m_uiCW2; + + unsigned int uiPixel = pauiPixelOrder[uiPixelOrder]; + + float fDelta; + if (m_boolOpaque) + fDelta = Block4x4Encoding_ETC1::s_aafCwTable[uiCW][m_auiSelectors[uiPixel]]; + else + fDelta = s_aafCwOpaqueUnsetTable[uiCW][m_auiSelectors[uiPixel]]; + + if (m_boolOpaque == false && m_auiSelectors[uiPixel] == TRANSPARENT_SELECTOR) + { + m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); + //m_afDecodedAlphas[uiPixel] = 0.0f; + } + else + { + m_afrgbaDecodedColors[uiPixel] = (*pfrgbaCenter + fDelta).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 1.0f; + } + + // TODO: this isn't setting alpha + } + + } + + // ---------------------------------------------------------------------------------------------------- + // for T mode, set the decoded colors and decoded alpha based on the encoding state + // + void Block4x4Encoding_RGB8A1::DecodePixels_T(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + switch (m_auiSelectors[uiPixel]) + { + case 0: + m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1; + //m_afDecodedAlphas[uiPixel] = 1.0f; + break; + + case 1: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 1.0f; + break; + + case 2: + if (m_boolOpaque == false) + { + m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); + //m_afDecodedAlphas[uiPixel] = 0.0f; + } + else + { + m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2; + //m_afDecodedAlphas[uiPixel] = 1.0f; + } + break; + + case 3: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 1.0f; + break; + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // for H mode, set the decoded colors and decoded alpha based on the encoding state + // + void Block4x4Encoding_RGB8A1::DecodePixels_H(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f); + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + switch (m_auiSelectors[uiPixel]) + { + case 0: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 
1.0f; + break; + + case 1: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 1.0f; + break; + + case 2: + if (m_boolOpaque == false) + { + m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); + //m_afDecodedAlphas[uiPixel] = 0.0f; + } + else + { + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 1.0f; + } + break; + + case 3: + m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB(); + //m_afDecodedAlphas[uiPixel] = 1.0f; + break; + } + + } + + } + + + // ---------------------------------------------------------------------------------------------------- + // perform a single encoding iteration + // replace the encoding if a better encoding was found + // subsequent iterations generally take longer for each iteration + // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort + // + // RGB8A1 can't use individual mode + // RGB8A1 with transparent pixels can't use planar mode + // + void Block4x4Encoding_RGB8A1::PerformIteration(float a_fEffort) + { + if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE) + { + PerformIterationOpaque(a_fEffort); + return; + } + else if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT) + { + PerformIterationTransparent(a_fEffort); + return; + } + + assert(!m_boolOpaque); + assert(!m_boolTransparent); + assert(!m_boolDone); + + switch (m_uiEncodingIterations) + { + case 0: + PerformFirstIteration(); + break; + + case 1: + TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); + break; + + case 2: + TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); + if (a_fEffort <= 39.5f) + { + m_boolDone = true; + } + break; + + case 3: + Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(); + TryT(1); + TryH(1); + if (a_fEffort <= 49.5f) + { + m_boolDone = true; + } + break; + + case 4: + TryDegenerates1(); + if (a_fEffort <= 59.5f) + { + m_boolDone = true; + } + break; + + case 5: + TryDegenerates2(); + if (a_fEffort <= 69.5f) + { + m_boolDone = true; + } + break; + + case 6: + TryDegenerates3(); + if (a_fEffort <= 79.5f) + { + m_boolDone = true; + } + break; + + case 7: + TryDegenerates4(); + m_boolDone = true; + break; + + default: + assert(0); + break; + } + + m_uiEncodingIterations++; + + SetDoneIfPerfect(); + + } + + // ---------------------------------------------------------------------------------------------------- + // find best initial encoding to ensure block has a valid encoding + // + void Block4x4Encoding_RGB8A1::PerformFirstIteration(void) + { + Block4x4Encoding_ETC1::CalculateMostLikelyFlip(); + + m_fError = FLT_MAX; + + TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); + SetDoneIfPerfect(); + if (m_boolDone) + { + return; + } + TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); + SetDoneIfPerfect(); + + } + + // ---------------------------------------------------------------------------------------------------- + // mostly copied from ETC1 + // differences: + // Block4x4Encoding_RGB8A1 encodingTry = *this; + // + void Block4x4Encoding_RGB8A1::TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, + int a_iGrayOffset1, int a_iGrayOffset2) + { + + ColorFloatRGBA frgbaColor1; + ColorFloatRGBA frgbaColor2; + + const unsigned int *pauiPixelMapping1; + const unsigned int *pauiPixelMapping2; + + if (a_boolFlip) + { + frgbaColor1 = m_frgbaSourceAverageTop; + frgbaColor2 = m_frgbaSourceAverageBottom; + + pauiPixelMapping1 = s_auiTopPixelMapping; + 
pauiPixelMapping2 = s_auiBottomPixelMapping; + } + else + { + frgbaColor1 = m_frgbaSourceAverageLeft; + frgbaColor2 = m_frgbaSourceAverageRight; + + pauiPixelMapping1 = s_auiLeftPixelMapping; + pauiPixelMapping2 = s_auiRightPixelMapping; + } + + DifferentialTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, + a_uiRadius, a_iGrayOffset1, a_iGrayOffset2); + + Block4x4Encoding_RGB8A1 encodingTry = *this; + encodingTry.m_boolFlip = a_boolFlip; + + encodingTry.TryDifferentialHalf(&trys.m_half1); + encodingTry.TryDifferentialHalf(&trys.m_half2); + + // find best halves that are within differential range + DifferentialTrys::Try *ptryBest1 = nullptr; + DifferentialTrys::Try *ptryBest2 = nullptr; + encodingTry.m_fError = FLT_MAX; + + // see if the best of each half are in differential range + int iDRed = trys.m_half2.m_ptryBest->m_iRed - trys.m_half1.m_ptryBest->m_iRed; + int iDGreen = trys.m_half2.m_ptryBest->m_iGreen - trys.m_half1.m_ptryBest->m_iGreen; + int iDBlue = trys.m_half2.m_ptryBest->m_iBlue - trys.m_half1.m_ptryBest->m_iBlue; + if (iDRed >= -4 && iDRed <= 3 && iDGreen >= -4 && iDGreen <= 3 && iDBlue >= -4 && iDBlue <= 3) + { + ptryBest1 = trys.m_half1.m_ptryBest; + ptryBest2 = trys.m_half2.m_ptryBest; + encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError; + } + else + { + // else, find the next best halves that are in differential range + for (DifferentialTrys::Try *ptry1 = &trys.m_half1.m_atry[0]; + ptry1 < &trys.m_half1.m_atry[trys.m_half1.m_uiTrys]; + ptry1++) + { + for (DifferentialTrys::Try *ptry2 = &trys.m_half2.m_atry[0]; + ptry2 < &trys.m_half2.m_atry[trys.m_half2.m_uiTrys]; + ptry2++) + { + iDRed = ptry2->m_iRed - ptry1->m_iRed; + bool boolValidRedDelta = iDRed <= 3 && iDRed >= -4; + iDGreen = ptry2->m_iGreen - ptry1->m_iGreen; + bool boolValidGreenDelta = iDGreen <= 3 && iDGreen >= -4; + iDBlue = ptry2->m_iBlue - ptry1->m_iBlue; + bool boolValidBlueDelta = iDBlue <= 3 && iDBlue >= -4; + + if (boolValidRedDelta && boolValidGreenDelta && boolValidBlueDelta) + { + float fError = ptry1->m_fError + ptry2->m_fError; + + if (fError < encodingTry.m_fError) + { + encodingTry.m_fError = fError; + + ptryBest1 = ptry1; + ptryBest2 = ptry2; + } + } + + } + } + assert(encodingTry.m_fError < FLT_MAX); + assert(ptryBest1 != nullptr); + assert(ptryBest2 != nullptr); + } + + if (encodingTry.m_fError < m_fError) + { + m_mode = MODE_ETC1; + m_boolDiff = true; + m_boolFlip = encodingTry.m_boolFlip; + m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue); + m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue); + m_uiCW1 = ptryBest1->m_uiCW; + m_uiCW2 = ptryBest2->m_uiCW; + + m_fError = 0.0f; + for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++) + { + unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder]; + unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder]; + + unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder]; + unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder]; + + m_auiSelectors[uiPixel1] = uiSelector1; + m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder]; + + float alpha1 = 1.0; + float alpha2 = 1.0; + + if (uiSelector1 == TRANSPARENT_SELECTOR) + { + m_afrgbaDecodedColors[uiPixel1] = ColorFloatRGBA(); + //m_afDecodedAlphas[uiPixel1] = 0.0f; + alpha1 = 
0.0; + } + else + { + float fDeltaRGB1 = s_aafCwOpaqueUnsetTable[m_uiCW1][uiSelector1]; + m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB(); + //m_afDecodedAlphas[uiPixel1] = 1.0f; + } + + if (uiSelector2 == TRANSPARENT_SELECTOR) + { + m_afrgbaDecodedColors[uiPixel2] = ColorFloatRGBA(); + //m_afDecodedAlphas[uiPixel2] = 0.0f; + alpha2 = 0.0; + } + else + { + float fDeltaRGB2 = s_aafCwOpaqueUnsetTable[m_uiCW2][uiSelector2]; + m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB(); + //m_afDecodedAlphas[uiPixel2] = 1.0f; + } + + float fDeltaA1 =alpha1 - m_pafrgbaSource[uiPixel1].fA; + m_fError += fDeltaA1 * fDeltaA1; + float fDeltaA2 = alpha2 - m_pafrgbaSource[uiPixel2].fA; + m_fError += fDeltaA2 * fDeltaA2; + } + + m_fError1 = ptryBest1->m_fError; + m_fError2 = ptryBest2->m_fError; + m_boolSeverelyBentDifferentialColors = trys.m_boolSeverelyBentColors; + m_fError = m_fError1 + m_fError2; + + // sanity check + { + int iRed1 = m_frgbaColor1.IntRed(31.0f); + int iGreen1 = m_frgbaColor1.IntGreen(31.0f); + int iBlue1 = m_frgbaColor1.IntBlue(31.0f); + + int iRed2 = m_frgbaColor2.IntRed(31.0f); + int iGreen2 = m_frgbaColor2.IntGreen(31.0f); + int iBlue2 = m_frgbaColor2.IntBlue(31.0f); + + iDRed = iRed2 - iRed1; + iDGreen = iGreen2 - iGreen1; + iDBlue = iBlue2 - iBlue1; + + assert(iDRed >= -4 && iDRed < 4); + assert(iDGreen >= -4 && iDGreen < 4); + assert(iDBlue >= -4 && iDBlue < 4); + } + } + + } + + // ---------------------------------------------------------------------------------------------------- + // mostly copied from ETC1 + // differences: + // uses s_aafCwOpaqueUnsetTable + // color for selector set to 0,0,0,0 + // + void Block4x4Encoding_RGB8A1::TryDifferentialHalf(DifferentialTrys::Half *a_phalf) + { + + a_phalf->m_ptryBest = nullptr; + float fBestTryError = FLT_MAX; + + a_phalf->m_uiTrys = 0; + for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius; + iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius; + iRed++) + { + assert(iRed >= 0 && iRed <= 31); + + for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius; + iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius; + iGreen++) + { + assert(iGreen >= 0 && iGreen <= 31); + + for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius; + iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius; + iBlue++) + { + assert(iBlue >= 0 && iBlue <= 31); + + DifferentialTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys]; + assert(ptry < &a_phalf->m_atry[DifferentialTrys::Half::MAX_TRYS]); + + ptry->m_iRed = iRed; + ptry->m_iGreen = iGreen; + ptry->m_iBlue = iBlue; + ptry->m_fError = FLT_MAX; + ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue); + + // try each CW + for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++) + { + unsigned int auiPixelSelectors[PIXELS / 2]; + ColorFloatRGBA afrgbaDecodedColors[PIXELS / 2]; + float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, + FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; + + // pre-compute decoded pixels for each selector + ColorFloatRGBA afrgbaSelectors[SELECTORS]; + assert(SELECTORS == 4); + afrgbaSelectors[0] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][0]).ClampRGB(); + afrgbaSelectors[1] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][1]).ClampRGB(); + afrgbaSelectors[2] = ColorFloatRGBA(); + afrgbaSelectors[3] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][3]).ClampRGB(); + + for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) + { + int srcPixelIndex = 
a_phalf->m_pauiPixelMapping[uiPixel]; + const ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[srcPixelIndex]; + ColorFloatRGBA frgbaDecodedPixel; + + for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++) + { + if (pfrgbaSourcePixel->fA < 0.5f) + { + uiSelector = TRANSPARENT_SELECTOR; + } + else if (uiSelector == TRANSPARENT_SELECTOR) + { + continue; + } + + frgbaDecodedPixel = afrgbaSelectors[uiSelector]; + + float fPixelError; + + fPixelError = CalcPixelError(frgbaDecodedPixel, srcPixelIndex); + + if (fPixelError < afPixelErrors[uiPixel]) + { + auiPixelSelectors[uiPixel] = uiSelector; + afrgbaDecodedColors[uiPixel] = frgbaDecodedPixel; + afPixelErrors[uiPixel] = fPixelError; + } + + if (uiSelector == TRANSPARENT_SELECTOR) + { + break; + } + } + } + + // add up all pixel errors + float fCWError = 0.0f; + for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) + { + fCWError += afPixelErrors[uiPixel]; + } + + // if best CW so far + if (fCWError < ptry->m_fError) + { + ptry->m_uiCW = uiCW; + for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++) + { + ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel]; + } + ptry->m_fError = fCWError; + } + + } + + if (ptry->m_fError < fBestTryError) + { + a_phalf->m_ptryBest = ptry; + fBestTryError = ptry->m_fError; + } + + assert(ptry->m_fError < FLT_MAX); + + a_phalf->m_uiTrys++; + } + } + } + + } + + // ---------------------------------------------------------------------------------------------------- + // try encoding in T mode + // save this encoding if it improves the error + // + // since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently + // better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower + // + void Block4x4Encoding_RGB8A1::TryT(unsigned int a_uiRadius) + { + Block4x4Encoding_RGB8A1 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_T; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + encodingTry.m_fError = FLT_MAX; + } + + int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); + int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); + int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); + + int iMinRed1 = iColor1Red - (int)a_uiRadius; + if (iMinRed1 < 0) + { + iMinRed1 = 0; + } + int iMaxRed1 = iColor1Red + (int)a_uiRadius; + if (iMaxRed1 > 15) + { + iMinRed1 = 15; + } + + int iMinGreen1 = iColor1Green - (int)a_uiRadius; + if (iMinGreen1 < 0) + { + iMinGreen1 = 0; + } + int iMaxGreen1 = iColor1Green + (int)a_uiRadius; + if (iMaxGreen1 > 15) + { + iMinGreen1 = 15; + } + + int iMinBlue1 = iColor1Blue - (int)a_uiRadius; + if (iMinBlue1 < 0) + { + iMinBlue1 = 0; + } + int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; + if (iMaxBlue1 > 15) + { + iMinBlue1 = 15; + } + + int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); + int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); + int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); + + int iMinRed2 = iColor2Red - (int)a_uiRadius; + if (iMinRed2 < 0) + { + iMinRed2 = 0; + } + int iMaxRed2 = iColor2Red + (int)a_uiRadius; + if (iMaxRed2 > 15) + { + iMinRed2 = 15; + } + + int iMinGreen2 = iColor2Green - (int)a_uiRadius; + if (iMinGreen2 < 0) + { + iMinGreen2 = 0; + } + int iMaxGreen2 = iColor2Green + (int)a_uiRadius; + if (iMaxGreen2 > 15) + { + iMinGreen2 = 15; + } + + int iMinBlue2 = iColor2Blue - (int)a_uiRadius; + if (iMinBlue2 < 0) + { + iMinBlue2 = 0; + } + int iMaxBlue2 = iColor2Blue + 
(int)a_uiRadius; + if (iMaxBlue2 > 15) + { + iMinBlue2 = 15; + } + + for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) + { + encodingTry.m_uiCW1 = uiDistance; + + // twiddle m_frgbaOriginalColor2_TAndH + // twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector + // + for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) + { + for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) + { + for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) + { + for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) + { + if (uiBaseColorSwaps == 0) + { + encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; + encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); + } + else + { + encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); + encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH; + } + + encodingTry.TryT_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + } + } + } + } + } + + // twiddle m_frgbaOriginalColor1_TAndH + for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) + { + for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) + { + for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) + { + for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++) + { + if (uiBaseColorSwaps == 0) + { + encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); + encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; + } + else + { + encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH; + encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); + } + + encodingTry.TryT_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + } + } + } + } + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // find best selector combination for TryT + // called on an encodingTry + // + void Block4x4Encoding_RGB8A1::TryT_BestSelectorCombination(void) + { + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + + unsigned int auiBestPixelSelectors[PIXELS]; + float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, + FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, 
FLT_MAX, FLT_MAX }; + ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; + ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; + + assert(SELECTORS == 4); + afrgbaDecodedPixel[0] = m_frgbaColor1; + afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB(); + afrgbaDecodedPixel[2] = ColorFloatRGBA(); + afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); + + // try each selector + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + unsigned int uiMinSelector = 0; + unsigned int uiMaxSelector = SELECTORS - 1; + + if (m_pafrgbaSource[uiPixel].fA < 0.5f) + { + uiMinSelector = 2; + uiMaxSelector = 2; + } + + for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++) + { + float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], uiPixel); + + if (fPixelError < afBestPixelErrors[uiPixel]) + { + afBestPixelErrors[uiPixel] = fPixelError; + auiBestPixelSelectors[uiPixel] = uiSelector; + afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; + } + } + } + + + // add up all of the pixel errors + float fBlockError = 0.0f; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + fBlockError += afBestPixelErrors[uiPixel]; + } + + if (m_fError > fBlockError) + { + m_fError = fBlockError; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; + } + } + + } + + // ---------------------------------------------------------------------------------------------------- + // try encoding in H mode + // save this encoding if it improves the error + // + // since all pixels use the distance table, color1 and color2 can NOT be twiddled independently + // TWIDDLE_RADIUS of 2 is WAY too slow + // + void Block4x4Encoding_RGB8A1::TryH(unsigned int a_uiRadius) + { + Block4x4Encoding_RGB8A1 encodingTry = *this; + + // init "try" + { + encodingTry.m_mode = MODE_H; + encodingTry.m_boolDiff = true; + encodingTry.m_boolFlip = false; + encodingTry.m_fError = FLT_MAX; + } + + int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f); + int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f); + int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f); + + int iMinRed1 = iColor1Red - (int)a_uiRadius; + if (iMinRed1 < 0) + { + iMinRed1 = 0; + } + int iMaxRed1 = iColor1Red + (int)a_uiRadius; + if (iMaxRed1 > 15) + { + iMinRed1 = 15; + } + + int iMinGreen1 = iColor1Green - (int)a_uiRadius; + if (iMinGreen1 < 0) + { + iMinGreen1 = 0; + } + int iMaxGreen1 = iColor1Green + (int)a_uiRadius; + if (iMaxGreen1 > 15) + { + iMinGreen1 = 15; + } + + int iMinBlue1 = iColor1Blue - (int)a_uiRadius; + if (iMinBlue1 < 0) + { + iMinBlue1 = 0; + } + int iMaxBlue1 = iColor1Blue + (int)a_uiRadius; + if (iMaxBlue1 > 15) + { + iMinBlue1 = 15; + } + + int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f); + int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f); + int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f); + + int iMinRed2 = iColor2Red - (int)a_uiRadius; + if (iMinRed2 < 0) + { + iMinRed2 = 0; + } + int iMaxRed2 = iColor2Red + (int)a_uiRadius; + if (iMaxRed2 > 15) + { + iMinRed2 = 15; + } + + int iMinGreen2 = iColor2Green - (int)a_uiRadius; + if (iMinGreen2 < 0) + { + iMinGreen2 = 0; + } + int iMaxGreen2 = iColor2Green + (int)a_uiRadius; + if (iMaxGreen2 > 15) + { + iMinGreen2 = 15; + } + + int iMinBlue2 = iColor2Blue - (int)a_uiRadius; + if (iMinBlue2 < 0) + { + iMinBlue2 = 0; + } + int 
iMaxBlue2 = iColor2Blue + (int)a_uiRadius; + if (iMaxBlue2 > 15) + { + iMinBlue2 = 15; + } + + for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++) + { + encodingTry.m_uiCW1 = uiDistance; + + // twiddle m_frgbaOriginalColor1_TAndH + for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++) + { + for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++) + { + for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++) + { + encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1); + encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH; + + // if color1 == color2, H encoding issues can pop up, so abort + if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue) + { + continue; + } + + encodingTry.TryH_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + } + } + } + } + + // twiddle m_frgbaOriginalColor2_TAndH + for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++) + { + for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++) + { + for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++) + { + encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH; + encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2); + + // if color1 == color2, H encoding issues can pop up, so abort + if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue) + { + continue; + } + + encodingTry.TryH_BestSelectorCombination(); + + if (encodingTry.m_fError < m_fError) + { + m_mode = encodingTry.m_mode; + m_boolDiff = encodingTry.m_boolDiff; + m_boolFlip = encodingTry.m_boolFlip; + + m_frgbaColor1 = encodingTry.m_frgbaColor1; + m_frgbaColor2 = encodingTry.m_frgbaColor2; + m_uiCW1 = encodingTry.m_uiCW1; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel]; + } + + m_fError = encodingTry.m_fError; + } + } + } + } + + } + + } + + // ---------------------------------------------------------------------------------------------------- + // find best selector combination for TryH + // called on an encodingTry + // + void Block4x4Encoding_RGB8A1::TryH_BestSelectorCombination(void) + { + + // abort if colors and CW will pose an encoding problem + { + unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(255.0f); + unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(255.0f); + unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(255.0f); + unsigned int uiColorValue1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; + + unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(255.0f); + unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(255.0f); + unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(255.0f); + unsigned int uiColorValue2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; + + unsigned int uiCWLsb = m_uiCW1 & 1; + + if 
((uiColorValue1 >= (uiColorValue2 & uiCWLsb)) == 0 || + (uiColorValue1 < (uiColorValue2 & uiCWLsb)) == 1) + { + return; + } + } + + float fDistance = s_afTHDistanceTable[m_uiCW1]; + + unsigned int auiBestPixelSelectors[PIXELS]; + float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, + FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; + ColorFloatRGBA afrgbaBestDecodedPixels[PIXELS]; + ColorFloatRGBA afrgbaDecodedPixel[SELECTORS]; + + assert(SELECTORS == 4); + afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB(); + afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB(); + afrgbaDecodedPixel[2] = ColorFloatRGBA();; + afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB(); + + + // try each selector + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + unsigned int uiMinSelector = 0; + unsigned int uiMaxSelector = SELECTORS - 1; + + if (m_pafrgbaSource[uiPixel].fA < 0.5f) + { + uiMinSelector = 2; + uiMaxSelector = 2; + } + + for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++) + { + float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], uiPixel); + + if (fPixelError < afBestPixelErrors[uiPixel]) + { + afBestPixelErrors[uiPixel] = fPixelError; + auiBestPixelSelectors[uiPixel] = uiSelector; + afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector]; + } + } + } + + + // add up all of the pixel errors + float fBlockError = 0.0f; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + fBlockError += afBestPixelErrors[uiPixel]; + } + + if (m_fError > fBlockError) + { + m_fError = fBlockError; + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel]; + m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel]; + } + } + + } + + // ---------------------------------------------------------------------------------------------------- + // try version 1 of the degenerate search + // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings + // each subsequent version of the degenerate search uses more basecolor movement and is less likely to + // be successfull + // + void Block4x4Encoding_RGB8A1::TryDegenerates1(void) + { + + TryDifferential(m_boolMostLikelyFlip, 1, -2, 0); + TryDifferential(m_boolMostLikelyFlip, 1, 2, 0); + TryDifferential(m_boolMostLikelyFlip, 1, 0, 2); + TryDifferential(m_boolMostLikelyFlip, 1, 0, -2); + + } + + // ---------------------------------------------------------------------------------------------------- + // try version 2 of the degenerate search + // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings + // each subsequent version of the degenerate search uses more basecolor movement and is less likely to + // be successfull + // + void Block4x4Encoding_RGB8A1::TryDegenerates2(void) + { + + TryDifferential(!m_boolMostLikelyFlip, 1, -2, 0); + TryDifferential(!m_boolMostLikelyFlip, 1, 2, 0); + TryDifferential(!m_boolMostLikelyFlip, 1, 0, 2); + TryDifferential(!m_boolMostLikelyFlip, 1, 0, -2); + + } + + // ---------------------------------------------------------------------------------------------------- + // try version 3 of the degenerate search + // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings + // each subsequent version of the degenerate search uses more basecolor 
movement and is less likely to + // be successfull + // + void Block4x4Encoding_RGB8A1::TryDegenerates3(void) + { + + TryDifferential(m_boolMostLikelyFlip, 1, -2, -2); + TryDifferential(m_boolMostLikelyFlip, 1, -2, 2); + TryDifferential(m_boolMostLikelyFlip, 1, 2, -2); + TryDifferential(m_boolMostLikelyFlip, 1, 2, 2); + + } + + // ---------------------------------------------------------------------------------------------------- + // try version 4 of the degenerate search + // degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings + // each subsequent version of the degenerate search uses more basecolor movement and is less likely to + // be successfull + // + void Block4x4Encoding_RGB8A1::TryDegenerates4(void) + { + + TryDifferential(m_boolMostLikelyFlip, 1, -4, 0); + TryDifferential(m_boolMostLikelyFlip, 1, 4, 0); + TryDifferential(m_boolMostLikelyFlip, 1, 0, 4); + TryDifferential(m_boolMostLikelyFlip, 1, 0, -4); + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state + // + void Block4x4Encoding_RGB8A1::SetEncodingBits(void) + { + switch (m_mode) + { + case MODE_ETC1: + SetEncodingBits_ETC1(); + break; + + case MODE_T: + SetEncodingBits_T(); + break; + + case MODE_H: + SetEncodingBits_H(); + break; + + case MODE_PLANAR: + Block4x4Encoding_RGB8::SetEncodingBits_Planar(); + break; + + default: + assert(false); + } + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state if ETC1 mode + // + void Block4x4Encoding_RGB8A1::SetEncodingBits_ETC1(void) + { + + // there is no individual mode in RGB8A1 + assert(m_boolDiff); + + int iRed1 = m_frgbaColor1.IntRed(31.0f); + int iGreen1 = m_frgbaColor1.IntGreen(31.0f); + int iBlue1 = m_frgbaColor1.IntBlue(31.0f); + + int iRed2 = m_frgbaColor2.IntRed(31.0f); + int iGreen2 = m_frgbaColor2.IntGreen(31.0f); + int iBlue2 = m_frgbaColor2.IntBlue(31.0f); + + int iDRed2 = iRed2 - iRed1; + int iDGreen2 = iGreen2 - iGreen1; + int iDBlue2 = iBlue2 - iBlue1; + + assert(iDRed2 >= -4 && iDRed2 < 4); + assert(iDGreen2 >= -4 && iDGreen2 < 4); + assert(iDBlue2 >= -4 && iDBlue2 < 4); + + m_pencodingbitsRGB8->differential.red1 = iRed1; + m_pencodingbitsRGB8->differential.green1 = iGreen1; + m_pencodingbitsRGB8->differential.blue1 = iBlue1; + + m_pencodingbitsRGB8->differential.dred2 = iDRed2; + m_pencodingbitsRGB8->differential.dgreen2 = iDGreen2; + m_pencodingbitsRGB8->differential.dblue2 = iDBlue2; + + m_pencodingbitsRGB8->individual.cw1 = m_uiCW1; + m_pencodingbitsRGB8->individual.cw2 = m_uiCW2; + + SetEncodingBits_Selectors(); + + // in RGB8A1 encoding bits, opaque replaces differential + m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; + + m_pencodingbitsRGB8->individual.flip = m_boolFlip; + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state if T mode + // + void Block4x4Encoding_RGB8A1::SetEncodingBits_T(void) + { + static const bool SANITY_CHECK = true; + + assert(m_mode == MODE_T); + assert(m_boolDiff == true); + + unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); + unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); + unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); + + unsigned int uiRed2 = (unsigned 
int)m_frgbaColor2.IntRed(15.0f); + unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); + unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); + + m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2; + m_pencodingbitsRGB8->t.red1b = uiRed1; + m_pencodingbitsRGB8->t.green1 = uiGreen1; + m_pencodingbitsRGB8->t.blue1 = uiBlue1; + + m_pencodingbitsRGB8->t.red2 = uiRed2; + m_pencodingbitsRGB8->t.green2 = uiGreen2; + m_pencodingbitsRGB8->t.blue2 = uiBlue2; + + m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1; + m_pencodingbitsRGB8->t.db = m_uiCW1; + + // in RGB8A1 encoding bits, opaque replaces differential + m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; + + Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); + + // create an invalid R differential to trigger T mode + m_pencodingbitsRGB8->t.detect1 = 0; + m_pencodingbitsRGB8->t.detect2 = 0; + int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + if (iRed2 >= 4) + { + m_pencodingbitsRGB8->t.detect1 = 7; + m_pencodingbitsRGB8->t.detect2 = 0; + } + else + { + m_pencodingbitsRGB8->t.detect1 = 0; + m_pencodingbitsRGB8->t.detect2 = 1; + } + + if (SANITY_CHECK) + { + iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + + // make sure red overflows + assert(iRed2 < 0 || iRed2 > 31); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state if H mode + // + // colors and selectors may need to swap in order to generate lsb of distance index + // + void Block4x4Encoding_RGB8A1::SetEncodingBits_H(void) + { + static const bool SANITY_CHECK = true; + + assert(m_mode == MODE_H); + assert(m_boolDiff == true); + + unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f); + unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f); + unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f); + + unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f); + unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f); + unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f); + + unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1; + unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2; + + bool boolOddDistance = m_uiCW1 & 1; + bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance; + + if (boolSwapColors) + { + m_pencodingbitsRGB8->h.red1 = uiRed2; + m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1; + m_pencodingbitsRGB8->h.green1b = uiGreen2; + m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3; + m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1; + m_pencodingbitsRGB8->h.blue1c = uiBlue2; + + m_pencodingbitsRGB8->h.red2 = uiRed1; + m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1; + m_pencodingbitsRGB8->h.green2b = uiGreen1; + m_pencodingbitsRGB8->h.blue2 = uiBlue1; + + m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2; + m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; + } + else + { + m_pencodingbitsRGB8->h.red1 = uiRed1; + m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1; + m_pencodingbitsRGB8->h.green1b = uiGreen1; + m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3; + m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1; + m_pencodingbitsRGB8->h.blue1c = uiBlue1; + + m_pencodingbitsRGB8->h.red2 = uiRed2; + m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1; + m_pencodingbitsRGB8->h.green2b = uiGreen2; + m_pencodingbitsRGB8->h.blue2 = uiBlue2; + + m_pencodingbitsRGB8->h.da = 
m_uiCW1 >> 2; + m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1; + } + + // in RGB8A1 encoding bits, opaque replaces differential + m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels; + + Block4x4Encoding_ETC1::SetEncodingBits_Selectors(); + + if (boolSwapColors) + { + m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF; + } + + // create an invalid R differential to trigger T mode + m_pencodingbitsRGB8->h.detect1 = 0; + m_pencodingbitsRGB8->h.detect2 = 0; + m_pencodingbitsRGB8->h.detect3 = 0; + int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; + if (iRed2 < 0 || iRed2 > 31) + { + m_pencodingbitsRGB8->h.detect1 = 1; + } + if (iGreen2 >= 4) + { + m_pencodingbitsRGB8->h.detect2 = 7; + m_pencodingbitsRGB8->h.detect3 = 0; + } + else + { + m_pencodingbitsRGB8->h.detect2 = 0; + m_pencodingbitsRGB8->h.detect3 = 1; + } + + if (SANITY_CHECK) + { + iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2; + iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2; + + // make sure red doesn't overflow and green does + assert(iRed2 >= 0 && iRed2 <= 31); + assert(iGreen2 < 0 || iGreen2 > 31); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // perform a single encoding iteration + // replace the encoding if a better encoding was found + // subsequent iterations generally take longer for each iteration + // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort + // + void Block4x4Encoding_RGB8A1::PerformIterationOpaque(float a_fEffort) + { + assert(!m_boolPunchThroughPixels); + assert(!m_boolTransparent); + assert(!m_boolDone); + + switch (m_uiEncodingIterations) + { + case 0: + PerformFirstIterationOpaque(); + break; + + case 1: + Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0); + break; + + case 2: + Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0); + break; + + case 3: + Block4x4Encoding_RGB8::TryPlanar(1); + break; + + case 4: + Block4x4Encoding_RGB8::TryTAndH(1); + if (a_fEffort <= 49.5f) + { + m_boolDone = true; + } + break; + + case 5: + Block4x4Encoding_ETC1::TryDegenerates1(); + if (a_fEffort <= 59.5f) + { + m_boolDone = true; + } + break; + + case 6: + Block4x4Encoding_ETC1::TryDegenerates2(); + if (a_fEffort <= 69.5f) + { + m_boolDone = true; + } + break; + + case 7: + Block4x4Encoding_ETC1::TryDegenerates3(); + if (a_fEffort <= 79.5f) + { + m_boolDone = true; + } + break; + + case 8: + Block4x4Encoding_ETC1::TryDegenerates4(); + m_boolDone = true; + break; + + default: + assert(0); + break; + } + + m_uiEncodingIterations++; + SetDoneIfPerfect(); + } + + // ---------------------------------------------------------------------------------------------------- + // find best initial encoding to ensure block has a valid encoding + // + void Block4x4Encoding_RGB8A1::PerformFirstIterationOpaque(void) + { + + // set decoded alphas + // calculate alpha error + m_fError = 0.0f; + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + // m_afDecodedAlphas[uiPixel] = 1.0f; + + float fDeltaA = 1.0f - m_pafrgbaSource[uiPixel].fA; + m_fError += fDeltaA * fDeltaA; + } + + CalculateMostLikelyFlip(); + + m_fError = FLT_MAX; + + Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 0, 0, 0); + 
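		// After each candidate encoding in this first opaque pass (differential with either
		// flip, planar, then T/H), SetDoneIfPerfect() is checked so the search can stop as
		// soon as a zero-error encoding is found instead of trying the remaining modes.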
SetDoneIfPerfect(); + if (m_boolDone) + { + return; + } + Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0); + SetDoneIfPerfect(); + if (m_boolDone) + { + return; + } + Block4x4Encoding_RGB8::TryPlanar(0); + SetDoneIfPerfect(); + if (m_boolDone) + { + return; + } + Block4x4Encoding_RGB8::TryTAndH(0); + SetDoneIfPerfect(); + } + + // ---------------------------------------------------------------------------------------------------- + // perform a single encoding iteration + // replace the encoding if a better encoding was found + // subsequent iterations generally take longer for each iteration + // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort + // + void Block4x4Encoding_RGB8A1::PerformIterationTransparent(float ) + { + assert(!m_boolOpaque); + assert(m_boolTransparent); + assert(!m_boolDone); + assert(m_uiEncodingIterations == 0); + + m_mode = MODE_ETC1; + m_boolDiff = true; + m_boolFlip = false; + + m_uiCW1 = 0; + m_uiCW2 = 0; + + m_frgbaColor1 = ColorFloatRGBA(); + m_frgbaColor2 = ColorFloatRGBA(); + + for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiSelectors[uiPixel] = TRANSPARENT_SELECTOR; + + m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); + //m_afDecodedAlphas[uiPixel] = 0.0f; + } + + CalcBlockError(); + + m_boolDone = true; + m_uiEncodingIterations++; + + } + + // ---------------------------------------------------------------------------------------------------- + // +} diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.h b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.h index f6b31bad..05e57417 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.h +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8A1.h @@ -1,140 +1,140 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcBlock4x4Encoding_RGB8.h" -#include "EtcErrorMetric.h" -#include "EtcBlock4x4EncodingBits.h" - -namespace Etc -{ - - // ################################################################################ - // Block4x4Encoding_RGB8A1 - // RGB8A1 if not completely opaque or transparent - // ################################################################################ - - class Block4x4Encoding_RGB8A1 : public Block4x4Encoding_RGB8 - { - public: - - static const unsigned int TRANSPARENT_SELECTOR = 2; - - Block4x4Encoding_RGB8A1(void); - virtual ~Block4x4Encoding_RGB8A1(void); - - virtual void Encode(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - ErrorMetric a_errormetric) override; - - virtual void Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount) override; - - virtual void PerformIteration(float a_fEffort) override; - - virtual void SetEncodingBits(void) override; - - void InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, uint16_t iterationCount); - - void InitFromEncodingBits_T(void); - void InitFromEncodingBits_H(void); - - void PerformFirstIteration(void); - - void Decode_ETC1(void); - void DecodePixels_T(void); - void DecodePixels_H(void); - void SetEncodingBits_ETC1(void); - void SetEncodingBits_T(void); - void SetEncodingBits_H(void); - - private: - - bool m_boolOpaque; // all source pixels have alpha >= 0.5 - bool m_boolTransparent; // all source pixels have alpha < 0.5 - bool m_boolPunchThroughPixels; // some source pixels have alpha < 0.5 - - // pulled from ETC1 - static const unsigned int CW_BITS = 3; - static const unsigned int CW_RANGES = 1 << CW_BITS; - - static const unsigned int SELECTOR_BITS = 2; - static const unsigned int SELECTORS = 1 << SELECTOR_BITS; - - static float s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS]; - - private: - - void TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2); - void TryDifferentialHalf(DifferentialTrys::Half *a_phalf); - - void TryT(unsigned int a_uiRadius); - void TryT_BestSelectorCombination(void); - void TryH(unsigned int a_uiRadius); - void TryH_BestSelectorCombination(void); - - void TryDegenerates1(void); - void TryDegenerates2(void); - void TryDegenerates3(void); - void TryDegenerates4(void); - - void PerformIterationOpaque(float a_fEffort); - void PerformFirstIterationOpaque(void); - void PerformIterationTransparent(float); - }; - -// // ################################################################################ -// // Block4x4Encoding_RGB8A1_Opaque -// // RGB8A1 if all pixels have alpha==1 -// // ################################################################################ -// -// class Block4x4Encoding_RGB8A1_Opaque : public Block4x4Encoding_RGB8A1 -// { -// public: -// -// virtual void PerformIteration(float a_fEffort); -// -// void PerformFirstIteration(void); -// -// private: -// -// }; -// -// // ################################################################################ -// // Block4x4Encoding_RGB8A1_Transparent -// // RGB8A1 if all pixels have alpha==0 -// // ################################################################################ -// -// class Block4x4Encoding_RGB8A1_Transparent : public Block4x4Encoding_RGB8A1 -// { -// public: -// -// virtual 
void PerformIteration(float a_fEffort); -// -// private: -// -// }; - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "EtcBlock4x4Encoding_RGB8.h" +#include "EtcErrorMetric.h" +#include "EtcBlock4x4EncodingBits.h" + +namespace Etc +{ + + // ################################################################################ + // Block4x4Encoding_RGB8A1 + // RGB8A1 if not completely opaque or transparent + // ################################################################################ + + class Block4x4Encoding_RGB8A1 : public Block4x4Encoding_RGB8 + { + public: + + static const unsigned int TRANSPARENT_SELECTOR = 2; + + Block4x4Encoding_RGB8A1(void); + virtual ~Block4x4Encoding_RGB8A1(void); + + virtual void Encode(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, + ErrorMetric a_errormetric) override; + + virtual void Decode(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount) override; + + virtual void PerformIteration(float a_fEffort) override; + + virtual void SetEncodingBits(void) override; + + void InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, uint16_t iterationCount); + + void InitFromEncodingBits_T(void); + void InitFromEncodingBits_H(void); + + void PerformFirstIteration(void); + + void Decode_ETC1(void); + void DecodePixels_T(void); + void DecodePixels_H(void); + void SetEncodingBits_ETC1(void); + void SetEncodingBits_T(void); + void SetEncodingBits_H(void); + + private: + + bool m_boolOpaque; // all source pixels have alpha >= 0.5 + bool m_boolTransparent; // all source pixels have alpha < 0.5 + bool m_boolPunchThroughPixels; // some source pixels have alpha < 0.5 + + // pulled from ETC1 + static const unsigned int CW_BITS = 3; + static const unsigned int CW_RANGES = 1 << CW_BITS; + + static const unsigned int SELECTOR_BITS = 2; + static const unsigned int SELECTORS = 1 << SELECTOR_BITS; + + static float s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS]; + + private: + + void TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, + int a_iGrayOffset1, int a_iGrayOffset2); + void TryDifferentialHalf(DifferentialTrys::Half *a_phalf); + + void TryT(unsigned int a_uiRadius); + void TryT_BestSelectorCombination(void); + void TryH(unsigned int a_uiRadius); + void TryH_BestSelectorCombination(void); + + void TryDegenerates1(void); + void TryDegenerates2(void); + void TryDegenerates3(void); + void TryDegenerates4(void); + + void PerformIterationOpaque(float a_fEffort); + void PerformFirstIterationOpaque(void); + void PerformIterationTransparent(float); + }; + +// // ################################################################################ +// // Block4x4Encoding_RGB8A1_Opaque +// // RGB8A1 if all pixels have 
alpha==1 +// // ################################################################################ +// +// class Block4x4Encoding_RGB8A1_Opaque : public Block4x4Encoding_RGB8A1 +// { +// public: +// +// virtual void PerformIteration(float a_fEffort); +// +// void PerformFirstIteration(void); +// +// private: +// +// }; +// +// // ################################################################################ +// // Block4x4Encoding_RGB8A1_Transparent +// // RGB8A1 if all pixels have alpha==0 +// // ################################################################################ +// +// class Block4x4Encoding_RGB8A1_Transparent : public Block4x4Encoding_RGB8A1 +// { +// public: +// +// virtual void PerformIteration(float a_fEffort); +// +// private: +// +// }; + +} // namespace Etc diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp index ea0a2427..33b08271 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp @@ -1,556 +1,556 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -EtcBlock4x4Encoding_RGBA8.cpp contains: - Block4x4Encoding_RGBA8 - Block4x4Encoding_RGBA8_Opaque - Block4x4Encoding_RGBA8_Transparent - -These encoders are used when targetting file format RGBA8. - -Block4x4Encoding_RGBA8_Opaque is used when all pixels in the 4x4 block are opaque -Block4x4Encoding_RGBA8_Transparent is used when all pixels in the 4x4 block are transparent -Block4x4Encoding_RGBA8 is used when there is a mixture of alphas in the 4x4 block - -*/ - -#include "EtcConfig.h" -#include "EtcBlock4x4Encoding_RGBA8.h" - -#include "EtcBlock4x4EncodingBits.h" -#include "EtcBlock4x4.h" - -#include -#include -#include -#include -#include -//#include - -namespace Etc -{ - template - T clamp(T value, T mn, T mx) { - return (value <= mn) ? mn : ((value >= mx) ? 
mx : value); - } - - // #################################################################################################### - // Block4x4Encoding_RGBA8 - // #################################################################################################### - - static const unsigned int MODIFIER_TABLE_ENTRYS = 16; - static const unsigned int ALPHA_SELECTOR_BITS = 3; - static const unsigned int ALPHA_SELECTORS = 1 << ALPHA_SELECTOR_BITS; - - // same selector table used for R11/G11/A8 - static const int8_t s_aafModifierTable8[MODIFIER_TABLE_ENTRYS][ALPHA_SELECTORS] - { - { -3, -6, -9, -15, 2, 5, 8, 14 }, - { -3, -7, -10, -13, 2, 6, 9, 12 }, - { -2, -5, -8, -13, 1, 4, 7, 12 }, - { -2, -4, -6, -13, 1, 3, 5, 12 }, - - { -3, -6, -8, -12, 2, 5, 7, 11 }, - { -3, -7, -9, -11, 2, 6, 8, 10 }, - { -4, -7, -8, -11, 3, 6, 7, 10 }, - { -3, -5, -8, -11, 2, 4, 7, 10 }, - - { -2, -6, -8, -10, 1, 5, 7, 9 }, - { -2, -5, -8, -10, 1, 4, 7, 9 }, - { -2, -4, -8, -10, 1, 3, 7, 9 }, - { -2, -5, -7, -10, 1, 4, 6, 9 }, - - { -3, -4, -7, -10, 2, 3, 6, 9 }, - { -1, -2, -3, -10, 0, 1, 2, 9 }, - { -4, -6, -8, -9, 3, 5, 7, 8 }, - { -3, -5, -7, -9, 2, 4, 6, 8 } - }; - - inline float DecodePixelAlpha(float a_fBase, float a_fMultiplier, - unsigned int a_uiTableIndex, unsigned int a_uiSelector) - { - float fPixelAlpha = (a_fBase + - a_fMultiplier * s_aafModifierTable8[a_uiTableIndex][a_uiSelector]) / 255.0f; - if (fPixelAlpha < 0.0f) - { - fPixelAlpha = 0.0f; - } - else if (fPixelAlpha > 1.0f) - { - fPixelAlpha = 1.0f; - } - - return fPixelAlpha; - } - - inline int DecodePixelAlphaInt(int a_fBase, int a_fMultiplier, - unsigned int a_uiTableIndex, unsigned int a_uiSelector) - { - int fPixelAlpha = a_fBase + - a_fMultiplier * s_aafModifierTable8[a_uiTableIndex][a_uiSelector]; - - return clamp(fPixelAlpha, 0, 255); - } - - - - Block4x4Encoding_A8::Block4x4Encoding_A8(void) - { - m_pencodingbitsA8 = nullptr; - m_pafrgbaSource = nullptr; - } - - Block4x4Encoding_A8::~Block4x4Encoding_A8(void) {} - - void Block4x4Encoding_A8::Encode(const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - Block4x4::SourceAlphaMix sourceAlphaMix) - { - m_pafrgbaSource = a_pafrgbaSource; - - m_boolDone = false; - - // really only care about error for one iteration - //m_fError = FLT_MAX; - - m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; - - if (sourceAlphaMix == Block4x4::SourceAlphaMix::OPAQUE) - { - // set the A8 portion - m_fBase = 255; - m_uiModifierTableIndex = 15; - m_fMultiplier = 15; - - // set all selectors to 7 (all bits set) - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiAlphaSelectors[uiPixel] = 7; - } - - m_boolDone = true; - } - else if ((sourceAlphaMix == Block4x4::SourceAlphaMix::ALL_ZERO_ALPHA) || - (sourceAlphaMix == Block4x4::SourceAlphaMix::TRANSPARENT)) - { - // set the A8 portion - m_fBase = 0; - m_uiModifierTableIndex = 0; - m_fMultiplier = 1; - - // set all selectors to 0 - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiAlphaSelectors[uiPixel] = 0; - } - - m_boolDone = true; - } - } - - // A8 always finished in one iterations, but error metrics on rgb iteration may need the alpha values - // in an error metric. Skip this if alpha not part of the metric. 
- void Block4x4Encoding_A8::Decode(unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource) - { - // Note: this is really just decoding to write this exact same data out - - m_pafrgbaSource = a_pafrgbaSource; // don't really need to hold this - m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; - - m_fBase = m_pencodingbitsA8->data.base; - m_fMultiplier = m_pencodingbitsA8->data.multiplier; - m_uiModifierTableIndex = m_pencodingbitsA8->data.table; - - uint64_t ulliSelectorBits = 0; - ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors0 << (uint64_t)40; - ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors1 << (uint64_t)32; - ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors2 << (uint64_t)24; - ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors3 << (uint64_t)16; - ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors4 << (uint64_t)8; - ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors5; - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - uint64_t uiShift = 45 - (3 * uiPixel); - m_auiAlphaSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (uint64_t)(ALPHA_SELECTORS - 1); - } - - //Encode(a_pafrgbaSource, a_paucEncodingBits, sourceAlphaMix); - - // no iteration on A8, it's all done in after first PerformIteration - m_boolDone = true; - - // no error calc since this doesn't iterate, it's already resolved alpha - } - - void Block4x4Encoding_A8::DecodeAlpha(float* decodedPixels) - { -// m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; -// -// m_fBase = m_pencodingbitsA8->data.base; -// m_fMultiplier = m_pencodingbitsA8->data.multiplier; -// m_uiModifierTableIndex = m_pencodingbitsA8->data.table; -// -// uint64_t ulliSelectorBits = 0; -// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors0 << (uint64_t)40; -// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors1 << (uint64_t)32; -// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors2 << (uint64_t)24; -// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors3 << (uint64_t)16; -// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors4 << (uint64_t)8; -// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors5; -// -// for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) -// { -// uint64_t uiShift = 45 - (3 * uiPixel); -// m_auiAlphaSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (uint64_t)(ALPHA_SELECTORS - 1); -// } - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - // this is float version of decode - float pixel = DecodePixelAlpha(m_fBase, m_fMultiplier, - m_uiModifierTableIndex, - m_auiAlphaSelectors[uiPixel]); - - decodedPixels[4 * uiPixel] = pixel; - } - } - - void Block4x4Encoding_A8::PerformIteration(float a_fEffort) - { - if (m_boolDone) - { - return; - } - - - // 0, 1, 2 pixel radius all done in iteration 0, only - // rgb is iterated on over multiple passes. - if (a_fEffort < 24.9f) - { - CalculateA8(0); - } - else if (a_fEffort < 49.9f) - { - CalculateA8(1); - } - else - { - CalculateA8(2); - } - - m_boolDone = true; - } - - void Block4x4Encoding_A8::CalculateA8(int a_fRadius) - { - float m_fError = FLT_MAX; - - // This code is similiar to CalculateR11. And it's all very slow doing brute force - // searches over a large nested for loop space. 
- uint8_t srcAlpha[PIXELS]; - - // find min/max alpha - int fMinAlpha = 255; - int fMaxAlpha = 0; - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int fAlpha = (int)roundf(255.0f * m_pafrgbaSource[uiPixel].fA); - if (fAlpha < fMinAlpha) - { - fMinAlpha = fAlpha; - } - if (fAlpha > fMaxAlpha) - { - fMaxAlpha = fAlpha; - } - - srcAlpha[uiPixel] = fAlpha; - } - - assert(fMinAlpha >= 0); - assert(fMaxAlpha <= 255); - assert(fMinAlpha <= fMaxAlpha); - - int fAlphaRange = fMaxAlpha - fMinAlpha; - - // fast path if range 0 (constant alpha), no point in all this iteration - if (fAlphaRange == 0) - { - a_fRadius = 0; - } - - // try each modifier table entry - //m_fError = FLT_MAX; // artificially high value - for (int uiTableEntry = 0; uiTableEntry < (int)MODIFIER_TABLE_ENTRYS; uiTableEntry++) - { - static const unsigned int MIN_VALUE_SELECTOR = 3; - static const unsigned int MAX_VALUE_SELECTOR = 7; - - int fTableEntryCenter = -s_aafModifierTable8[uiTableEntry][MIN_VALUE_SELECTOR]; - - int fTableEntryRange = s_aafModifierTable8[uiTableEntry][MAX_VALUE_SELECTOR] - - s_aafModifierTable8[uiTableEntry][MIN_VALUE_SELECTOR]; - - float fCenterRatio = fTableEntryCenter / (float)fTableEntryRange; - - int fCenterInt = (int)roundf(fMinAlpha + fCenterRatio * fAlphaRange); - //int fCenterInt = roundf(fCenter); - - int fMinBase = fCenterInt - a_fRadius; - int fMaxBase = fCenterInt + a_fRadius; - - if (fMinBase < 0) - { - fMinBase = 0; - } - if (fMaxBase > 255) - { - fMaxBase = 255; - } - - // 255 range / usp to 29 - int fRangeMultiplier = (int)roundf(fAlphaRange / (float)fTableEntryRange); - - int fMinMultiplier = clamp(fRangeMultiplier - a_fRadius, 1, 15); // no 0 case like on R11 - int fMaxMultiplier = clamp(fRangeMultiplier + a_fRadius, 1, 15); - - int auiBestSelectors[PIXELS]; - int afBestAlphaError[PIXELS]; - int afBestDecodedAlphas[PIXELS]; - - for (int fBase = fMinBase; fBase <= fMaxBase; fBase++) - { - for (int fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier++) - { - // find best selector for each pixel - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - int fBestPixelAlphaError = 255 * 255; - for (int uiSelector = 0; uiSelector < (int)ALPHA_SELECTORS; uiSelector++) - { - int fDecodedAlpha = DecodePixelAlphaInt(fBase, fMultiplier, uiTableEntry, uiSelector); - - // pixelError = dA ^ 2 - int fPixelDeltaAlpha = fDecodedAlpha - (int)srcAlpha[uiPixel]; - int fPixelAlphaError = fPixelDeltaAlpha * fPixelDeltaAlpha; - - if (fPixelAlphaError < fBestPixelAlphaError) - { - fBestPixelAlphaError = fPixelAlphaError; - auiBestSelectors[uiPixel] = uiSelector; - afBestAlphaError[uiPixel] = fBestPixelAlphaError; - afBestDecodedAlphas[uiPixel] = fDecodedAlpha; - } - } - } - - // accumlate pixel error into block error, sum(da^2) - int fBlockError = 0; - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - fBlockError += afBestAlphaError[uiPixel]; - } - - if (m_fError > (float)fBlockError) - { - m_fError = (float)fBlockError; - - m_fBase = fBase; - m_fMultiplier = fMultiplier; - m_uiModifierTableIndex = uiTableEntry; - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_auiAlphaSelectors[uiPixel] = auiBestSelectors[uiPixel]; - - //m_afDecodedAlphas[uiPixel] = afBestDecodedAlphas[uiPixel] / 255.0f; - } - - // stop the iteration if tolerance is low enough - const int kErrorTolerance = 16 * 1 * 1; - if (fBlockError <= kErrorTolerance) { - return; - } - } - } - } - - } - - } - - // 
---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_A8::SetEncodingBits(void) - { - // set the A8 portion - m_pencodingbitsA8->data.base = (uint8_t)roundf(/*255.0f * */ m_fBase); - m_pencodingbitsA8->data.table = m_uiModifierTableIndex; - m_pencodingbitsA8->data.multiplier = (uint8_t)roundf(m_fMultiplier); - - uint64_t ulliSelectorBits = 0; - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - uint64_t uiShift = 45 - (3 * uiPixel); - ulliSelectorBits |= ((uint64_t)m_auiAlphaSelectors[uiPixel]) << uiShift; - } - - m_pencodingbitsA8->data.selectors0 = uint32_t(ulliSelectorBits >> (uint64_t)40); - m_pencodingbitsA8->data.selectors1 = uint32_t(ulliSelectorBits >> (uint64_t)32); - m_pencodingbitsA8->data.selectors2 = uint32_t(ulliSelectorBits >> (uint64_t)24); - m_pencodingbitsA8->data.selectors3 = uint32_t(ulliSelectorBits >> (uint64_t)16); - m_pencodingbitsA8->data.selectors4 = uint32_t(ulliSelectorBits >> (uint64_t)8); - m_pencodingbitsA8->data.selectors5 = uint32_t(ulliSelectorBits); - } - - - // ---------------------------------------------------------------------------------------------------- - // - Block4x4Encoding_RGBA8::Block4x4Encoding_RGBA8(void) - { - } - Block4x4Encoding_RGBA8::~Block4x4Encoding_RGBA8(void) {} - - // ---------------------------------------------------------------------------------------------------- - // initialization prior to encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits - // - void Block4x4Encoding_RGBA8::Encode(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) - { - Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource, a_errormetric, 0); - - // RGB stored after A8 block - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + 8); - - // Only need alpha channel passed down - m_alpha.Encode(a_pafrgbaSource, a_paucEncodingBits, a_pblockParent->GetSourceAlphaMix()); - } - - // ---------------------------------------------------------------------------------------------------- - // initialization from the encoding bits of a previous encoding - // a_pblockParent points to the block associated with this encoding - // a_errormetric is used to choose the best encoding - // a_pafrgbaSource points to a 4x4 block subset of the source image - // a_paucEncodingBits points to the final encoding bits of a previous encoding - // - void Block4x4Encoding_RGBA8::Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount) - { - // this won't iterate, but alpha values available for error calc - // but not using alpha in error calc anymore, so doing after RGB8 decode - m_alpha.Decode(a_paucEncodingBits, a_pafrgbaSource); - - m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + 8); - - // init RGB portion - Block4x4Encoding_RGB8::Decode(a_pblockParent, - (unsigned char *) m_pencodingbitsRGB8, - a_pafrgbaSource, - a_errormetric, - iterationCount); - } - - void Block4x4Encoding_RGBA8::DecodeAlpha() - { - // API hack toe be able to fill in the decodedPixels from the already Decode called alpha - // this is so regular 
Decode path doesn't do this decode and slow down multipass - m_alpha.DecodeAlpha(&m_afrgbaDecodedColors[0].fA); - } - - - // ---------------------------------------------------------------------------------------------------- - // perform a single encoding iteration - // replace the encoding if a better encoding was found - // subsequent iterations generally take longer for each iteration - // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort - // - // similar to Block4x4Encoding_RGB8_Base::Encode_RGB8(), but with alpha added - // - void Block4x4Encoding_RGBA8::PerformIteration(float a_fEffort) - { - // return if color and alpha done, note alpha only iterates on 0 - if (m_boolDone && m_alpha.IsDone() ) - { - return; - } - - if (m_uiEncodingIterations == 0) - { - m_alpha.PerformIteration(a_fEffort); - - /* TODO: can only do this if color if encoding premul color - but kram already knocks out all the color channels in this cae - - // this skips writing out color too - if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT) - { - m_mode = MODE_ETC1; - m_boolDiff = true; - m_boolFlip = false; - - // none of these were cleared, like RGBA1 case - m_uiCW1 = 0; - m_uiCW2 = 0; - - m_frgbaColor1 = ColorFloatRGBA(); - m_frgbaColor2 = ColorFloatRGBA(); - - for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) - { - m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); // assumes rgb also 0 - //m_afDecodedAlphas[uiPixel] = 0.0f; - } - - m_fError = 0.0f; - - // skip processing rgb - m_boolDone = true; - //m_uiEncodingIterations++; - } - */ - } - - if (!m_boolDone) - { - Block4x4Encoding_RGB8::PerformIteration(a_fEffort); - } - - } - - // ---------------------------------------------------------------------------------------------------- - // set the encoding bits based on encoding state - // - void Block4x4Encoding_RGBA8::SetEncodingBits(void) - { - // set the RGB8 portion - Block4x4Encoding_RGB8::SetEncodingBits(); - - m_alpha.SetEncodingBits(); - } -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcBlock4x4Encoding_RGBA8.cpp contains: + Block4x4Encoding_RGBA8 + Block4x4Encoding_RGBA8_Opaque + Block4x4Encoding_RGBA8_Transparent + +These encoders are used when targetting file format RGBA8. + +Block4x4Encoding_RGBA8_Opaque is used when all pixels in the 4x4 block are opaque +Block4x4Encoding_RGBA8_Transparent is used when all pixels in the 4x4 block are transparent +Block4x4Encoding_RGBA8 is used when there is a mixture of alphas in the 4x4 block + +*/ + +#include "EtcConfig.h" +#include "EtcBlock4x4Encoding_RGBA8.h" + +#include "EtcBlock4x4EncodingBits.h" +#include "EtcBlock4x4.h" + +#include +#include +#include +#include +#include +//#include + +namespace Etc +{ + template + T clamp(T value, T mn, T mx) { + return (value <= mn) ? mn : ((value >= mx) ? 
mx : value); + } + + // #################################################################################################### + // Block4x4Encoding_RGBA8 + // #################################################################################################### + + static const unsigned int MODIFIER_TABLE_ENTRYS = 16; + static const unsigned int ALPHA_SELECTOR_BITS = 3; + static const unsigned int ALPHA_SELECTORS = 1 << ALPHA_SELECTOR_BITS; + + // same selector table used for R11/G11/A8 + static const int8_t s_aafModifierTable8[MODIFIER_TABLE_ENTRYS][ALPHA_SELECTORS] + { + { -3, -6, -9, -15, 2, 5, 8, 14 }, + { -3, -7, -10, -13, 2, 6, 9, 12 }, + { -2, -5, -8, -13, 1, 4, 7, 12 }, + { -2, -4, -6, -13, 1, 3, 5, 12 }, + + { -3, -6, -8, -12, 2, 5, 7, 11 }, + { -3, -7, -9, -11, 2, 6, 8, 10 }, + { -4, -7, -8, -11, 3, 6, 7, 10 }, + { -3, -5, -8, -11, 2, 4, 7, 10 }, + + { -2, -6, -8, -10, 1, 5, 7, 9 }, + { -2, -5, -8, -10, 1, 4, 7, 9 }, + { -2, -4, -8, -10, 1, 3, 7, 9 }, + { -2, -5, -7, -10, 1, 4, 6, 9 }, + + { -3, -4, -7, -10, 2, 3, 6, 9 }, + { -1, -2, -3, -10, 0, 1, 2, 9 }, + { -4, -6, -8, -9, 3, 5, 7, 8 }, + { -3, -5, -7, -9, 2, 4, 6, 8 } + }; + + inline float DecodePixelAlpha(float a_fBase, float a_fMultiplier, + unsigned int a_uiTableIndex, unsigned int a_uiSelector) + { + float fPixelAlpha = (a_fBase + + a_fMultiplier * s_aafModifierTable8[a_uiTableIndex][a_uiSelector]) / 255.0f; + if (fPixelAlpha < 0.0f) + { + fPixelAlpha = 0.0f; + } + else if (fPixelAlpha > 1.0f) + { + fPixelAlpha = 1.0f; + } + + return fPixelAlpha; + } + + inline int DecodePixelAlphaInt(int a_fBase, int a_fMultiplier, + unsigned int a_uiTableIndex, unsigned int a_uiSelector) + { + int fPixelAlpha = a_fBase + + a_fMultiplier * s_aafModifierTable8[a_uiTableIndex][a_uiSelector]; + + return clamp(fPixelAlpha, 0, 255); + } + + + + Block4x4Encoding_A8::Block4x4Encoding_A8(void) + { + m_pencodingbitsA8 = nullptr; + m_pafrgbaSource = nullptr; + } + + Block4x4Encoding_A8::~Block4x4Encoding_A8(void) {} + + void Block4x4Encoding_A8::Encode(const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, + Block4x4::SourceAlphaMix sourceAlphaMix) + { + m_pafrgbaSource = a_pafrgbaSource; + + m_boolDone = false; + + // really only care about error for one iteration + //m_fError = FLT_MAX; + + m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; + + if (sourceAlphaMix == Block4x4::SourceAlphaMix::OPAQUE) + { + // set the A8 portion + m_fBase = 255; + m_uiModifierTableIndex = 15; + m_fMultiplier = 15; + + // set all selectors to 7 (all bits set) + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiAlphaSelectors[uiPixel] = 7; + } + + m_boolDone = true; + } + else if ((sourceAlphaMix == Block4x4::SourceAlphaMix::ALL_ZERO_ALPHA) || + (sourceAlphaMix == Block4x4::SourceAlphaMix::TRANSPARENT)) + { + // set the A8 portion + m_fBase = 0; + m_uiModifierTableIndex = 0; + m_fMultiplier = 1; + + // set all selectors to 0 + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiAlphaSelectors[uiPixel] = 0; + } + + m_boolDone = true; + } + } + + // A8 always finished in one iterations, but error metrics on rgb iteration may need the alpha values + // in an error metric. Skip this if alpha not part of the metric. 
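The alpha math used throughout this encoder reduces to one small formula. The standalone sketch below is an editorial illustration, not part of the patch (names are illustrative); it shows how a single pixel's alpha is reconstructed from an EAC A8 block, matching DecodePixelAlphaInt() and the selector unpacking done in Decode() below.

#include <algorithm>
#include <cstdint>

// One EAC A8 block: an 8-bit base, a 4-bit multiplier, a 4-bit modifier-table index, and a
// 3-bit selector per pixel packed MSB-first into 48 bits (pixel i sits at bit 45 - 3*i).
static int DecodeA8Pixel(uint64_t selectorBits, int pixel, int base, int multiplier,
                         const int8_t modifierRow[8]) // one row of the 16x8 modifier table
{
    int selector = int(selectorBits >> (45 - 3 * pixel)) & 0x7;
    int alpha = base + multiplier * modifierRow[selector];
    return std::min(255, std::max(0, alpha)); // same clamp as DecodePixelAlphaInt()
}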
+ void Block4x4Encoding_A8::Decode(unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource) + { + // Note: this is really just decoding to write this exact same data out + + m_pafrgbaSource = a_pafrgbaSource; // don't really need to hold this + m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; + + m_fBase = m_pencodingbitsA8->data.base; + m_fMultiplier = m_pencodingbitsA8->data.multiplier; + m_uiModifierTableIndex = m_pencodingbitsA8->data.table; + + uint64_t ulliSelectorBits = 0; + ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors0 << (uint64_t)40; + ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors1 << (uint64_t)32; + ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors2 << (uint64_t)24; + ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors3 << (uint64_t)16; + ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors4 << (uint64_t)8; + ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors5; + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + uint64_t uiShift = 45 - (3 * uiPixel); + m_auiAlphaSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (uint64_t)(ALPHA_SELECTORS - 1); + } + + //Encode(a_pafrgbaSource, a_paucEncodingBits, sourceAlphaMix); + + // no iteration on A8, it's all done in after first PerformIteration + m_boolDone = true; + + // no error calc since this doesn't iterate, it's already resolved alpha + } + + void Block4x4Encoding_A8::DecodeAlpha(float* decodedPixels) + { +// m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits; +// +// m_fBase = m_pencodingbitsA8->data.base; +// m_fMultiplier = m_pencodingbitsA8->data.multiplier; +// m_uiModifierTableIndex = m_pencodingbitsA8->data.table; +// +// uint64_t ulliSelectorBits = 0; +// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors0 << (uint64_t)40; +// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors1 << (uint64_t)32; +// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors2 << (uint64_t)24; +// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors3 << (uint64_t)16; +// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors4 << (uint64_t)8; +// ulliSelectorBits |= (uint64_t)m_pencodingbitsA8->data.selectors5; +// +// for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) +// { +// uint64_t uiShift = 45 - (3 * uiPixel); +// m_auiAlphaSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (uint64_t)(ALPHA_SELECTORS - 1); +// } + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + // this is float version of decode + float pixel = DecodePixelAlpha(m_fBase, m_fMultiplier, + m_uiModifierTableIndex, + m_auiAlphaSelectors[uiPixel]); + + decodedPixels[4 * uiPixel] = pixel; + } + } + + void Block4x4Encoding_A8::PerformIteration(float a_fEffort) + { + if (m_boolDone) + { + return; + } + + + // 0, 1, 2 pixel radius all done in iteration 0, only + // rgb is iterated on over multiple passes. + if (a_fEffort < 24.9f) + { + CalculateA8(0); + } + else if (a_fEffort < 49.9f) + { + CalculateA8(1); + } + else + { + CalculateA8(2); + } + + m_boolDone = true; + } + + void Block4x4Encoding_A8::CalculateA8(int a_fRadius) + { + float m_fError = FLT_MAX; + + // This code is similiar to CalculateR11. And it's all very slow doing brute force + // searches over a large nested for loop space. 
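		// Scope of the brute-force search below: 16 modifier-table entries; a base within
		// +/- a_fRadius of a center derived from the block's min/max alpha; a multiplier
		// within +/- a_fRadius of the range ratio, clamped to 1..15 (radius is 0, 1 or 2
		// depending on effort). For each (table, base, multiplier) candidate every pixel
		// picks the best of its 8 selectors, the squared alpha deltas are summed, and the
		// lowest-error candidate is kept; the search exits early once the block error
		// falls below a small tolerance.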
+ uint8_t srcAlpha[PIXELS]; + + // find min/max alpha + int fMinAlpha = 255; + int fMaxAlpha = 0; + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int fAlpha = (int)roundf(255.0f * m_pafrgbaSource[uiPixel].fA); + if (fAlpha < fMinAlpha) + { + fMinAlpha = fAlpha; + } + if (fAlpha > fMaxAlpha) + { + fMaxAlpha = fAlpha; + } + + srcAlpha[uiPixel] = fAlpha; + } + + assert(fMinAlpha >= 0); + assert(fMaxAlpha <= 255); + assert(fMinAlpha <= fMaxAlpha); + + int fAlphaRange = fMaxAlpha - fMinAlpha; + + // fast path if range 0 (constant alpha), no point in all this iteration + if (fAlphaRange == 0) + { + a_fRadius = 0; + } + + // try each modifier table entry + //m_fError = FLT_MAX; // artificially high value + for (int uiTableEntry = 0; uiTableEntry < (int)MODIFIER_TABLE_ENTRYS; uiTableEntry++) + { + static const unsigned int MIN_VALUE_SELECTOR = 3; + static const unsigned int MAX_VALUE_SELECTOR = 7; + + int fTableEntryCenter = -s_aafModifierTable8[uiTableEntry][MIN_VALUE_SELECTOR]; + + int fTableEntryRange = s_aafModifierTable8[uiTableEntry][MAX_VALUE_SELECTOR] - + s_aafModifierTable8[uiTableEntry][MIN_VALUE_SELECTOR]; + + float fCenterRatio = fTableEntryCenter / (float)fTableEntryRange; + + int fCenterInt = (int)roundf(fMinAlpha + fCenterRatio * fAlphaRange); + //int fCenterInt = roundf(fCenter); + + int fMinBase = fCenterInt - a_fRadius; + int fMaxBase = fCenterInt + a_fRadius; + + if (fMinBase < 0) + { + fMinBase = 0; + } + if (fMaxBase > 255) + { + fMaxBase = 255; + } + + // 255 range / usp to 29 + int fRangeMultiplier = (int)roundf(fAlphaRange / (float)fTableEntryRange); + + int fMinMultiplier = clamp(fRangeMultiplier - a_fRadius, 1, 15); // no 0 case like on R11 + int fMaxMultiplier = clamp(fRangeMultiplier + a_fRadius, 1, 15); + + int auiBestSelectors[PIXELS]; + int afBestAlphaError[PIXELS]; + int afBestDecodedAlphas[PIXELS]; + + for (int fBase = fMinBase; fBase <= fMaxBase; fBase++) + { + for (int fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier++) + { + // find best selector for each pixel + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + int fBestPixelAlphaError = 255 * 255; + for (int uiSelector = 0; uiSelector < (int)ALPHA_SELECTORS; uiSelector++) + { + int fDecodedAlpha = DecodePixelAlphaInt(fBase, fMultiplier, uiTableEntry, uiSelector); + + // pixelError = dA ^ 2 + int fPixelDeltaAlpha = fDecodedAlpha - (int)srcAlpha[uiPixel]; + int fPixelAlphaError = fPixelDeltaAlpha * fPixelDeltaAlpha; + + if (fPixelAlphaError < fBestPixelAlphaError) + { + fBestPixelAlphaError = fPixelAlphaError; + auiBestSelectors[uiPixel] = uiSelector; + afBestAlphaError[uiPixel] = fBestPixelAlphaError; + afBestDecodedAlphas[uiPixel] = fDecodedAlpha; + } + } + } + + // accumlate pixel error into block error, sum(da^2) + int fBlockError = 0; + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + fBlockError += afBestAlphaError[uiPixel]; + } + + if (m_fError > (float)fBlockError) + { + m_fError = (float)fBlockError; + + m_fBase = fBase; + m_fMultiplier = fMultiplier; + m_uiModifierTableIndex = uiTableEntry; + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_auiAlphaSelectors[uiPixel] = auiBestSelectors[uiPixel]; + + //m_afDecodedAlphas[uiPixel] = afBestDecodedAlphas[uiPixel] / 255.0f; + } + + // stop the iteration if tolerance is low enough + const int kErrorTolerance = 16 * 1 * 1; + if (fBlockError <= kErrorTolerance) { + return; + } + } + } + } + + } + + } + + // 
---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state + // + void Block4x4Encoding_A8::SetEncodingBits(void) + { + // set the A8 portion + m_pencodingbitsA8->data.base = (uint8_t)roundf(/*255.0f * */ m_fBase); + m_pencodingbitsA8->data.table = m_uiModifierTableIndex; + m_pencodingbitsA8->data.multiplier = (uint8_t)roundf(m_fMultiplier); + + uint64_t ulliSelectorBits = 0; + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + uint64_t uiShift = 45 - (3 * uiPixel); + ulliSelectorBits |= ((uint64_t)m_auiAlphaSelectors[uiPixel]) << uiShift; + } + + m_pencodingbitsA8->data.selectors0 = uint32_t(ulliSelectorBits >> (uint64_t)40); + m_pencodingbitsA8->data.selectors1 = uint32_t(ulliSelectorBits >> (uint64_t)32); + m_pencodingbitsA8->data.selectors2 = uint32_t(ulliSelectorBits >> (uint64_t)24); + m_pencodingbitsA8->data.selectors3 = uint32_t(ulliSelectorBits >> (uint64_t)16); + m_pencodingbitsA8->data.selectors4 = uint32_t(ulliSelectorBits >> (uint64_t)8); + m_pencodingbitsA8->data.selectors5 = uint32_t(ulliSelectorBits); + } + + + // ---------------------------------------------------------------------------------------------------- + // + Block4x4Encoding_RGBA8::Block4x4Encoding_RGBA8(void) + { + } + Block4x4Encoding_RGBA8::~Block4x4Encoding_RGBA8(void) {} + + // ---------------------------------------------------------------------------------------------------- + // initialization prior to encoding + // a_pblockParent points to the block associated with this encoding + // a_errormetric is used to choose the best encoding + // a_pafrgbaSource points to a 4x4 block subset of the source image + // a_paucEncodingBits points to the final encoding bits + // + void Block4x4Encoding_RGBA8::Encode(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) + { + Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource, a_errormetric, 0); + + // RGB stored after A8 block + m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + 8); + + // Only need alpha channel passed down + m_alpha.Encode(a_pafrgbaSource, a_paucEncodingBits, a_pblockParent->GetSourceAlphaMix()); + } + + // ---------------------------------------------------------------------------------------------------- + // initialization from the encoding bits of a previous encoding + // a_pblockParent points to the block associated with this encoding + // a_errormetric is used to choose the best encoding + // a_pafrgbaSource points to a 4x4 block subset of the source image + // a_paucEncodingBits points to the final encoding bits of a previous encoding + // + void Block4x4Encoding_RGBA8::Decode(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount) + { + // this won't iterate, but alpha values available for error calc + // but not using alpha in error calc anymore, so doing after RGB8 decode + m_alpha.Decode(a_paucEncodingBits, a_pafrgbaSource); + + m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + 8); + + // init RGB portion + Block4x4Encoding_RGB8::Decode(a_pblockParent, + (unsigned char *) m_pencodingbitsRGB8, + a_pafrgbaSource, + a_errormetric, + iterationCount); + } + + void Block4x4Encoding_RGBA8::DecodeAlpha() + { + // API hack toe be able to fill in the decodedPixels from the already Decode called alpha + // this is so regular 
Decode path doesn't do this decode and slow down multipass + m_alpha.DecodeAlpha(&m_afrgbaDecodedColors[0].fA); + } + + + // ---------------------------------------------------------------------------------------------------- + // perform a single encoding iteration + // replace the encoding if a better encoding was found + // subsequent iterations generally take longer for each iteration + // set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort + // + // similar to Block4x4Encoding_RGB8_Base::Encode_RGB8(), but with alpha added + // + void Block4x4Encoding_RGBA8::PerformIteration(float a_fEffort) + { + // return if color and alpha done, note alpha only iterates on 0 + if (m_boolDone && m_alpha.IsDone() ) + { + return; + } + + if (m_uiEncodingIterations == 0) + { + m_alpha.PerformIteration(a_fEffort); + + /* TODO: can only do this if color if encoding premul color + but kram already knocks out all the color channels in this cae + + // this skips writing out color too + if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT) + { + m_mode = MODE_ETC1; + m_boolDiff = true; + m_boolFlip = false; + + // none of these were cleared, like RGBA1 case + m_uiCW1 = 0; + m_uiCW2 = 0; + + m_frgbaColor1 = ColorFloatRGBA(); + m_frgbaColor2 = ColorFloatRGBA(); + + for (int uiPixel = 0; uiPixel < PIXELS; uiPixel++) + { + m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(); // assumes rgb also 0 + //m_afDecodedAlphas[uiPixel] = 0.0f; + } + + m_fError = 0.0f; + + // skip processing rgb + m_boolDone = true; + //m_uiEncodingIterations++; + } + */ + } + + if (!m_boolDone) + { + Block4x4Encoding_RGB8::PerformIteration(a_fEffort); + } + + } + + // ---------------------------------------------------------------------------------------------------- + // set the encoding bits based on encoding state + // + void Block4x4Encoding_RGBA8::SetEncodingBits(void) + { + // set the RGB8 portion + Block4x4Encoding_RGB8::SetEncodingBits(); + + m_alpha.SetEncodingBits(); + } +} diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.h b/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.h index 9602ff0b..7439fea8 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.h +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.h @@ -1,100 +1,100 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "EtcBlock4x4Encoding_RGB8.h" - -#include "EtcBlock4x4.h" // for SourceAlphaMix - -namespace Etc -{ - class Block4x4EncodingBits_A8; - - // ################################################################################ - // Block4x4Encoding_RGBA8 - // RGBA8 if not completely opaque or transparent - // ################################################################################ - - // Encoder for the A8 portion of RGBA. Minimizes error in a single iteration. 
- class Block4x4Encoding_A8 - { - public: - Block4x4Encoding_A8(void); - ~Block4x4Encoding_A8(void); - - void Encode(const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, - Block4x4::SourceAlphaMix sourceAlphaMix); - - void Decode(unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource); - - void DecodeAlpha(float *decodedPixels); - - void PerformIteration(float a_fEffort); - void CalculateA8(int a_fRadius); - void SetEncodingBits(void); - - bool IsDone() const { return m_boolDone; } - - private: - static const int PIXELS = 16; - - Block4x4EncodingBits_A8 *m_pencodingbitsA8; - - // float* m_afDecodedAlphas; // alias to parent array - //Block4x4::SourceAlphaMix m_sourceAlphaMix; - - const ColorFloatRGBA* m_pafrgbaSource; - - uint8_t m_fBase; - uint8_t m_fMultiplier; - uint8_t m_uiModifierTableIndex; - uint8_t m_auiAlphaSelectors[PIXELS]; - - bool m_boolDone; - }; - - // This basically combines RGBA8 encoder with A8 encoder - class Block4x4Encoding_RGBA8 : public Block4x4Encoding_RGB8 - { - public: - - Block4x4Encoding_RGBA8(void); - virtual ~Block4x4Encoding_RGBA8(void); - - virtual void Encode(Block4x4 *a_pblockParent, - const ColorFloatRGBA *a_pafrgbaSource, - unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) override; - - virtual void Decode(Block4x4 *a_pblockParent, - unsigned char *a_paucEncodingBits, - const ColorFloatRGBA *a_pafrgbaSource, - ErrorMetric a_errormetric, - uint16_t iterationCount) override; - - virtual void DecodeAlpha() override; - - virtual void PerformIteration(float a_fEffort) override; - - virtual void SetEncodingBits(void) override; - - private: - Block4x4Encoding_A8 m_alpha; - }; - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "EtcBlock4x4Encoding_RGB8.h" + +#include "EtcBlock4x4.h" // for SourceAlphaMix + +namespace Etc +{ + class Block4x4EncodingBits_A8; + + // ################################################################################ + // Block4x4Encoding_RGBA8 + // RGBA8 if not completely opaque or transparent + // ################################################################################ + + // Encoder for the A8 portion of RGBA. Minimizes error in a single iteration. 
+ class Block4x4Encoding_A8 + { + public: + Block4x4Encoding_A8(void); + ~Block4x4Encoding_A8(void); + + void Encode(const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, + Block4x4::SourceAlphaMix sourceAlphaMix); + + void Decode(unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource); + + void DecodeAlpha(float *decodedPixels); + + void PerformIteration(float a_fEffort); + void CalculateA8(int a_fRadius); + void SetEncodingBits(void); + + bool IsDone() const { return m_boolDone; } + + private: + static const int PIXELS = 16; + + Block4x4EncodingBits_A8 *m_pencodingbitsA8; + + // float* m_afDecodedAlphas; // alias to parent array + //Block4x4::SourceAlphaMix m_sourceAlphaMix; + + const ColorFloatRGBA* m_pafrgbaSource; + + uint8_t m_fBase; + uint8_t m_fMultiplier; + uint8_t m_uiModifierTableIndex; + uint8_t m_auiAlphaSelectors[PIXELS]; + + bool m_boolDone; + }; + + // This basically combines RGBA8 encoder with A8 encoder + class Block4x4Encoding_RGBA8 : public Block4x4Encoding_RGB8 + { + public: + + Block4x4Encoding_RGBA8(void); + virtual ~Block4x4Encoding_RGBA8(void); + + virtual void Encode(Block4x4 *a_pblockParent, + const ColorFloatRGBA *a_pafrgbaSource, + unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) override; + + virtual void Decode(Block4x4 *a_pblockParent, + unsigned char *a_paucEncodingBits, + const ColorFloatRGBA *a_pafrgbaSource, + ErrorMetric a_errormetric, + uint16_t iterationCount) override; + + virtual void DecodeAlpha() override; + + virtual void PerformIteration(float a_fEffort) override; + + virtual void SetEncodingBits(void) override; + + private: + Block4x4Encoding_A8 m_alpha; + }; + +} // namespace Etc diff --git a/libkram/etc2comp/EtcColor.h b/libkram/etc2comp/EtcColor.h index a4c40fb9..fff15cf0 100644 --- a/libkram/etc2comp/EtcColor.h +++ b/libkram/etc2comp/EtcColor.h @@ -1,66 +1,66 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* -#pragma once - -#include - -namespace Etc -{ - - inline float LogToLinear(float a_fLog) - { - static const float ALPHA = 0.055f; - static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; - - if (a_fLog <= 0.04045f) - { - return a_fLog / 12.92f; - } - else - { - return powf((a_fLog + ALPHA) / ONE_PLUS_ALPHA, 2.4f); - } - } - - inline float LinearToLog(float a_fLinear) - { - static const float ALPHA = 0.055f; - static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; - - if (a_fLinear <= 0.0031308f) - { - return 12.92f * a_fLinear; - } - else - { - return ONE_PLUS_ALPHA * powf(a_fLinear, (1.0f/2.4f)) - ALPHA; - } - } - - class ColorR8G8B8A8 - { - public: - - unsigned char ucR; - unsigned char ucG; - unsigned char ucB; - unsigned char ucA; - - }; -} -*/ +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +#pragma once + +#include + +namespace Etc +{ + + inline float LogToLinear(float a_fLog) + { + static const float ALPHA = 0.055f; + static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; + + if (a_fLog <= 0.04045f) + { + return a_fLog / 12.92f; + } + else + { + return powf((a_fLog + ALPHA) / ONE_PLUS_ALPHA, 2.4f); + } + } + + inline float LinearToLog(float a_fLinear) + { + static const float ALPHA = 0.055f; + static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; + + if (a_fLinear <= 0.0031308f) + { + return 12.92f * a_fLinear; + } + else + { + return ONE_PLUS_ALPHA * powf(a_fLinear, (1.0f/2.4f)) - ALPHA; + } + } + + class ColorR8G8B8A8 + { + public: + + unsigned char ucR; + unsigned char ucG; + unsigned char ucB; + unsigned char ucA; + + }; +} +*/ diff --git a/libkram/etc2comp/EtcColorFloatRGBA.h b/libkram/etc2comp/EtcColorFloatRGBA.h index 162debc5..d387763c 100644 --- a/libkram/etc2comp/EtcColorFloatRGBA.h +++ b/libkram/etc2comp/EtcColorFloatRGBA.h @@ -1,316 +1,316 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcConfig.h" -//#include "EtcColor.h" - -#include - -namespace Etc -{ - inline float LogToLinear(float a_fLog) - { - static const float ALPHA = 0.055f; - static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; - - if (a_fLog <= 0.04045f) - { - return a_fLog / 12.92f; - } - else - { - return powf((a_fLog + ALPHA) / ONE_PLUS_ALPHA, 2.4f); - } - } - - inline float LinearToLog(float a_fLinear) - { - static const float ALPHA = 0.055f; - static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; - - if (a_fLinear <= 0.0031308f) - { - return 12.92f * a_fLinear; - } - else - { - return ONE_PLUS_ALPHA * powf(a_fLinear, (1.0f/2.4f)) - ALPHA; - } - } - - class ColorR8G8B8A8 - { - public: - - uint8_t ucR; - uint8_t ucG; - uint8_t ucB; - uint8_t ucA; - - }; - - class ColorFloatRGBA - { - public: - - ColorFloatRGBA(void) - { - fR = fG = fB = fA = 0.0f; - } - - ColorFloatRGBA(float a_fR, float a_fG, float a_fB, float a_fA) - { - fR = a_fR; - fG = a_fG; - fB = a_fB; - fA = a_fA; - } - - inline ColorFloatRGBA operator+(const ColorFloatRGBA& a_rfrgba) const - { - ColorFloatRGBA frgba; - frgba.fR = fR + a_rfrgba.fR; - frgba.fG = fG + a_rfrgba.fG; - frgba.fB = fB + a_rfrgba.fB; - frgba.fA = fA + a_rfrgba.fA; - return frgba; - } - - inline ColorFloatRGBA operator-(const ColorFloatRGBA& a_rfrgba) const - { - ColorFloatRGBA frgba; - frgba.fR = fR - a_rfrgba.fR; - frgba.fG = fG - a_rfrgba.fG; - frgba.fB = fB - a_rfrgba.fB; - frgba.fA = fA - a_rfrgba.fA; - return frgba; - } - - // scalar ops don't apply to alpha - inline ColorFloatRGBA operator+(float a_f) const - { - ColorFloatRGBA frgba; - frgba.fR = fR + a_f; - frgba.fG = fG + a_f; - frgba.fB = fB + a_f; - frgba.fA = fA; - return frgba; - } - - // scalar ops don't apply to alpha - inline ColorFloatRGBA operator-(float a_f) const - { - return *this + (-a_f); - } - - - // scalar ops don't apply to alpha - inline ColorFloatRGBA operator*(float a_f) const - { - return ScaleRGB(a_f); - } - - inline ColorFloatRGBA ScaleRGB(float a_f) const - { - ColorFloatRGBA frgba; - frgba.fR = fR * a_f; - frgba.fG = fG * a_f; - frgba.fB = fB * a_f; - frgba.fA = fA; - - return frgba; - } - - inline ColorFloatRGBA RoundRGB(void) const - { - ColorFloatRGBA frgba; - frgba.fR = roundf(fR); - frgba.fG = roundf(fG); - frgba.fB = roundf(fB); - frgba.fA = fA; // was missing in original - - return frgba; - } - - inline ColorFloatRGBA ToLinear() const - { - ColorFloatRGBA frgbaLinear; - frgbaLinear.fR = LogToLinear(fR); - frgbaLinear.fG = LogToLinear(fG); - frgbaLinear.fB = LogToLinear(fB); - frgbaLinear.fA = fA; - - return frgbaLinear; - } - - inline ColorFloatRGBA ToLog(void) const - { - ColorFloatRGBA frgbaLog; - frgbaLog.fR = LinearToLog(fR); - frgbaLog.fG = LinearToLog(fG); - frgbaLog.fB = LinearToLog(fB); - frgbaLog.fA = fA; - - return frgbaLog; - } - - inline static ColorFloatRGBA ConvertFromRGBA8(uint8_t a_ucR, - uint8_t a_ucG, uint8_t a_ucB, uint8_t a_ucA) - { - ColorFloatRGBA frgba; - - frgba.fR = (float)a_ucR / 255.0f; - frgba.fG = (float)a_ucG / 255.0f; - frgba.fB = (float)a_ucB / 255.0f; - frgba.fA = (float)a_ucA / 255.0f; - - return frgba; - } - - inline static ColorFloatRGBA ConvertFromRGBA8(const ColorR8G8B8A8& color) - { - return ConvertFromRGBA8(color.ucR, color.ucG, color.ucB, color.ucA); - } - - inline static ColorFloatRGBA ConvertFromRGB4(uint8_t a_ucR4, - uint8_t a_ucG4, - uint8_t a_ucB4, uint8_t a_ucA = 255) - { - uint8_t ucR8 = (uint8_t)((a_ucR4 << 4) + a_ucR4); - uint8_t ucG8 = (uint8_t)((a_ucG4 << 4) + a_ucG4); - uint8_t ucB8 = (uint8_t)((a_ucB4 
<< 4) + a_ucB4); - - return ConvertFromRGBA8(ucR8, ucG8, ucB8, a_ucA); - } - - inline static ColorFloatRGBA ConvertFromRGB5(uint8_t a_ucR5, - uint8_t a_ucG5, - uint8_t a_ucB5, uint8_t a_ucA = 255) - { - uint8_t ucR8 = (uint8_t)((a_ucR5 << 3) + (a_ucR5 >> 2)); - uint8_t ucG8 = (uint8_t)((a_ucG5 << 3) + (a_ucG5 >> 2)); - uint8_t ucB8 = (uint8_t)((a_ucB5 << 3) + (a_ucB5 >> 2)); - - return ConvertFromRGBA8(ucR8, ucG8, ucB8, a_ucA); - } - - inline static ColorFloatRGBA ConvertFromR6G7B6(uint8_t a_ucR6, - uint8_t a_ucG7, - uint8_t a_ucB6, uint8_t a_ucA = 255) - { - uint8_t ucR8 = (uint8_t)((a_ucR6 << 2) + (a_ucR6 >> 4)); - uint8_t ucG8 = (uint8_t)((a_ucG7 << 1) + (a_ucG7 >> 6)); - uint8_t ucB8 = (uint8_t)((a_ucB6 << 2) + (a_ucB6 >> 4)); - - return ConvertFromRGBA8(ucR8, ucG8, ucB8, a_ucA); - } - - // quantize to 4 bits, expand to 8 bits - inline ColorFloatRGBA QuantizeR4G4B4(void) const - { - ColorFloatRGBA frgba = ClampRGB(); - - // quantize to 4 bits - frgba = frgba.ScaleRGB(15.0f).RoundRGB(); - uint32_t uiR4 = (uint32_t)frgba.fR; - uint32_t uiG4 = (uint32_t)frgba.fG; - uint32_t uiB4 = (uint32_t)frgba.fB; - - frgba = ConvertFromRGB4(uiR4, uiG4, uiB4); - frgba.fA = fA; - - return frgba; - } - - // quantize to 5 bits, expand to 8 bits - inline ColorFloatRGBA QuantizeR5G5B5(void) const - { - ColorFloatRGBA frgba = ClampRGBA(); - - // quantize to 5 bits - frgba = frgba.ScaleRGB(31.0f).RoundRGB(); - uint32_t uiR5 = (uint32_t)frgba.fR; - uint32_t uiG5 = (uint32_t)frgba.fG; - uint32_t uiB5 = (uint32_t)frgba.fB; - - frgba = ConvertFromRGB5(uiR5, uiG5, uiB5); - frgba.fA = fA; - return frgba; - } - - // quantize to 6/7/6 bits, expand to 8 bits - inline ColorFloatRGBA QuantizeR6G7B6(void) const - { - ColorFloatRGBA frgba = ClampRGBA(); - - // quantize to 6/7/6 bits - uint32_t uiR6 = (uint32_t)frgba.IntRed(63.0f); - uint32_t uiG7 = (uint32_t)frgba.IntGreen(127.0f); - uint32_t uiB6 = (uint32_t)frgba.IntBlue(63.0f); - - frgba = ConvertFromR6G7B6(uiR6, uiG7, uiB6); - frgba.fA = fA; - - return frgba; - } - - inline ColorFloatRGBA ClampRGB(void) const - { - return ClampRGBA(); - } - - inline ColorFloatRGBA ClampRGBA(void) const - { - ColorFloatRGBA frgba = *this; - if (frgba.fR < 0.0f) { frgba.fR = 0.0f; } - if (frgba.fR > 1.0f) { frgba.fR = 1.0f; } - if (frgba.fG < 0.0f) { frgba.fG = 0.0f; } - if (frgba.fG > 1.0f) { frgba.fG = 1.0f; } - if (frgba.fB < 0.0f) { frgba.fB = 0.0f; } - if (frgba.fB > 1.0f) { frgba.fB = 1.0f; } - if (frgba.fA < 0.0f) { frgba.fA = 0.0f; } - if (frgba.fA > 1.0f) { frgba.fA = 1.0f; } - - return frgba; - } - - inline int IntRed(float a_fScale) const - { - return (int)roundf(fR * a_fScale); - } - - inline int IntGreen(float a_fScale) const - { - return (int)roundf(fG * a_fScale); - } - - inline int IntBlue(float a_fScale) const - { - return (int)roundf(fB * a_fScale); - } - - inline int IntAlpha(float a_fScale) const - { - return (int)roundf(fA * a_fScale); - } - - float fR, fG, fB, fA; - }; - -} - +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "EtcConfig.h" +//#include "EtcColor.h" + +#include + +namespace Etc +{ + inline float LogToLinear(float a_fLog) + { + static const float ALPHA = 0.055f; + static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; + + if (a_fLog <= 0.04045f) + { + return a_fLog / 12.92f; + } + else + { + return powf((a_fLog + ALPHA) / ONE_PLUS_ALPHA, 2.4f); + } + } + + inline float LinearToLog(float a_fLinear) + { + static const float ALPHA = 0.055f; + static const float ONE_PLUS_ALPHA = 1.0f + ALPHA; + + if (a_fLinear <= 0.0031308f) + { + return 12.92f * a_fLinear; + } + else + { + return ONE_PLUS_ALPHA * powf(a_fLinear, (1.0f/2.4f)) - ALPHA; + } + } + + class ColorR8G8B8A8 + { + public: + + uint8_t ucR; + uint8_t ucG; + uint8_t ucB; + uint8_t ucA; + + }; + + class ColorFloatRGBA + { + public: + + ColorFloatRGBA(void) + { + fR = fG = fB = fA = 0.0f; + } + + ColorFloatRGBA(float a_fR, float a_fG, float a_fB, float a_fA) + { + fR = a_fR; + fG = a_fG; + fB = a_fB; + fA = a_fA; + } + + inline ColorFloatRGBA operator+(const ColorFloatRGBA& a_rfrgba) const + { + ColorFloatRGBA frgba; + frgba.fR = fR + a_rfrgba.fR; + frgba.fG = fG + a_rfrgba.fG; + frgba.fB = fB + a_rfrgba.fB; + frgba.fA = fA + a_rfrgba.fA; + return frgba; + } + + inline ColorFloatRGBA operator-(const ColorFloatRGBA& a_rfrgba) const + { + ColorFloatRGBA frgba; + frgba.fR = fR - a_rfrgba.fR; + frgba.fG = fG - a_rfrgba.fG; + frgba.fB = fB - a_rfrgba.fB; + frgba.fA = fA - a_rfrgba.fA; + return frgba; + } + + // scalar ops don't apply to alpha + inline ColorFloatRGBA operator+(float a_f) const + { + ColorFloatRGBA frgba; + frgba.fR = fR + a_f; + frgba.fG = fG + a_f; + frgba.fB = fB + a_f; + frgba.fA = fA; + return frgba; + } + + // scalar ops don't apply to alpha + inline ColorFloatRGBA operator-(float a_f) const + { + return *this + (-a_f); + } + + + // scalar ops don't apply to alpha + inline ColorFloatRGBA operator*(float a_f) const + { + return ScaleRGB(a_f); + } + + inline ColorFloatRGBA ScaleRGB(float a_f) const + { + ColorFloatRGBA frgba; + frgba.fR = fR * a_f; + frgba.fG = fG * a_f; + frgba.fB = fB * a_f; + frgba.fA = fA; + + return frgba; + } + + inline ColorFloatRGBA RoundRGB(void) const + { + ColorFloatRGBA frgba; + frgba.fR = roundf(fR); + frgba.fG = roundf(fG); + frgba.fB = roundf(fB); + frgba.fA = fA; // was missing in original + + return frgba; + } + + inline ColorFloatRGBA ToLinear() const + { + ColorFloatRGBA frgbaLinear; + frgbaLinear.fR = LogToLinear(fR); + frgbaLinear.fG = LogToLinear(fG); + frgbaLinear.fB = LogToLinear(fB); + frgbaLinear.fA = fA; + + return frgbaLinear; + } + + inline ColorFloatRGBA ToLog(void) const + { + ColorFloatRGBA frgbaLog; + frgbaLog.fR = LinearToLog(fR); + frgbaLog.fG = LinearToLog(fG); + frgbaLog.fB = LinearToLog(fB); + frgbaLog.fA = fA; + + return frgbaLog; + } + + inline static ColorFloatRGBA ConvertFromRGBA8(uint8_t a_ucR, + uint8_t a_ucG, uint8_t a_ucB, uint8_t a_ucA) + { + ColorFloatRGBA frgba; + + frgba.fR = (float)a_ucR / 255.0f; + frgba.fG = (float)a_ucG / 255.0f; + frgba.fB = (float)a_ucB / 255.0f; + frgba.fA = (float)a_ucA / 255.0f; + + return frgba; + } + + inline static ColorFloatRGBA ConvertFromRGBA8(const ColorR8G8B8A8& color) + { + return ConvertFromRGBA8(color.ucR, color.ucG, color.ucB, color.ucA); + } + + inline static ColorFloatRGBA ConvertFromRGB4(uint8_t a_ucR4, + uint8_t a_ucG4, + uint8_t a_ucB4, uint8_t a_ucA = 255) + { + uint8_t ucR8 = 
(uint8_t)((a_ucR4 << 4) + a_ucR4); + uint8_t ucG8 = (uint8_t)((a_ucG4 << 4) + a_ucG4); + uint8_t ucB8 = (uint8_t)((a_ucB4 << 4) + a_ucB4); + + return ConvertFromRGBA8(ucR8, ucG8, ucB8, a_ucA); + } + + inline static ColorFloatRGBA ConvertFromRGB5(uint8_t a_ucR5, + uint8_t a_ucG5, + uint8_t a_ucB5, uint8_t a_ucA = 255) + { + uint8_t ucR8 = (uint8_t)((a_ucR5 << 3) + (a_ucR5 >> 2)); + uint8_t ucG8 = (uint8_t)((a_ucG5 << 3) + (a_ucG5 >> 2)); + uint8_t ucB8 = (uint8_t)((a_ucB5 << 3) + (a_ucB5 >> 2)); + + return ConvertFromRGBA8(ucR8, ucG8, ucB8, a_ucA); + } + + inline static ColorFloatRGBA ConvertFromR6G7B6(uint8_t a_ucR6, + uint8_t a_ucG7, + uint8_t a_ucB6, uint8_t a_ucA = 255) + { + uint8_t ucR8 = (uint8_t)((a_ucR6 << 2) + (a_ucR6 >> 4)); + uint8_t ucG8 = (uint8_t)((a_ucG7 << 1) + (a_ucG7 >> 6)); + uint8_t ucB8 = (uint8_t)((a_ucB6 << 2) + (a_ucB6 >> 4)); + + return ConvertFromRGBA8(ucR8, ucG8, ucB8, a_ucA); + } + + // quantize to 4 bits, expand to 8 bits + inline ColorFloatRGBA QuantizeR4G4B4(void) const + { + ColorFloatRGBA frgba = ClampRGB(); + + // quantize to 4 bits + frgba = frgba.ScaleRGB(15.0f).RoundRGB(); + uint32_t uiR4 = (uint32_t)frgba.fR; + uint32_t uiG4 = (uint32_t)frgba.fG; + uint32_t uiB4 = (uint32_t)frgba.fB; + + frgba = ConvertFromRGB4(uiR4, uiG4, uiB4); + frgba.fA = fA; + + return frgba; + } + + // quantize to 5 bits, expand to 8 bits + inline ColorFloatRGBA QuantizeR5G5B5(void) const + { + ColorFloatRGBA frgba = ClampRGBA(); + + // quantize to 5 bits + frgba = frgba.ScaleRGB(31.0f).RoundRGB(); + uint32_t uiR5 = (uint32_t)frgba.fR; + uint32_t uiG5 = (uint32_t)frgba.fG; + uint32_t uiB5 = (uint32_t)frgba.fB; + + frgba = ConvertFromRGB5(uiR5, uiG5, uiB5); + frgba.fA = fA; + return frgba; + } + + // quantize to 6/7/6 bits, expand to 8 bits + inline ColorFloatRGBA QuantizeR6G7B6(void) const + { + ColorFloatRGBA frgba = ClampRGBA(); + + // quantize to 6/7/6 bits + uint32_t uiR6 = (uint32_t)frgba.IntRed(63.0f); + uint32_t uiG7 = (uint32_t)frgba.IntGreen(127.0f); + uint32_t uiB6 = (uint32_t)frgba.IntBlue(63.0f); + + frgba = ConvertFromR6G7B6(uiR6, uiG7, uiB6); + frgba.fA = fA; + + return frgba; + } + + inline ColorFloatRGBA ClampRGB(void) const + { + return ClampRGBA(); + } + + inline ColorFloatRGBA ClampRGBA(void) const + { + ColorFloatRGBA frgba = *this; + if (frgba.fR < 0.0f) { frgba.fR = 0.0f; } + if (frgba.fR > 1.0f) { frgba.fR = 1.0f; } + if (frgba.fG < 0.0f) { frgba.fG = 0.0f; } + if (frgba.fG > 1.0f) { frgba.fG = 1.0f; } + if (frgba.fB < 0.0f) { frgba.fB = 0.0f; } + if (frgba.fB > 1.0f) { frgba.fB = 1.0f; } + if (frgba.fA < 0.0f) { frgba.fA = 0.0f; } + if (frgba.fA > 1.0f) { frgba.fA = 1.0f; } + + return frgba; + } + + inline int IntRed(float a_fScale) const + { + return (int)roundf(fR * a_fScale); + } + + inline int IntGreen(float a_fScale) const + { + return (int)roundf(fG * a_fScale); + } + + inline int IntBlue(float a_fScale) const + { + return (int)roundf(fB * a_fScale); + } + + inline int IntAlpha(float a_fScale) const + { + return (int)roundf(fA * a_fScale); + } + + float fR, fG, fB, fA; + }; + +} + diff --git a/libkram/etc2comp/EtcConfig.h b/libkram/etc2comp/EtcConfig.h index f706da8a..7c9ddac7 100644 --- a/libkram/etc2comp/EtcConfig.h +++ b/libkram/etc2comp/EtcConfig.h @@ -1,19 +1,19 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include diff --git a/libkram/etc2comp/EtcDifferentialTrys.cpp b/libkram/etc2comp/EtcDifferentialTrys.cpp index aa1945b0..b6ffc429 100644 --- a/libkram/etc2comp/EtcDifferentialTrys.cpp +++ b/libkram/etc2comp/EtcDifferentialTrys.cpp @@ -1,175 +1,175 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* -EtcDifferentialTrys.cpp - -Gathers the results of the various encoding trys for both halves of a 4x4 block for Differential mode - -*/ - -#include "EtcConfig.h" -#include "EtcDifferentialTrys.h" - -#include - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // construct a list of trys (encoding attempts) - // - // a_frgbaColor1 is the basecolor for the first half - // a_frgbaColor2 is the basecolor for the second half - // a_pauiPixelMapping1 is the pixel order for the first half - // a_pauiPixelMapping2 is the pixel order for the second half - // a_uiRadius is the amount to vary the base colors - // - DifferentialTrys::DifferentialTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2) - { - assert(a_uiRadius <= MAX_RADIUS); - - m_boolSeverelyBentColors = false; - - ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR5G5B5(); - ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR5G5B5(); - - // quantize base colors - // ensure that trys with a_uiRadius don't overflow - int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(31.0f)+a_iGrayOffset1, a_uiRadius); - int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(31.0f) + a_iGrayOffset1, a_uiRadius); - int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(31.0f) + a_iGrayOffset1, a_uiRadius); - - int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(31.0f) + a_iGrayOffset2, a_uiRadius); - int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(31.0f) + a_iGrayOffset2, a_uiRadius); - int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(31.0f) + a_iGrayOffset2, a_uiRadius); - - int iDeltaRed = iRed2 - iRed1; - int iDeltaGreen = iGreen2 - iGreen1; - int iDeltaBlue = iBlue2 - iBlue1; - - // make sure components are within range - { - if (iDeltaRed > 3) - { - if (iDeltaRed > 7) - { - m_boolSeverelyBentColors = true; - } - - iRed1 += (iDeltaRed - 3) / 2; - iRed2 = iRed1 + 3; - iDeltaRed = 3; - } - else if (iDeltaRed < -4) - { - if (iDeltaRed < -8) - { - m_boolSeverelyBentColors = true; - } - - iRed1 += (iDeltaRed + 4) / 2; - iRed2 = iRed1 - 4; - iDeltaRed = -4; - } - assert(iRed1 >= (signed)(0 + a_uiRadius) && iRed1 <= (signed)(31 - a_uiRadius)); - assert(iRed2 >= (signed)(0 + a_uiRadius) && iRed2 <= (signed)(31 - a_uiRadius)); - assert(iDeltaRed >= -4 && iDeltaRed <= 3); - - if (iDeltaGreen > 3) - { - if (iDeltaGreen > 7) - { - m_boolSeverelyBentColors = true; - } - - iGreen1 += (iDeltaGreen - 3) / 2; - iGreen2 = iGreen1 + 3; - iDeltaGreen = 3; - } - else if (iDeltaGreen < -4) - { - if (iDeltaGreen < -8) - { - m_boolSeverelyBentColors = true; - } - - iGreen1 += (iDeltaGreen + 4) / 2; - iGreen2 = iGreen1 - 4; - iDeltaGreen = -4; - } - assert(iGreen1 >= (signed)(0 + a_uiRadius) && iGreen1 <= (signed)(31 - a_uiRadius)); - assert(iGreen2 >= (signed)(0 + a_uiRadius) && iGreen2 <= (signed)(31 - a_uiRadius)); - assert(iDeltaGreen >= -4 && iDeltaGreen <= 3); - - if (iDeltaBlue > 3) - { - if (iDeltaBlue > 7) - { - m_boolSeverelyBentColors = true; - } - - iBlue1 += (iDeltaBlue - 3) / 2; - iBlue2 = iBlue1 + 3; - iDeltaBlue = 3; - } - else if (iDeltaBlue < -4) - { - if (iDeltaBlue < -8) - { - m_boolSeverelyBentColors = true; - } - - iBlue1 += (iDeltaBlue + 4) / 2; - iBlue2 = iBlue1 - 4; - iDeltaBlue = -4; - } - assert(iBlue1 >= (signed)(0+a_uiRadius) && iBlue1 
<= (signed)(31 - a_uiRadius)); - assert(iBlue2 >= (signed)(0 + a_uiRadius) && iBlue2 <= (signed)(31 - a_uiRadius)); - assert(iDeltaBlue >= -4 && iDeltaBlue <= 3); - } - - m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius); - m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius); - - } - - // ---------------------------------------------------------------------------------------------------- - // - void DifferentialTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius) - { - - m_iRed = a_iRed; - m_iGreen = a_iGreen; - m_iBlue = a_iBlue; - - m_pauiPixelMapping = a_pauiPixelMapping; - m_uiRadius = a_uiRadius; - - m_uiTrys = 0; - m_ptryBest = nullptr; - - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +EtcDifferentialTrys.cpp + +Gathers the results of the various encoding trys for both halves of a 4x4 block for Differential mode + +*/ + +#include "EtcConfig.h" +#include "EtcDifferentialTrys.h" + +#include + +namespace Etc +{ + + // ---------------------------------------------------------------------------------------------------- + // construct a list of trys (encoding attempts) + // + // a_frgbaColor1 is the basecolor for the first half + // a_frgbaColor2 is the basecolor for the second half + // a_pauiPixelMapping1 is the pixel order for the first half + // a_pauiPixelMapping2 is the pixel order for the second half + // a_uiRadius is the amount to vary the base colors + // + DifferentialTrys::DifferentialTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2, + const unsigned int *a_pauiPixelMapping1, + const unsigned int *a_pauiPixelMapping2, + unsigned int a_uiRadius, + int a_iGrayOffset1, int a_iGrayOffset2) + { + assert(a_uiRadius <= MAX_RADIUS); + + m_boolSeverelyBentColors = false; + + ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR5G5B5(); + ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR5G5B5(); + + // quantize base colors + // ensure that trys with a_uiRadius don't overflow + int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(31.0f)+a_iGrayOffset1, a_uiRadius); + int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(31.0f) + a_iGrayOffset1, a_uiRadius); + int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(31.0f) + a_iGrayOffset1, a_uiRadius); + + int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(31.0f) + a_iGrayOffset2, a_uiRadius); + int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(31.0f) + a_iGrayOffset2, a_uiRadius); + int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(31.0f) + a_iGrayOffset2, a_uiRadius); + + int iDeltaRed = iRed2 - iRed1; + int iDeltaGreen = iGreen2 - iGreen1; + int iDeltaBlue = iBlue2 - iBlue1; + + // make sure components are within range + { + if (iDeltaRed > 3) + { + if 
(iDeltaRed > 7) + { + m_boolSeverelyBentColors = true; + } + + iRed1 += (iDeltaRed - 3) / 2; + iRed2 = iRed1 + 3; + iDeltaRed = 3; + } + else if (iDeltaRed < -4) + { + if (iDeltaRed < -8) + { + m_boolSeverelyBentColors = true; + } + + iRed1 += (iDeltaRed + 4) / 2; + iRed2 = iRed1 - 4; + iDeltaRed = -4; + } + assert(iRed1 >= (signed)(0 + a_uiRadius) && iRed1 <= (signed)(31 - a_uiRadius)); + assert(iRed2 >= (signed)(0 + a_uiRadius) && iRed2 <= (signed)(31 - a_uiRadius)); + assert(iDeltaRed >= -4 && iDeltaRed <= 3); + + if (iDeltaGreen > 3) + { + if (iDeltaGreen > 7) + { + m_boolSeverelyBentColors = true; + } + + iGreen1 += (iDeltaGreen - 3) / 2; + iGreen2 = iGreen1 + 3; + iDeltaGreen = 3; + } + else if (iDeltaGreen < -4) + { + if (iDeltaGreen < -8) + { + m_boolSeverelyBentColors = true; + } + + iGreen1 += (iDeltaGreen + 4) / 2; + iGreen2 = iGreen1 - 4; + iDeltaGreen = -4; + } + assert(iGreen1 >= (signed)(0 + a_uiRadius) && iGreen1 <= (signed)(31 - a_uiRadius)); + assert(iGreen2 >= (signed)(0 + a_uiRadius) && iGreen2 <= (signed)(31 - a_uiRadius)); + assert(iDeltaGreen >= -4 && iDeltaGreen <= 3); + + if (iDeltaBlue > 3) + { + if (iDeltaBlue > 7) + { + m_boolSeverelyBentColors = true; + } + + iBlue1 += (iDeltaBlue - 3) / 2; + iBlue2 = iBlue1 + 3; + iDeltaBlue = 3; + } + else if (iDeltaBlue < -4) + { + if (iDeltaBlue < -8) + { + m_boolSeverelyBentColors = true; + } + + iBlue1 += (iDeltaBlue + 4) / 2; + iBlue2 = iBlue1 - 4; + iDeltaBlue = -4; + } + assert(iBlue1 >= (signed)(0+a_uiRadius) && iBlue1 <= (signed)(31 - a_uiRadius)); + assert(iBlue2 >= (signed)(0 + a_uiRadius) && iBlue2 <= (signed)(31 - a_uiRadius)); + assert(iDeltaBlue >= -4 && iDeltaBlue <= 3); + } + + m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius); + m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius); + + } + + // ---------------------------------------------------------------------------------------------------- + // + void DifferentialTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, + const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius) + { + + m_iRed = a_iRed; + m_iGreen = a_iGreen; + m_iBlue = a_iBlue; + + m_pauiPixelMapping = a_pauiPixelMapping; + m_uiRadius = a_uiRadius; + + m_uiTrys = 0; + m_ptryBest = nullptr; + + } + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcDifferentialTrys.h b/libkram/etc2comp/EtcDifferentialTrys.h index 6b1cd9c9..71860908 100644 --- a/libkram/etc2comp/EtcDifferentialTrys.h +++ b/libkram/etc2comp/EtcDifferentialTrys.h @@ -1,97 +1,97 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcColorFloatRGBA.h" - -namespace Etc -{ - - class DifferentialTrys - { - public: - - static const unsigned int MAX_RADIUS = 2; - - DifferentialTrys(ColorFloatRGBA a_frgbaColor1, - ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius, - int a_iGrayOffset1, int a_iGrayOffset2); - - inline static int MoveAwayFromEdge(int a_i, int a_iDistance) - { - if (a_i < (0+ a_iDistance)) - { - return (0 + a_iDistance); - } - else if (a_i > (31- a_iDistance)) - { - return (31 - a_iDistance); - } - - return a_i; - } - - class Try - { - public : - static const unsigned int SELECTORS = 8; // per half - - int m_iRed; - int m_iGreen; - int m_iBlue; - unsigned int m_uiCW; - unsigned int m_auiSelectors[SELECTORS]; - float m_fError; - }; - - class Half - { - public: - - static const unsigned int MAX_TRYS = 125; - - void Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, - unsigned int a_uiRadius); - - // center of trys - int m_iRed; - int m_iGreen; - int m_iBlue; - - const unsigned int *m_pauiPixelMapping; - unsigned int m_uiRadius; - - unsigned int m_uiTrys; - Try m_atry[MAX_TRYS]; - - Try *m_ptryBest; - }; - - Half m_half1; - Half m_half2; - - bool m_boolSeverelyBentColors; - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "EtcColorFloatRGBA.h" + +namespace Etc +{ + + class DifferentialTrys + { + public: + + static const unsigned int MAX_RADIUS = 2; + + DifferentialTrys(ColorFloatRGBA a_frgbaColor1, + ColorFloatRGBA a_frgbaColor2, + const unsigned int *a_pauiPixelMapping1, + const unsigned int *a_pauiPixelMapping2, + unsigned int a_uiRadius, + int a_iGrayOffset1, int a_iGrayOffset2); + + inline static int MoveAwayFromEdge(int a_i, int a_iDistance) + { + if (a_i < (0+ a_iDistance)) + { + return (0 + a_iDistance); + } + else if (a_i > (31- a_iDistance)) + { + return (31 - a_iDistance); + } + + return a_i; + } + + class Try + { + public : + static const unsigned int SELECTORS = 8; // per half + + int m_iRed; + int m_iGreen; + int m_iBlue; + unsigned int m_uiCW; + unsigned int m_auiSelectors[SELECTORS]; + float m_fError; + }; + + class Half + { + public: + + static const unsigned int MAX_TRYS = 125; + + void Init(int a_iRed, int a_iGreen, int a_iBlue, + const unsigned int *a_pauiPixelMapping, + unsigned int a_uiRadius); + + // center of trys + int m_iRed; + int m_iGreen; + int m_iBlue; + + const unsigned int *m_pauiPixelMapping; + unsigned int m_uiRadius; + + unsigned int m_uiTrys; + Try m_atry[MAX_TRYS]; + + Try *m_ptryBest; + }; + + Half m_half1; + Half m_half2; + + bool m_boolSeverelyBentColors; + }; + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcErrorMetric.h b/libkram/etc2comp/EtcErrorMetric.h index 993fab88..54a2f10e 100644 --- a/libkram/etc2comp/EtcErrorMetric.h +++ b/libkram/etc2comp/EtcErrorMetric.h @@ -1,66 +1,66 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace Etc -{ - - enum ErrorMetric - { - //RGBA, // Premul weighted RGB - //RGBX, - - GRAY, - REC709, // Luma weighted(RGB) + A*A - - NUMERIC, // X*X + Y*Y + Z*Z + W*W -// NUMERICX, // X*X -// NUMERICXY, // X*X + Y*Y -// -// NORMALXYZ, - // - //ERROR_METRICS, - // - //BT709 = REC709 - }; - - inline const char *ErrorMetricToString(ErrorMetric errorMetric) - { - switch (errorMetric) - { -// case RGBA: -// return "RGBA"; -// case RGBX: -// return "RGBX"; - case GRAY: - return "GRAY"; - case REC709: - return "REC709"; - case NUMERIC: - return "NUMERIC"; -// case NUMERICX: -// return "NUMERICX"; -// case NUMERICXY: -// return "NUMERICXY"; -// case NORMALXYZ: -// return "NORMALXYZ"; - //case ERROR_METRICS: - default: - return "UNKNOWN"; - } - } -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace Etc +{ + + enum ErrorMetric + { + //RGBA, // Premul weighted RGB + //RGBX, + + GRAY, + REC709, // Luma weighted(RGB) + A*A + + NUMERIC, // X*X + Y*Y + Z*Z + W*W +// NUMERICX, // X*X +// NUMERICXY, // X*X + Y*Y +// +// NORMALXYZ, + // + //ERROR_METRICS, + // + //BT709 = REC709 + }; + + inline const char *ErrorMetricToString(ErrorMetric errorMetric) + { + switch (errorMetric) + { +// case RGBA: +// return "RGBA"; +// case RGBX: +// return "RGBX"; + case GRAY: + return "GRAY"; + case REC709: + return "REC709"; + case NUMERIC: + return "NUMERIC"; +// case NUMERICX: +// return "NUMERICX"; +// case NUMERICXY: +// return "NUMERICXY"; +// case NORMALXYZ: +// return "NORMALXYZ"; + //case ERROR_METRICS: + default: + return "UNKNOWN"; + } + } +} // namespace Etc diff --git a/libkram/etc2comp/EtcImage.cpp b/libkram/etc2comp/EtcImage.cpp index f52f2e18..16eeeece 100644 --- a/libkram/etc2comp/EtcImage.cpp +++ b/libkram/etc2comp/EtcImage.cpp @@ -267,7 +267,7 @@ namespace Etc // alias the output etxture m_paucEncodingBits = outputTexture; - using namespace NAMESPACE_STL; + using namespace STL_NAMESPACE; struct SortedBlock { diff --git a/libkram/etc2comp/EtcIndividualTrys.cpp b/libkram/etc2comp/EtcIndividualTrys.cpp index 20b463a1..77db49a9 100644 --- a/libkram/etc2comp/EtcIndividualTrys.cpp +++ b/libkram/etc2comp/EtcIndividualTrys.cpp @@ -1,89 +1,89 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* -EtcIndividualTrys.cpp - -Gathers the results of the various encoding trys for both halves of a 4x4 block for Individual mode - -*/ - -#include "EtcConfig.h" -#include "EtcIndividualTrys.h" - -#include - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // construct a list of trys (encoding attempts) - // - // a_frgbaColor1 is the basecolor for the first half - // a_frgbaColor2 is the basecolor for the second half - // a_pauiPixelMapping1 is the pixel order for the first half - // a_pauiPixelMapping2 is the pixel order for the second half - // a_uiRadius is the amount to vary the base colors - // - IndividualTrys::IndividualTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius) - { - assert(a_uiRadius <= MAX_RADIUS); - - ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR4G4B4(); - ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR4G4B4(); - - // quantize base colors - // ensure that trys with a_uiRadius don't overflow - int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(15.0f), a_uiRadius); - int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(15.0f), a_uiRadius); - int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(15.0f), a_uiRadius); - - int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(15.0f), a_uiRadius); - int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(15.0f), a_uiRadius); - int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(15.0f), a_uiRadius); - - m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius); - m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius); - - } - - // ---------------------------------------------------------------------------------------------------- - // - void IndividualTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius) - { - - m_iRed = a_iRed; - m_iGreen = a_iGreen; - m_iBlue = a_iBlue; - - m_pauiPixelMapping = a_pauiPixelMapping; - m_uiRadius = a_uiRadius; - - m_uiTrys = 0; - m_ptryBest = nullptr; - - // not initialized - // m_atry - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* +EtcIndividualTrys.cpp + +Gathers the results of the various encoding trys for both halves of a 4x4 block for Individual mode + +*/ + +#include "EtcConfig.h" +#include "EtcIndividualTrys.h" + +#include + +namespace Etc +{ + + // ---------------------------------------------------------------------------------------------------- + // construct a list of trys (encoding attempts) + // + // a_frgbaColor1 is the basecolor for the first half + // a_frgbaColor2 is the basecolor for the second half + // a_pauiPixelMapping1 is the pixel order for the first half + // a_pauiPixelMapping2 is the pixel order for the second half + // a_uiRadius is the amount to vary the base colors + // + IndividualTrys::IndividualTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2, + const unsigned int *a_pauiPixelMapping1, + const unsigned int *a_pauiPixelMapping2, + unsigned int a_uiRadius) + { + assert(a_uiRadius <= MAX_RADIUS); + + ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR4G4B4(); + ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR4G4B4(); + + // quantize base colors + // ensure that trys with a_uiRadius don't overflow + int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(15.0f), a_uiRadius); + int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(15.0f), a_uiRadius); + int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(15.0f), a_uiRadius); + + int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(15.0f), a_uiRadius); + int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(15.0f), a_uiRadius); + int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(15.0f), a_uiRadius); + + m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius); + m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius); + + } + + // ---------------------------------------------------------------------------------------------------- + // + void IndividualTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, + const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius) + { + + m_iRed = a_iRed; + m_iGreen = a_iGreen; + m_iBlue = a_iBlue; + + m_pauiPixelMapping = a_pauiPixelMapping; + m_uiRadius = a_uiRadius; + + m_uiTrys = 0; + m_ptryBest = nullptr; + + // not initialized + // m_atry + } + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcIndividualTrys.h b/libkram/etc2comp/EtcIndividualTrys.h index 49170d43..5fb12fbc 100644 --- a/libkram/etc2comp/EtcIndividualTrys.h +++ b/libkram/etc2comp/EtcIndividualTrys.h @@ -1,95 +1,95 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "EtcColorFloatRGBA.h" - -namespace Etc -{ - - class IndividualTrys - { - public: - - static const unsigned int MAX_RADIUS = 1; - - IndividualTrys(ColorFloatRGBA a_frgbaColor1, - ColorFloatRGBA a_frgbaColor2, - const unsigned int *a_pauiPixelMapping1, - const unsigned int *a_pauiPixelMapping2, - unsigned int a_uiRadius); - - inline static int MoveAwayFromEdge(int a_i, int a_iDistance) - { - if (a_i < (0+ a_iDistance)) - { - return (0 + a_iDistance); - } - else if (a_i > (15- a_iDistance)) - { - return (15 - a_iDistance); - } - - return a_i; - } - - class Try - { - public : - static const unsigned int SELECTORS = 8; // per half - - int m_iRed; - int m_iGreen; - int m_iBlue; - unsigned int m_uiCW; - unsigned int m_auiSelectors[SELECTORS]; - float m_fError; - }; - - class Half - { - public: - - static const unsigned int MAX_TRYS = 27; - - void Init(int a_iRed, int a_iGreen, int a_iBlue, - const unsigned int *a_pauiPixelMapping, - unsigned int a_uiRadius); - - // center of trys - int m_iRed; - int m_iGreen; - int m_iBlue; - - const unsigned int *m_pauiPixelMapping; - unsigned int m_uiRadius; - - unsigned int m_uiTrys; - Try m_atry[MAX_TRYS]; - - Try *m_ptryBest; - }; - - Half m_half1; - Half m_half2; - - }; - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "EtcColorFloatRGBA.h" + +namespace Etc +{ + + class IndividualTrys + { + public: + + static const unsigned int MAX_RADIUS = 1; + + IndividualTrys(ColorFloatRGBA a_frgbaColor1, + ColorFloatRGBA a_frgbaColor2, + const unsigned int *a_pauiPixelMapping1, + const unsigned int *a_pauiPixelMapping2, + unsigned int a_uiRadius); + + inline static int MoveAwayFromEdge(int a_i, int a_iDistance) + { + if (a_i < (0+ a_iDistance)) + { + return (0 + a_iDistance); + } + else if (a_i > (15- a_iDistance)) + { + return (15 - a_iDistance); + } + + return a_i; + } + + class Try + { + public : + static const unsigned int SELECTORS = 8; // per half + + int m_iRed; + int m_iGreen; + int m_iBlue; + unsigned int m_uiCW; + unsigned int m_auiSelectors[SELECTORS]; + float m_fError; + }; + + class Half + { + public: + + static const unsigned int MAX_TRYS = 27; + + void Init(int a_iRed, int a_iGreen, int a_iBlue, + const unsigned int *a_pauiPixelMapping, + unsigned int a_uiRadius); + + // center of trys + int m_iRed; + int m_iGreen; + int m_iBlue; + + const unsigned int *m_pauiPixelMapping; + unsigned int m_uiRadius; + + unsigned int m_uiTrys; + Try m_atry[MAX_TRYS]; + + Try *m_ptryBest; + }; + + Half m_half1; + Half m_half2; + + }; + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcMath.cpp b/libkram/etc2comp/EtcMath.cpp index cd70a9ab..096d5f7a 100644 --- a/libkram/etc2comp/EtcMath.cpp +++ b/libkram/etc2comp/EtcMath.cpp @@ -1,64 +1,64 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "EtcConfig.h" -#include "EtcMath.h" - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // calculate the line that best fits the set of XY points contained in a_afX[] and a_afY[] - // use a_fSlope and a_fOffset to define that line - // - bool Regression(float a_afX[], float a_afY[], unsigned int a_Points, - float *a_fSlope, float *a_fOffset) - { - float fPoints = (float)a_Points; - - float fSumX = 0.0f; - float fSumY = 0.0f; - float fSumXY = 0.0f; - float fSumX2 = 0.0f; - - for (unsigned int uiPoint = 0; uiPoint < a_Points; uiPoint++) - { - fSumX += a_afX[uiPoint]; - fSumY += a_afY[uiPoint]; - fSumXY += a_afX[uiPoint] * a_afY[uiPoint]; - fSumX2 += a_afX[uiPoint] * a_afX[uiPoint]; - } - - float fDivisor = fPoints*fSumX2 - fSumX*fSumX; - - // if vertical line - if (fDivisor == 0.0f) - { - *a_fSlope = 0.0f; - *a_fOffset = 0.0f; - return true; - } - - *a_fSlope = (fPoints*fSumXY - fSumX*fSumY) / fDivisor; - *a_fOffset = (fSumY - (*a_fSlope)*fSumX) / fPoints; - - return false; - } - - // ---------------------------------------------------------------------------------------------------- - // - -} // namespace Etc +/* + * Copyright 2015 The Etc2Comp Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "EtcConfig.h" +#include "EtcMath.h" + +namespace Etc +{ + + // ---------------------------------------------------------------------------------------------------- + // calculate the line that best fits the set of XY points contained in a_afX[] and a_afY[] + // use a_fSlope and a_fOffset to define that line + // + bool Regression(float a_afX[], float a_afY[], unsigned int a_Points, + float *a_fSlope, float *a_fOffset) + { + float fPoints = (float)a_Points; + + float fSumX = 0.0f; + float fSumY = 0.0f; + float fSumXY = 0.0f; + float fSumX2 = 0.0f; + + for (unsigned int uiPoint = 0; uiPoint < a_Points; uiPoint++) + { + fSumX += a_afX[uiPoint]; + fSumY += a_afY[uiPoint]; + fSumXY += a_afX[uiPoint] * a_afY[uiPoint]; + fSumX2 += a_afX[uiPoint] * a_afX[uiPoint]; + } + + float fDivisor = fPoints*fSumX2 - fSumX*fSumX; + + // if vertical line + if (fDivisor == 0.0f) + { + *a_fSlope = 0.0f; + *a_fOffset = 0.0f; + return true; + } + + *a_fSlope = (fPoints*fSumXY - fSumX*fSumY) / fDivisor; + *a_fOffset = (fSumY - (*a_fSlope)*fSumX) / fPoints; + + return false; + } + + // ---------------------------------------------------------------------------------------------------- + // + +} // namespace Etc diff --git a/libkram/etc2comp/EtcMath.h b/libkram/etc2comp/EtcMath.h index 3d951fee..c58c9a91 100644 --- a/libkram/etc2comp/EtcMath.h +++ b/libkram/etc2comp/EtcMath.h @@ -1,40 +1,40 @@ -/* - * Copyright 2015 The Etc2Comp Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace Etc -{ - - // ---------------------------------------------------------------------------------------------------- - // return true if vertical line - bool Regression(float a_afX[], float a_afY[], unsigned int a_Points, - float *a_fSlope, float *a_fOffset); - - inline float ConvertMSEToPSNR(float a_fMSE) - { - if (a_fMSE == 0.0f) - { - return INFINITY; - } - - return 10.0f * log10f(1.0f / a_fMSE); - } - - -} +/* + * Copyright 2015 The Etc2Comp Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace Etc +{ + + // ---------------------------------------------------------------------------------------------------- + // return true if vertical line + bool Regression(float a_afX[], float a_afY[], unsigned int a_Points, + float *a_fSlope, float *a_fOffset); + + inline float ConvertMSEToPSNR(float a_fMSE) + { + if (a_fMSE == 0.0f) + { + return INFINITY; + } + + return 10.0f * log10f(1.0f / a_fMSE); + } + + +} diff --git a/libkram/fmt/format.h b/libkram/fmt/format.h index 4b26f926..d6d50f59 100644 --- a/libkram/fmt/format.h +++ b/libkram/fmt/format.h @@ -39,6 +39,8 @@ #include // std::initializer_list #include // std::numeric_limits #include // std::uninitialized_copy + +// These use exceptions #include // std::runtime_error #include // std::system_error diff --git a/libkram/json11/json11.cpp b/libkram/json11/json11.cpp index b78fee62..d96f75aa 100644 --- a/libkram/json11/json11.cpp +++ b/libkram/json11/json11.cpp @@ -28,17 +28,19 @@ // not including this in KramConfig.h - used for pool #include "BlockedLinearAllocator.h" +#include "KramZipStream.h" + // Heavily modifed by Alec Miller 10/1/23 // This codebase was frozen by DropBox with very little effort put into it. // And I liked the readability of the code. Optimized with ImmutableStrings // and a BlockedLinearAllocator. // -// This is DOM reader/writer. Building up stl data structures in a DOM -// to write isn't great memory wise. May move to a SAX writer. +// json11 is DOM reader/writer. Building up stl data structures in a DOM +// to write isn't great memory wise. Moved to custom SAX writer. // Times to read font atlas file on M1 MBP 14". 1/21/24 // -// json11 +// json11 reader // Release - parsed 101 KB of json using 576 KB of memory in 14.011ms // Debug - parsed 101 KB of json using 576 KB of memory in 26.779ms // @@ -49,7 +51,7 @@ namespace json11 { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; using namespace kram; //--------------------- @@ -128,19 +130,18 @@ const char* JsonWriter::escapedString(const char* str) return _escapedString.c_str(); } - + void JsonWriter::pushObject(const char* key) { if (key[0]) { KASSERT(isObject()); writeCommaAndNewline(); int indent = _stack.size(); - sprintf(*_out, "%*s\"%s\":{\n", indent, "", key); + writeFormat("%*s\"%s\":{\n", indent, "", key); } else { - _out->push_back('{'); - _out->push_back('\n'); + writeFormat("{\n"); } _stack.push_back('}'); _isFirst.push_back(false); @@ -151,12 +152,11 @@ void JsonWriter::pushArray(const char* key) { KASSERT(isObject()); writeCommaAndNewline(); int indent = _stack.size(); - sprintf(*_out, "%*s\"%s\":[\n", indent, "", key); + writeFormat("%*s\"%s\":[\n", indent, "", key); } else { - _out->push_back('['); - _out->push_back('\n'); + writeFormat("[\n"); } _stack.push_back(']'); _isFirst.push_back(false); @@ -166,22 +166,25 @@ void JsonWriter::pop() { KASSERT(_stack.empty()); char c = _stack.back(); - _out->push_back(c); - _out->push_back('\n'); + writeFormat("%c\n", c); _stack.pop_back(); _isFirst.pop_back(); } void JsonWriter::popObject() { KASSERT(_stack.empty()); +#if KRAM_DEBUG char c = _stack.back(); KASSERT(c == '}'); +#endif pop(); } void JsonWriter::popArray() { KASSERT(_stack.empty()); +#if KRAM_DEBUG char c = _stack.back(); KASSERT(c == ']'); +#endif pop(); } @@ -189,43 +192,42 @@ void JsonWriter::writeString(const char* key, const char* value) { KASSERT(isObject()); 
writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%s\":\"%s\"", indent, "", key, escapedString(value)); + writeFormat("%*s\"%s\":\"%s\"", indent, "", key, escapedString(value)); } void JsonWriter::writeDouble(const char* key, double value) { KASSERT(isObject()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%s\":%f", indent, "", key, value); + writeFormat("%*s\"%s\":%f", indent, "", key, value); } void JsonWriter::writeInt32(const char* key, int32_t value) { KASSERT(isObject()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%s\":\"%d\"", indent, "", key, value); + writeFormat("%*s\"%s\":\"%d\"", indent, "", key, value); } void JsonWriter::writeBool(const char* key, bool value) { KASSERT(isObject()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%s\":%s", indent, "", key, value ? "true" : "false"); + writeFormat("%*s\"%s\":%s", indent, "", key, value ? "true" : "false"); } void JsonWriter::writeNull(const char* key) { KASSERT(isObject()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%s\":%s", indent, "", key, "null"); + writeFormat("%*s\"%s\":%s", indent, "", key, "null"); } // can write out json in parallel and combine -void JsonWriter::writeJson(const JsonWriter& json) -{ +void JsonWriter::writeJson(const JsonWriter& json) { KASSERT(_stack.empty()); KASSERT(this != &json); // TODO: indent won't be correct on this // so caller may want to set indent - _out->append(*json._out); + writeFormat("%s", json._out->c_str()); } void JsonWriter::writeString(const char* value) { @@ -233,43 +235,74 @@ void JsonWriter::writeString(const char* value) { // only if in array writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%s\"", indent, "", escapedString(value)); + writeFormat("%*s\"%s\"", indent, "", escapedString(value)); } void JsonWriter::writeDouble(double value) { KASSERT(isArray()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s%f", indent, "", value); + writeFormat("%*s%f", indent, "", value); } void JsonWriter::writeInt32(int32_t value) { KASSERT(isArray()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s\"%d\"", indent, "", value); + writeFormat("%*s\"%d\"", indent, "", value); } void JsonWriter::writeBool(bool value) { KASSERT(isArray()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s%s", indent, "", value ? "true" : "false"); + writeFormat("%*s%s", indent, "", value ? "true" : "false"); } void JsonWriter::writeNull() { KASSERT(isArray()); writeCommaAndNewline(); int indent = _stack.size(); - append_sprintf(*_out, "%*s%s", indent, "", "null"); + writeFormat("%*s%s", indent, "", "null"); } void JsonWriter::writeCommaAndNewline() { bool isFirst = _isFirst.back(); if (!isFirst) - _out->push_back(','); - _out->push_back('\n'); + writeFormat(",\n"); + else + writeFormat("\n"); // vector is special _isFirst[_isFirst.size()-1] = true; } +void JsonWriter::writeFormat(const char* fmt, ...) 
{ + // append to the string, string may grow + va_list args; + va_start(args, fmt); + append_vsprintf(*_out, fmt, args); + va_end(args); + + // when string reach certain length, flush to compressed file and/or buffer + if (_stream && _out->size() >= _stream->compressLimit()) + { + // flush the output to a compression stream + _stream->compress(Slice((uint8_t*)_out->data(), _out->size()), false); // losing const + + // reset the buffer + _out->clear(); + } +} + +JsonWriter::~JsonWriter() +{ + if (_stream) { + if (!_out->empty()) { + _stream->compress(Slice((uint8_t*)_out->data(), _out->size()), true); // losing const + } + } +} + + + + /* void JsonWriter::writeText(const char* text) { diff --git a/libkram/json11/json11.h b/libkram/json11/json11.h index b833c557..e0d7cac7 100644 --- a/libkram/json11/json11.h +++ b/libkram/json11/json11.h @@ -54,9 +54,13 @@ #include "ImmutableString.h" +namespace kram { +class ICompressedStream; +} + namespace json11 { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; using namespace kram; class Json; @@ -64,7 +68,7 @@ class JsonReaderData; //-------------------------- -/* Don't want to maintain this form. Use SAX not DOM for writer. +/* Don't want to maintain this form from json11. Use SAX not DOM for writer. // Write json nodes out to a string. String data is encoded. class JsonWriter final { public: @@ -99,8 +103,13 @@ class JsonWriter final { // And keys go out in the order added. class JsonWriter final { public: + // This writes into a buffer JsonWriter(string* str) : _out(str) {} + // This writes into a small buffer, and then into a compressed stream + JsonWriter(string* str, ICompressedStream* stream) : _out(str), _stream(stream) {} + ~JsonWriter(); + void pushObject(const char* key = ""); void popObject(); @@ -125,6 +134,8 @@ class JsonWriter final { void writeJson(const JsonWriter& json); private: + void writeFormat(const char* fmt, ...) __printflike(2, 3); + bool isArray() const { return _stack.back() == ']'; } bool isObject() const { return _stack.back() == '}'; } @@ -137,6 +148,7 @@ class JsonWriter final { string* _out = nullptr; string _stack; string _escapedString; + ICompressedStream* _stream = nullptr; }; class JsonArrayScope { diff --git a/libkram/kram/BlockedLinearAllocator.cpp b/libkram/kram/BlockedLinearAllocator.cpp index a58df33d..1549235e 100644 --- a/libkram/kram/BlockedLinearAllocator.cpp +++ b/libkram/kram/BlockedLinearAllocator.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
@@ -6,43 +6,48 @@ namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; BlockedLinearAllocator::BlockedLinearAllocator(uint32_t itemsPerBlock, uint32_t itemSize) -: _itemsPerBlock(itemsPerBlock), _itemSize(itemSize), _blockSize(itemsPerBlock*itemSize) + : _itemSize(itemSize), + _itemsPerBlock(itemsPerBlock), + _blockSize(itemsPerBlock * itemSize) { - } -BlockedLinearAllocator::~BlockedLinearAllocator() { +BlockedLinearAllocator::~BlockedLinearAllocator() +{ resetAndFree(); } -void BlockedLinearAllocator::reset() { +void BlockedLinearAllocator::reset() +{ // don't free the block memory, reuse for next parse _blockCurrent = 0; _counter = 0; _blockCounter = 0; } -void BlockedLinearAllocator::resetAndFree() { - for (auto& it: _blocks) { - delete [] it; +void BlockedLinearAllocator::resetAndFree() +{ + for (auto& it : _blocks) { + delete[] it; } _blocks.clear(); reset(); } -bool BlockedLinearAllocator::checkAllocate() { +bool BlockedLinearAllocator::checkAllocate() +{ // allocate more blocks if (_counter >= _blocks.size() * _itemsPerBlock) { uint8_t* newBlock = new uint8_t[_blockSize]; if (!newBlock) return false; - + _blocks.push_back(newBlock); } - + // advance to next block if (_counter && ((_counter % _itemsPerBlock) == 0)) { _blockCurrent++; @@ -51,11 +56,12 @@ bool BlockedLinearAllocator::checkAllocate() { return true; } -void* BlockedLinearAllocator::allocate() { +void* BlockedLinearAllocator::allocate() +{ // make sure space exists if (!checkAllocate()) return nullptr; - + // return a new item off the block auto& block = _blocks[_blockCurrent]; uint32_t start = _blockCounter++; @@ -63,4 +69,4 @@ void* BlockedLinearAllocator::allocate() { return block + start * _itemSize; } -} +} //namespace kram diff --git a/libkram/kram/BlockedLinearAllocator.h b/libkram/kram/BlockedLinearAllocator.h index 6fb350c1..abfc37ec 100644 --- a/libkram/kram/BlockedLinearAllocator.h +++ b/libkram/kram/BlockedLinearAllocator.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -8,59 +8,63 @@ namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // Can use to allocate tree nodes where length is unknown // until the tree is fully parsed. -class BlockedLinearAllocator -{ +class BlockedLinearAllocator { public: BlockedLinearAllocator(uint32_t itemsPerBlock, uint32_t itemSize); ~BlockedLinearAllocator(); - + void* allocate(); // for POD, caller must zero out? - template - T* allocateItem() { return (T*)allocate(); } - + template + T* allocateItem() + { + return (T*)allocate(); + } + // placement new/delete could also be done, and variable // itemSize, but then have to address alignment - + // Convert to/from an index. Call before allocate. 
uint32_t nextItemIndex() const { return _counter; } - + // This retrieves data from an index - void* itemIndexToData(uint32_t itemIndex) const { + void* itemIndexToData(uint32_t itemIndex) const + { uint32_t blockNum = itemIndex / _itemsPerBlock; uint32_t blockIndex = itemIndex % _itemsPerBlock; return _blocks[blockNum] + blockIndex * _itemSize; } - + // can reuse same allocated blocks to avoid fragmentation void reset(); - + // free the allocated blocks void resetAndFree(); - - size_t memoryUse() const { + + size_t memoryUse() const + { return _blocks.size() * _blockSize; } - + private: bool checkAllocate(); - + using Block = uint8_t*; vector _blocks; - + // currently only one item size storeed in Block uint32_t _itemSize = 0; uint32_t _itemsPerBlock = 0; uint32_t _blockSize = 0; - + // where in block, and total item count uint32_t _blockCurrent = 0; uint32_t _blockCounter = 0; // item index into current block uint32_t _counter = 0; }; -} +} //namespace kram diff --git a/libkram/kram/ImmutableString.cpp b/libkram/kram/ImmutableString.cpp index 551a4172..a97f48db 100644 --- a/libkram/kram/ImmutableString.cpp +++ b/libkram/kram/ImmutableString.cpp @@ -1,99 +1,106 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #include "ImmutableString.h" +// for memcpy +#if KRAM_LINUX +#include +#endif + namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; -ImmutableStringPool::ImmutableStringPool(size_t capacity_) { +ImmutableStringPool::ImmutableStringPool(size_t capacity_) +{ capacity = capacity_; - + // empty string is always 0. Only one buffer for now. Does not grow. - ImmutableStringInfo info = { 0, (uint16_t)counter++ }; + ImmutableStringInfo info = {0, (uint16_t)counter++}; mem = new char[capacity]; - + memcpy(mem, &info, sizeof(ImmutableStringInfo)); mem[sizeof(ImmutableStringInfo)] = 0; - + emptyString = mem + sizeof(ImmutableStringInfo); - + keyTable.reserve(1024); - + // keep aligned to 2B for ImmutableStringInfo uint32_t sz = 2; size += sz + sizeof(ImmutableStringInfo); } -ImmutableStringPool::~ImmutableStringPool() { - delete [] mem; +ImmutableStringPool::~ImmutableStringPool() +{ + delete[] mem; mem = nullptr; } -ImmutableString ImmutableStringPool::getImmutableString(const char* s) { +ImmutableString ImmutableStringPool::getImmutableString(const char* s) +{ if (!s || !*s) return emptyString; - + // caller passing in an already immutable string in the block if (isImmutableString(s)) return s; - + // mutex lock from here on down if hitting from multiple threads // this is iterating on map mylock lock(mapLock); - + // find a block with the string auto it = map.find(s); if (it != map.end()) return it->first; - + // Keep unique key count under 64K - const uint32_t kMaxCounter = 64*1024; + const uint32_t kMaxCounter = 64 * 1024; if (counter >= kMaxCounter) { KLOGE("ImmutableString", "Pool cannot fit string"); return emptyString; } // not found, so need to add to an empty block size_t sz = strlen(s) + 1; - - + // see if it will fit the block - if ((size + sz + sizeof(ImmutableStringInfo)) > capacity) { + if ((size + sz + sizeof(ImmutableStringInfo)) > capacity) { KLOGE("ImmutableString", "Pool cannot fit string length %zu", sz); return emptyString; } - + // uint32_t hash = (uint32_t)map.hash_function()( s ); // or just use fnv1a call ? unordered_map does cache this? 
- ImmutableStringInfo info = { (uint16_t)(counter++), (uint16_t)(sz - 1) }; // hashStr }; - + ImmutableStringInfo info = {(uint16_t)(counter++), (uint16_t)(sz - 1)}; // hashStr }; + // 4B header sz += sizeof(ImmutableStringInfo); - + // This finds a string from a 2B lookup uisng the info.counter keyTable.push_back(size + sizeof(ImmutableStringPool)); - + // append it char* immStr = mem + size; - + memcpy(immStr, &info, sizeof(ImmutableStringInfo)); immStr += sizeof(ImmutableStringInfo); memcpy(immStr, s, sz); - + // add it into the map map[immStr] = size; size += sz; - + // keep aligned to 2 bytes size_t align = alignof(ImmutableStringInfo); assert(align == 2); (void)align; - + if (size & 1) ++size; - + return immStr; } -} +} //namespace kram diff --git a/libkram/kram/ImmutableString.h b/libkram/kram/ImmutableString.h index fbc6af66..0bd44a42 100644 --- a/libkram/kram/ImmutableString.h +++ b/libkram/kram/ImmutableString.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -13,11 +13,12 @@ namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // case-sensitive fnv1a hash, can pass existing hash to continue a hash -inline uint32_t HashFnv1a(const char* val, uint32_t hash = 0x811c9dc5) { - const uint32_t prime = 0x01000193; // 16777619 (32-bit) +inline uint32_t HashFnv1a(const char* val, uint32_t hash = 0x811c9dc5) +{ + const uint32_t prime = 0x01000193; // 16777619 (32-bit) while (*val) { hash = (hash * prime) ^ (uint32_t)*val++; } @@ -25,16 +26,18 @@ inline uint32_t HashFnv1a(const char* val, uint32_t hash = 0x811c9dc5) { } // this compares string stored as const char* -struct CompareStrings -{ +struct CompareStrings { // Would count and/or hash help? // otherwise, this has to walk memory, hash already found bucket template bool operator()(const T& x, const T& y) const - { return strcmp(x, y) == 0; } - + { + return strcmp(x, y) == 0; + } + template - size_t operator()(const T& x) const { + size_t operator()(const T& x) const + { // 32-bit hash to uint64 conversion on 64-bit system return (size_t)HashFnv1a(x); } @@ -51,69 +54,74 @@ struct ImmutableStringInfo { using ImmutableString = const char*; // Store and retrieve immutable strings. The address of these never changes. 
-class ImmutableStringPool -{ +class ImmutableStringPool { public: - ImmutableStringPool(size_t capacity_ = 32*1024); + ImmutableStringPool(size_t capacity_ = 32 * 1024); ~ImmutableStringPool(); - + ImmutableString getImmutableString(const char* s); - string_view getImmutableStringView(const char* s) { + string_view getImmutableStringView(const char* s) + { ImmutableString str = getImmutableString(s); return string_view(getImmutableString(s), getLength(str)); } - + // Compress 8B to 2B using counter - uint16_t getCounter(ImmutableString str) const { - const ImmutableStringInfo* info = ((const ImmutableStringInfo*)(str-sizeof(ImmutableStringInfo))); + uint16_t getCounter(ImmutableString str) const + { + const ImmutableStringInfo* info = ((const ImmutableStringInfo*)(str - sizeof(ImmutableStringInfo))); return info->counter; } // cached strlen of string - uint16_t getLength(ImmutableString str) const { - const ImmutableStringInfo* info = ((const ImmutableStringInfo*)(str-sizeof(ImmutableStringInfo))); + uint16_t getLength(ImmutableString str) const + { + const ImmutableStringInfo* info = ((const ImmutableStringInfo*)(str - sizeof(ImmutableStringInfo))); return info->length; } - + // Can lookup string from counter - ImmutableString getImmutableString(uint16_t counter_) const { + ImmutableString getImmutableString(uint16_t counter_) const + { mylock lock(mapLock); return mem + keyTable[counter_]; } - string_view getImmutableStringView(uint16_t counter_) const { + string_view getImmutableStringView(uint16_t counter_) const + { mylock lock(mapLock); ImmutableString str = mem + keyTable[counter_]; return string_view(str, getLength(str)); } - + // Can call outside of mutex if mem never grows. - bool isImmutableString(const char* s) const { + bool isImmutableString(const char* s) const + { return s >= mem && s < mem + capacity; } - + private: using mymutex = std::mutex; using mylock = std::unique_lock; // or lock_guard? - + mutable mymutex mapLock; - + // Remap strings to immutable strings. // Could be unordered_set. using ImmutableMap = unordered_map; ImmutableMap map; - + // Can convert keys to 2B using lookup table. Can grow. vector keyTable; - + // Only has one block of memory right now. // This block cannot grow or addresses are all invalidated. char* mem = nullptr; uint32_t size = 0; uint32_t capacity = 0; - + // A count of how many strings are stored uint32_t counter = 0; - + ImmutableString emptyString = nullptr; }; -} +} //namespace kram diff --git a/libkram/kram/KTXImage.cpp b/libkram/kram/KTXImage.cpp index 363e9b04..be48245e 100644 --- a/libkram/kram/KTXImage.cpp +++ b/libkram/kram/KTXImage.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -16,6 +16,14 @@ // for zstd decompress #include "zstd.h" +#ifndef USE_LIBCOMPRESSION +#define USE_LIBCOMPRESSION 0 // KRAM_APPLE +#endif + +#if USE_LIBCOMPRESSION +#include +#endif + namespace kram { // These are props added into the KTX file props data. 
@@ -30,7 +38,7 @@ const char* kPropChannels = "KramChannels"; const char* kPropAddress = "KramAddress"; const char* kPropFilter = "KramFilter"; -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // These start each KTX file to indicate the type const uint8_t kKTXIdentifier[kKTXIdentifierSize] = { @@ -50,110 +58,109 @@ const uint32_t DXGI_FORMAT_ETC2_OFFSET = 50; // enum based on dxgiformat.h, but added astc ldr/hdr from obscure docs // and added etc2/eac by just making up constants since those aren't public // Copyright (C) Microsoft. All rights reserved. -enum MyDXGIFormat : uint32_t -{ - DXGI_FORMAT_UNKNOWN = 0, - +enum MyDXGIFormat : uint32_t { + DXGI_FORMAT_UNKNOWN = 0, + //DXGI_FORMAT_R32G32B32A32_TYPELESS = 1, - DXGI_FORMAT_R32G32B32A32_FLOAT = 2, - DXGI_FORMAT_R32G32B32A32_UINT = 3, - DXGI_FORMAT_R32G32B32A32_SINT = 4, + DXGI_FORMAT_R32G32B32A32_FLOAT = 2, + DXGI_FORMAT_R32G32B32A32_UINT = 3, + DXGI_FORMAT_R32G32B32A32_SINT = 4, //DXGI_FORMAT_R32G32B32_TYPELESS = 5, - DXGI_FORMAT_R32G32B32_FLOAT = 6, - DXGI_FORMAT_R32G32B32_UINT = 7, - DXGI_FORMAT_R32G32B32_SINT = 8, - + DXGI_FORMAT_R32G32B32_FLOAT = 6, + DXGI_FORMAT_R32G32B32_UINT = 7, + DXGI_FORMAT_R32G32B32_SINT = 8, + //DXGI_FORMAT_R16G16B16A16_TYPELESS = 9, - DXGI_FORMAT_R16G16B16A16_FLOAT = 10, - DXGI_FORMAT_R16G16B16A16_UNORM = 11, - DXGI_FORMAT_R16G16B16A16_UINT = 12, - DXGI_FORMAT_R16G16B16A16_SNORM = 13, - DXGI_FORMAT_R16G16B16A16_SINT = 14, - + DXGI_FORMAT_R16G16B16A16_FLOAT = 10, + DXGI_FORMAT_R16G16B16A16_UNORM = 11, + DXGI_FORMAT_R16G16B16A16_UINT = 12, + DXGI_FORMAT_R16G16B16A16_SNORM = 13, + DXGI_FORMAT_R16G16B16A16_SINT = 14, + //DXGI_FORMAT_R32G32_TYPELESS = 15, - DXGI_FORMAT_R32G32_FLOAT = 16, - DXGI_FORMAT_R32G32_UINT = 17, - DXGI_FORMAT_R32G32_SINT = 18, + DXGI_FORMAT_R32G32_FLOAT = 16, + DXGI_FORMAT_R32G32_UINT = 17, + DXGI_FORMAT_R32G32_SINT = 18, //DXGI_FORMAT_R32G8X24_TYPELESS = 19, //DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20, //DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21, //DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22, - + //DXGI_FORMAT_R10G10B10A2_TYPELESS = 23, //DXGI_FORMAT_R10G10B10A2_UNORM = 24, //DXGI_FORMAT_R10G10B10A2_UINT = 25, - //DXGI_FORMAT_R11G11B10_FLOAT = 26, - + DXGI_FORMAT_R11G11B10_FLOAT = 26, + //DXGI_FORMAT_R8G8B8A8_TYPELESS = 27, - DXGI_FORMAT_R8G8B8A8_UNORM = 28, - DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29, - DXGI_FORMAT_R8G8B8A8_UINT = 30, - DXGI_FORMAT_R8G8B8A8_SNORM = 31, - DXGI_FORMAT_R8G8B8A8_SINT = 32, - + DXGI_FORMAT_R8G8B8A8_UNORM = 28, + DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29, + DXGI_FORMAT_R8G8B8A8_UINT = 30, + DXGI_FORMAT_R8G8B8A8_SNORM = 31, + DXGI_FORMAT_R8G8B8A8_SINT = 32, + //DXGI_FORMAT_R16G16_TYPELESS = 33, - DXGI_FORMAT_R16G16_FLOAT = 34, - DXGI_FORMAT_R16G16_UNORM = 35, - DXGI_FORMAT_R16G16_UINT = 36, - DXGI_FORMAT_R16G16_SNORM = 37, - DXGI_FORMAT_R16G16_SINT = 38, - + DXGI_FORMAT_R16G16_FLOAT = 34, + DXGI_FORMAT_R16G16_UNORM = 35, + DXGI_FORMAT_R16G16_UINT = 36, + DXGI_FORMAT_R16G16_SNORM = 37, + DXGI_FORMAT_R16G16_SINT = 38, + //DXGI_FORMAT_R32_TYPELESS = 39, //DXGI_FORMAT_D32_FLOAT = 40, - DXGI_FORMAT_R32_FLOAT = 41, - DXGI_FORMAT_R32_UINT = 42, - DXGI_FORMAT_R32_SINT = 43, - + DXGI_FORMAT_R32_FLOAT = 41, + DXGI_FORMAT_R32_UINT = 42, + DXGI_FORMAT_R32_SINT = 43, + //DXGI_FORMAT_R24G8_TYPELESS = 44, //DXGI_FORMAT_D24_UNORM_S8_UINT = 45, //DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46, //DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47, //DXGI_FORMAT_R8G8_TYPELESS = 48, - - DXGI_FORMAT_R8G8_UNORM = 49, - DXGI_FORMAT_R8G8_UINT = 50, - DXGI_FORMAT_R8G8_SNORM = 51, - DXGI_FORMAT_R8G8_SINT = 
52, - - DXGI_FORMAT_R16_FLOAT = 54, + + DXGI_FORMAT_R8G8_UNORM = 49, + DXGI_FORMAT_R8G8_UINT = 50, + DXGI_FORMAT_R8G8_SNORM = 51, + DXGI_FORMAT_R8G8_SINT = 52, + + DXGI_FORMAT_R16_FLOAT = 54, //DXGI_FORMAT_D16_UNORM = 55, - DXGI_FORMAT_R16_UNORM = 56, - DXGI_FORMAT_R16_UINT = 57, - DXGI_FORMAT_R16_SNORM = 58, - DXGI_FORMAT_R16_SINT = 59, - - DXGI_FORMAT_R8_UNORM = 61, - DXGI_FORMAT_R8_UINT = 62, - DXGI_FORMAT_R8_SNORM = 63, - DXGI_FORMAT_R8_SINT = 64, - + DXGI_FORMAT_R16_UNORM = 56, + DXGI_FORMAT_R16_UINT = 57, + DXGI_FORMAT_R16_SNORM = 58, + DXGI_FORMAT_R16_SINT = 59, + + DXGI_FORMAT_R8_UNORM = 61, + DXGI_FORMAT_R8_UINT = 62, + DXGI_FORMAT_R8_SNORM = 63, + DXGI_FORMAT_R8_SINT = 64, + //DXGI_FORMAT_A8_UNORM = 65, //DXGI_FORMAT_R1_UNORM = 66, - //DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67, - + DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67, + //DXGI_FORMAT_R8G8_B8G8_UNORM = 68, //DXGI_FORMAT_G8R8_G8B8_UNORM = 69, - - DXGI_FORMAT_BC1_UNORM = 71, - DXGI_FORMAT_BC1_UNORM_SRGB = 72, - DXGI_FORMAT_BC3_UNORM = 77, - DXGI_FORMAT_BC3_UNORM_SRGB = 78, - DXGI_FORMAT_BC4_UNORM = 80, - DXGI_FORMAT_BC4_SNORM = 81, - DXGI_FORMAT_BC5_UNORM = 83, - DXGI_FORMAT_BC5_SNORM = 84, - DXGI_FORMAT_BC6H_UF16 = 95, - DXGI_FORMAT_BC6H_SF16 = 96, - DXGI_FORMAT_BC7_UNORM = 98, - DXGI_FORMAT_BC7_UNORM_SRGB = 99, - - DXGI_FORMAT_B8G8R8A8_UNORM = 87, - DXGI_FORMAT_B8G8R8X8_UNORM = 88, + + DXGI_FORMAT_BC1_UNORM = 71, + DXGI_FORMAT_BC1_UNORM_SRGB = 72, + DXGI_FORMAT_BC3_UNORM = 77, + DXGI_FORMAT_BC3_UNORM_SRGB = 78, + DXGI_FORMAT_BC4_UNORM = 80, + DXGI_FORMAT_BC4_SNORM = 81, + DXGI_FORMAT_BC5_UNORM = 83, + DXGI_FORMAT_BC5_SNORM = 84, + DXGI_FORMAT_BC6H_UF16 = 95, + DXGI_FORMAT_BC6H_SF16 = 96, + DXGI_FORMAT_BC7_UNORM = 98, + DXGI_FORMAT_BC7_UNORM_SRGB = 99, + + DXGI_FORMAT_B8G8R8A8_UNORM = 87, + DXGI_FORMAT_B8G8R8X8_UNORM = 88, //DXGI_FORMAT_B8G8R8A8_TYPELESS = 90, - DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91, + DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91, //DXGI_FORMAT_B8G8R8X8_TYPELESS = 92, - DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93, - + DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93, + // Astc constants are taken from here. // HDR constant weren't too hard to guess from gap, but are a guess. // Not officially in DX now that Windows Mobile was killed off. @@ -213,7 +220,7 @@ enum MyDXGIFormat : uint32_t DXGI_FORMAT_ASTC_12X12_UNORM = 186, DXGI_FORMAT_ASTC_12X12_UNORM_SRGB = 187, DXGI_FORMAT_ASTC_12X12_HDR = 188, - + // These are fabricated by kram. See here for RFI on formats // and extensibility on DDS format. Use at own risk. // Set to DXGI_FORMAT_UNKNOWN if don't like this hack. 
@@ -338,6 +345,9 @@ enum GLFormat : uint32_t { GL_RG32F = 0x8230, GL_RGBA32F = 0x8814, + GL_R11F_G11F_B10F = 0x8C3A, + GL_RGB9_E5 = 0x8C3D, + #if SUPPORT_RGB GL_RGB8 = 0x8051, GL_SRGB8 = 0x8C41, @@ -385,7 +395,7 @@ enum GLFormatBase { GL_RG = 0x8227, GL_RGB = 0x1907, GL_RGBA = 0x1908, - GL_SRGB = 0x8C40, // only for BC1 + GL_SRGB = 0x8C40, // only for BC1 GL_SRGB_ALPHA = 0x8C42, }; @@ -411,7 +421,7 @@ enum MyVKFormat { // distinguish HDR from LDR formats // Provided by VK_EXT_texture_compression_astc_hdr - VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT = 1000066000, // large decimal + VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT = 1000066000, // large decimal VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK_EXT = 1000066001, VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK_EXT = 1000066002, VK_FORMAT_ASTC_6x5_SFLOAT_BLOCK_EXT = 1000066003, @@ -463,29 +473,32 @@ enum MyVKFormat { VK_FORMAT_ASTC_8x8_UNORM_BLOCK = 171, VK_FORMAT_ASTC_8x8_SRGB_BLOCK = 172, -// not support these -// VK_FORMAT_ASTC_5x4_UNORM_BLOCK = 159, -// VK_FORMAT_ASTC_5x4_SRGB_BLOCK = 160, -// VK_FORMAT_ASTC_6x5_UNORM_BLOCK = 163, -// VK_FORMAT_ASTC_6x5_SRGB_BLOCK = 164, -// VK_FORMAT_ASTC_8x5_UNORM_BLOCK = 167, -// VK_FORMAT_ASTC_8x5_SRGB_BLOCK = 168, -// VK_FORMAT_ASTC_8x6_UNORM_BLOCK = 169, -// VK_FORMAT_ASTC_8x6_SRGB_BLOCK = 170, - -// VK_FORMAT_ASTC_10x5_UNORM_BLOCK = 173, -// VK_FORMAT_ASTC_10x5_SRGB_BLOCK = 174, -// VK_FORMAT_ASTC_10x6_UNORM_BLOCK = 175, -// VK_FORMAT_ASTC_10x6_SRGB_BLOCK = 176, -// VK_FORMAT_ASTC_10x8_UNORM_BLOCK = 177, -// VK_FORMAT_ASTC_10x8_SRGB_BLOCK = 178, -// VK_FORMAT_ASTC_10x10_UNORM_BLOCK = 179, -// VK_FORMAT_ASTC_10x10_SRGB_BLOCK = 180, -// VK_FORMAT_ASTC_12x10_UNORM_BLOCK = 181, -// VK_FORMAT_ASTC_12x10_SRGB_BLOCK = 182, -// VK_FORMAT_ASTC_12x12_UNORM_BLOCK = 183, -// VK_FORMAT_ASTC_12x12_SRGB_BLOCK = 184, - +// not supporting these +// VK_FORMAT_ASTC_5x4_UNORM_BLOCK = 159, +// VK_FORMAT_ASTC_5x4_SRGB_BLOCK = 160, +// VK_FORMAT_ASTC_6x5_UNORM_BLOCK = 163, +// VK_FORMAT_ASTC_6x5_SRGB_BLOCK = 164, +// VK_FORMAT_ASTC_8x5_UNORM_BLOCK = 167, +// VK_FORMAT_ASTC_8x5_SRGB_BLOCK = 168, +// VK_FORMAT_ASTC_8x6_UNORM_BLOCK = 169, +// VK_FORMAT_ASTC_8x6_SRGB_BLOCK = 170, + +// VK_FORMAT_ASTC_10x5_UNORM_BLOCK = 173, +// VK_FORMAT_ASTC_10x5_SRGB_BLOCK = 174, +// VK_FORMAT_ASTC_10x6_UNORM_BLOCK = 175, +// VK_FORMAT_ASTC_10x6_SRGB_BLOCK = 176, +// VK_FORMAT_ASTC_10x8_UNORM_BLOCK = 177, +// VK_FORMAT_ASTC_10x8_SRGB_BLOCK = 178, +// VK_FORMAT_ASTC_10x10_UNORM_BLOCK = 179, +// VK_FORMAT_ASTC_10x10_SRGB_BLOCK = 180, +// VK_FORMAT_ASTC_12x10_UNORM_BLOCK = 181, +// VK_FORMAT_ASTC_12x10_SRGB_BLOCK = 182, +// VK_FORMAT_ASTC_12x12_UNORM_BLOCK = 183, +// VK_FORMAT_ASTC_12x12_SRGB_BLOCK = 184, + + VK_FORMAT_B10G11R11_UFLOAT_PACK32 = 122, + VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 = 123, + #if SUPPORT_RGB // import only VK_FORMAT_R8G8B8_UNORM = 23, @@ -545,9 +558,9 @@ class KTXFormatInfo { constexpr uint32_t glNameStart = sizeof("GL_") - 1; constexpr uint32_t dxNameStart = sizeof("DXGI_FORMAT_") - 1; constexpr uint32_t metalNameStart = sizeof("MyMTLPixelFormat") - 1; - + formatName = formatName_; - + // skip the redunant part metalName = metalName_ + metalNameStart; vulkanName = vulkanName_ + vulkanNameStart; @@ -630,27 +643,27 @@ static bool initFormatsIfNeeded() if (gFormatTable) { return true; } - + mylock lock(gFormatTableMutex); - + if (gFormatTable) { return true; } - + mymap* formatTable = new unordered_map(); // the following table could be included multiple ties to build switch statements, but instead use a hashmap #define KTX_FORMAT(fmt, metalType, vulkanType, 
directxType, glType, glBase, x, y, blockSize, numChannels, flags) \ - (*formatTable)[(uint32_t)metalType] = KTXFormatInfo( \ - #fmt, #metalType, #vulkanType, #directxType, #glType, \ - metalType, vulkanType, directxType, glType, glBase, \ + (*formatTable)[(uint32_t)metalType] = KTXFormatInfo( \ + #fmt, #metalType, #vulkanType, #directxType, #glType, \ + metalType, vulkanType, directxType, glType, glBase, \ x, y, blockSize, numChannels, (flags)); KTX_FORMAT(Invalid, MyMTLPixelFormatInvalid, VK_FORMAT_UNDEFINED, DXGI_FORMAT_UNKNOWN, GL_UNKNOWN, GL_RGBA, 1, 1, 0, 0, 0) // BC KTX_FORMAT(BC1, MyMTLPixelFormatBC1_RGBA, VK_FORMAT_BC1_RGB_UNORM_BLOCK, DXGI_FORMAT_BC1_UNORM, GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, 4, 4, 8, 3, FLAG_ENC_BC) - KTX_FORMAT(BC1s, MyMTLPixelFormatBC1_RGBA_sRGB, VK_FORMAT_BC1_RGB_SRGB_BLOCK, DXGI_FORMAT_BC1_UNORM_SRGB,GL_COMPRESSED_SRGB_S3TC_DXT1_EXT, GL_SRGB, 4, 4, 8, 3, FLAG_ENC_BC | FLAG_SRGB) + KTX_FORMAT(BC1s, MyMTLPixelFormatBC1_RGBA_sRGB, VK_FORMAT_BC1_RGB_SRGB_BLOCK, DXGI_FORMAT_BC1_UNORM_SRGB, GL_COMPRESSED_SRGB_S3TC_DXT1_EXT, GL_SRGB, 4, 4, 8, 3, FLAG_ENC_BC | FLAG_SRGB) KTX_FORMAT(BC3, MyMTLPixelFormatBC3_RGBA, VK_FORMAT_BC3_UNORM_BLOCK, DXGI_FORMAT_BC3_UNORM, GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, 4, 4, 16, 4, FLAG_ENC_BC) KTX_FORMAT(BC3s, MyMTLPixelFormatBC3_RGBA_sRGB, VK_FORMAT_BC3_SRGB_BLOCK, DXGI_FORMAT_BC3_UNORM_SRGB, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT, GL_SRGB_ALPHA, 4, 4, 16, 4, FLAG_ENC_BC | FLAG_SRGB) @@ -683,23 +696,23 @@ static bool initFormatsIfNeeded() // ASTC KTX_FORMAT(ASTC4x4, MyMTLPixelFormatASTC_4x4_LDR, VK_FORMAT_ASTC_4x4_UNORM_BLOCK, DXGI_FORMAT_ASTC_4X4_UNORM, GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_RGBA, 4, 4, 16, 4, FLAG_ENC_ASTC) KTX_FORMAT(ASTC4x4s, MyMTLPixelFormatASTC_4x4_sRGB, VK_FORMAT_ASTC_4x4_SRGB_BLOCK, DXGI_FORMAT_ASTC_4X4_UNORM_SRGB, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR, GL_SRGB_ALPHA, 4, 4, 16, 4, FLAG_ENC_ASTC | FLAG_SRGB) - KTX_FORMAT(ASTC4x4h, MyMTLPixelFormatASTC_4x4_HDR, VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT, DXGI_FORMAT_UNKNOWN, GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_RGBA, 4, 4, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR + KTX_FORMAT(ASTC4x4h, MyMTLPixelFormatASTC_4x4_HDR, VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT, DXGI_FORMAT_UNKNOWN, GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_RGBA, 4, 4, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR KTX_FORMAT(ASTC5x5, MyMTLPixelFormatASTC_5x5_LDR, VK_FORMAT_ASTC_5x5_UNORM_BLOCK, DXGI_FORMAT_ASTC_5X5_UNORM, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_RGBA, 5, 5, 16, 4, FLAG_ENC_ASTC) KTX_FORMAT(ASTC5x5s, MyMTLPixelFormatASTC_5x5_sRGB, VK_FORMAT_ASTC_5x5_SRGB_BLOCK, DXGI_FORMAT_ASTC_5X5_UNORM_SRGB, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR, GL_SRGB_ALPHA, 5, 5, 16, 4, FLAG_ENC_ASTC | FLAG_SRGB) - KTX_FORMAT(ASTC5x5h, MyMTLPixelFormatASTC_5x5_HDR, VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK_EXT, DXGI_FORMAT_ASTC_5X5_HDR, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_RGBA, 5, 5, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR + KTX_FORMAT(ASTC5x5h, MyMTLPixelFormatASTC_5x5_HDR, VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK_EXT, DXGI_FORMAT_ASTC_5X5_HDR, GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_RGBA, 5, 5, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR KTX_FORMAT(ASTC6x6, MyMTLPixelFormatASTC_6x6_LDR, VK_FORMAT_ASTC_6x6_UNORM_BLOCK, DXGI_FORMAT_ASTC_6X6_UNORM, GL_COMPRESSED_RGBA_ASTC_6x6_KHR, GL_RGBA, 6, 6, 16, 4, FLAG_ENC_ASTC) KTX_FORMAT(ASTC6x6s, MyMTLPixelFormatASTC_6x6_sRGB, VK_FORMAT_ASTC_6x6_SRGB_BLOCK, DXGI_FORMAT_ASTC_6X6_UNORM_SRGB, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR, GL_SRGB_ALPHA, 6, 6, 16, 4, 
FLAG_ENC_ASTC | FLAG_SRGB) - KTX_FORMAT(ASTC6x6h, MyMTLPixelFormatASTC_6x6_HDR, VK_FORMAT_ASTC_6x6_SFLOAT_BLOCK_EXT, DXGI_FORMAT_ASTC_6X6_HDR, GL_COMPRESSED_RGBA_ASTC_6x6_KHR, GL_RGBA, 6, 6, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR + KTX_FORMAT(ASTC6x6h, MyMTLPixelFormatASTC_6x6_HDR, VK_FORMAT_ASTC_6x6_SFLOAT_BLOCK_EXT, DXGI_FORMAT_ASTC_6X6_HDR, GL_COMPRESSED_RGBA_ASTC_6x6_KHR, GL_RGBA, 6, 6, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR KTX_FORMAT(ASTC8x8, MyMTLPixelFormatASTC_8x8_LDR, VK_FORMAT_ASTC_8x8_UNORM_BLOCK, DXGI_FORMAT_ASTC_8X8_UNORM, GL_COMPRESSED_RGBA_ASTC_8x8_KHR, GL_RGBA, 8, 8, 16, 4, FLAG_ENC_ASTC) KTX_FORMAT(ASTC8x8s, MyMTLPixelFormatASTC_8x8_sRGB, VK_FORMAT_ASTC_8x8_SRGB_BLOCK, DXGI_FORMAT_ASTC_8X8_UNORM_SRGB, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR, GL_SRGB_ALPHA, 8, 8, 16, 4, FLAG_ENC_ASTC | FLAG_SRGB) - KTX_FORMAT(ASTC8x8h, MyMTLPixelFormatASTC_8x8_HDR, VK_FORMAT_ASTC_8x8_SFLOAT_BLOCK_EXT, DXGI_FORMAT_ASTC_8X8_HDR, GL_COMPRESSED_RGBA_ASTC_8x8_KHR, GL_RGBA, 8, 8, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR + KTX_FORMAT(ASTC8x8h, MyMTLPixelFormatASTC_8x8_HDR, VK_FORMAT_ASTC_8x8_SFLOAT_BLOCK_EXT, DXGI_FORMAT_ASTC_8X8_HDR, GL_COMPRESSED_RGBA_ASTC_8x8_KHR, GL_RGBA, 8, 8, 16, 4, FLAG_ENC_ASTC | FLAG_16F) // gl type same as LDR // Explicit KTX_FORMAT(EXPr8, MyMTLPixelFormatR8Unorm, VK_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, GL_R8, GL_RED, 1, 1, 1, 1, 0) - KTX_FORMAT(EXPrg8, MyMTLPixelFormatRG8Unorm, VK_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM,GL_RG8, GL_RG, 1, 1, 2, 2, 0) + KTX_FORMAT(EXPrg8, MyMTLPixelFormatRG8Unorm, VK_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, GL_RG8, GL_RG, 1, 1, 2, 2, 0) KTX_FORMAT(EXPrgba8, MyMTLPixelFormatRGBA8Unorm, VK_FORMAT_R8G8B8A8_UNORM, DXGI_FORMAT_R8G8B8A8_UNORM, GL_RGBA8, GL_RGBA, 1, 1, 4, 4, 0) KTX_FORMAT(EXPsrgba8, MyMTLPixelFormatRGBA8Unorm_sRGB, VK_FORMAT_R8G8B8A8_SRGB, DXGI_FORMAT_R8G8B8A8_UNORM_SRGB, GL_SRGB8_ALPHA8, GL_SRGB_ALPHA, 1, 1, 4, 4, FLAG_SRGB) @@ -711,8 +724,16 @@ static bool initFormatsIfNeeded() KTX_FORMAT(EXPrg32f, MyMTLPixelFormatRG32Float, VK_FORMAT_R32G32_SFLOAT, DXGI_FORMAT_R32G32_FLOAT, GL_RG32F, GL_RG, 1, 1, 8, 2, FLAG_32F) KTX_FORMAT(EXPrgba32f, MyMTLPixelFormatRGBA32Float, VK_FORMAT_R32G32B32A32_SFLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT, GL_RGBA32F, GL_RGBA, 1, 1, 16, 4, FLAG_32F) + // import only (can convert dds -> ktx/ktx2) + KTX_FORMAT(EXPrg11b10f, MyMTLPixelFormatRG11B10Float, VK_FORMAT_B10G11R11_UFLOAT_PACK32, DXGI_FORMAT_R11G11B10_FLOAT, GL_R11F_G11F_B10F, GL_RGB, 1, 1, 4, 3, FLAG_16F) + // GL_UNSIGNED_INT_10F_11F_11F_REV + + // import only (can convert dds -> ktx/ktx2) + KTX_FORMAT(EXPrgb9f, MyMTLPixelFormatRGB9E5Float, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, DXGI_FORMAT_R9G9B9E5_SHAREDEXP, GL_RGB9_E5, GL_RGB, 1, 1, 4, 3, FLAG_16F) + // GL_UNSIGNED_INT_5_9_9_9_REV + #if SUPPORT_RGB - // these are import only formats + // import only formats (can convert dds -> ktx/ktx2) // DX only has one of these as a valid type KTX_FORMAT(EXPrgb8, MyMTLPixelFormatRGB8Unorm_internal, VK_FORMAT_R8G8B8_UNORM, DXGI_FORMAT_UNKNOWN, GL_RGB8, GL_RGB, 1, 1, 3, 3, 0) KTX_FORMAT(EXPsrgb8, MyMTLPixelFormatRGB8Unorm_sRGB_internal, VK_FORMAT_R8G8B8_SRGB, DXGI_FORMAT_UNKNOWN, GL_SRGB8, GL_SRGB, 1, 1, 3, 3, FLAG_SRGB) @@ -721,7 +742,7 @@ static bool initFormatsIfNeeded() #endif gFormatTable = formatTable; - + return true; } @@ -859,7 +880,6 @@ uint32_t directxType(MyMTLPixelFormat format) return it.directxType; } - const char* vulkanTypeName(MyMTLPixelFormat format) { const auto& it = formatInfo(format); @@ 
-889,7 +909,7 @@ MyMTLPixelFormat glToMetalFormat(uint32_t format) if (format == 0) { return MyMTLPixelFormatInvalid; } - + initFormatsIfNeeded(); for (const auto& it : *gFormatTable) { @@ -908,7 +928,7 @@ MyMTLPixelFormat vulkanToMetalFormat(uint32_t format) if (format == 0) { return MyMTLPixelFormatInvalid; } - + initFormatsIfNeeded(); for (const auto& it : *gFormatTable) { @@ -927,7 +947,7 @@ MyMTLPixelFormat directxToMetalFormat(uint32_t format) if (format == 0) { return MyMTLPixelFormatInvalid; } - + initFormatsIfNeeded(); for (const auto& it : *gFormatTable) { @@ -1177,22 +1197,24 @@ MyMTLTextureType KTXHeader::metalTextureType() const //--------------------------------------------------- -inline bool isKTX2File(const uint8_t* data, size_t dataSize) { +inline bool isKTX2File(const uint8_t* data, size_t dataSize) +{ if (dataSize < sizeof(kKTX2Identifier)) { return false; } - + if (memcmp(data, kKTX2Identifier, sizeof(kKTX2Identifier)) != 0) { return false; } return true; } -inline bool isKTX1File(const uint8_t* data, size_t dataSize) { +inline bool isKTX1File(const uint8_t* data, size_t dataSize) +{ if (dataSize < sizeof(kKTXIdentifier)) { return false; } - + if (memcmp(data, kKTXIdentifier, sizeof(kKTXIdentifier)) != 0) { return false; } @@ -1279,11 +1301,10 @@ void KTXImage::initProps(const uint8_t* propsData, size_t propDataSize) } //LOGD("KTXImage", "KTXProp '%s': %s\n", keyStart, valueStart); - auto propPair = NAMESPACE_STL::make_pair( + auto propPair = STL_NAMESPACE::make_pair( string((const char*)keyStart), - string((const char*)valueStart) - ); - + string((const char*)valueStart)); + props.emplace_back(propPair); // pad to 4 byte alignment @@ -1301,10 +1322,9 @@ void KTXImage::addProp(const char* name, const char* value) return; } } - auto propPair = NAMESPACE_STL::make_pair( + auto propPair = STL_NAMESPACE::make_pair( string(name), - string(value) - ); + string(value)); props.emplace_back(propPair); } @@ -1432,7 +1452,7 @@ void KTXImage::initMipLevels(bool doMipmaps, int32_t mipMinSize, int32_t mipMaxS bool needsDownsample = (numSkippedMips > 0) || (w > mipMaxSize || h > mipMaxSize); - int32_t maxMipLevels = 16; // 64K x 64K + int32_t maxMipLevels = 16; // 64K x 64K // can still downsample src multiple times even with only 1 level exported if ((!doMipmaps) && needsDownsample) { @@ -1813,8 +1833,8 @@ bool KTXImage::openKTX2(const uint8_t* imageData, size_t imageDataLength, bool i // uncompressed level auto& level1 = mipLevels[i]; - level1.lengthCompressed = level2.lengthCompressed; // need this for copyLevel to have enough data - uint8_t* dstData = (uint8_t*)fileData + level1.offset; // can const_cast, since class owns data + level1.lengthCompressed = level2.lengthCompressed; // need this for copyLevel to have enough data + uint8_t* dstData = (uint8_t*)fileData + level1.offset; // can const_cast, since class owns data if (!unpackLevel(i, srcData, dstData)) { return false; @@ -1869,12 +1889,23 @@ bool KTXImage::unpackLevel(uint32_t mipNumber, const uint8_t* srcData, uint8_t* case KTX2SupercompressionZlib: { // can use miniz or libCompression +#if USE_LIBCOMPRESSION + // TODO: see if this is faster + char scratchBuffer[compression_decode_scratch_buffer_size(COMPRESSION_ZLIB)]; + + size_t dstDataSizeMiniz = compression_decode_buffer( + (uint8_t*)dstData, dstDataSize, + (const uint8_t*)srcData, srcDataSize, + scratchBuffer, + COMPRESSION_ZLIB); +#else mz_ulong dstDataSizeMiniz = 0; if (mz_uncompress(dstData, &dstDataSizeMiniz, srcData, srcDataSize) != MZ_OK) { KLOGE("kram", 
"decode mip zlib failed"); return false; } +#endif if (dstDataSizeMiniz != dstDataSize) { KLOGE("kram", "decode mip zlib size not expected"); return false; @@ -1930,4 +1961,4 @@ void KTXImage::reserveImageData(size_t totalSize) fileData = _imageData.data(); } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KTXImage.h b/libkram/kram/KTXImage.h index d5e4680b..1b0b77a7 100644 --- a/libkram/kram/KTXImage.h +++ b/libkram/kram/KTXImage.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -11,7 +11,7 @@ namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // TODO: abstract MyMTLPixelFormat and move to readable/neutral type enum MyMTLPixelFormat { @@ -72,8 +72,8 @@ enum MyMTLPixelFormat { MyMTLPixelFormatEAC_RGBA8_sRGB = 179, // not supporting - // MyMTLPixelFormatETC2_RGB8A1 = 182, - // MyMTLPixelFormatETC2_RGB8A1_sRGB = 183, + // MyMTLPixelFormatETC2_RGB8A1 = 182, + // MyMTLPixelFormatETC2_RGB8A1_sRGB = 183, // ------ // Explicit formats @@ -90,6 +90,10 @@ enum MyMTLPixelFormat { // MyMTLPixelFormatRG8Snorm = 32, // MyMTLPixelFormatRGBA8Snorm = 72, + // TODO: also 10a2Unorm + MyMTLPixelFormatRG11B10Float = 92, + MyMTLPixelFormatRGB9E5Float = 93, + // TODO: also BGRA8Unorm types? MyMTLPixelFormatR16Float = 25, @@ -100,9 +104,6 @@ enum MyMTLPixelFormat { MyMTLPixelFormatRG32Float = 105, MyMTLPixelFormatRGBA32Float = 125, -// TODO: also need rgb9e5 for fallback if ASTC HDR/6H not supported -// That is Unity's fallback if alpha not needed, otherwise RGBA16F. - #if SUPPORT_RGB // Can import files from KTX/KTX2 with RGB data, but convert right away to RGBA. // These are not export formats. Watch alignment on these too. 
These @@ -116,7 +117,7 @@ enum MyMTLPixelFormat { enum MyMTLTextureType { // MyMTLTextureType1D = 0, // not twiddled or compressed, more like a buffer but with texture limits - MyMTLTextureType1DArray = 1, // not twiddled or compressed, more like a buffer but with texture limits + MyMTLTextureType1DArray = 1, // not twiddled or compressed, more like a buffer but with texture limits MyMTLTextureType2D = 2, MyMTLTextureType2DArray = 3, // MyMTLTextureType2DMultisample = 4, @@ -151,20 +152,20 @@ class KTXHeader { }; uint32_t endianness = 0x04030201; - uint32_t glType = 0; // compressed = 0 - uint32_t glTypeSize = 1; // doesn't depend on endianness + uint32_t glType = 0; // compressed = 0 + uint32_t glTypeSize = 1; // doesn't depend on endianness uint32_t glFormat = 0; - uint32_t glInternalFormat = 0; // must be same as glFormat - uint32_t glBaseInternalFormat = 0; // GL_RED, RG, RGB, RGBA, SRGB, SRGBA + uint32_t glInternalFormat = 0; // must be same as glFormat + uint32_t glBaseInternalFormat = 0; // GL_RED, RG, RGB, RGBA, SRGB, SRGBA uint32_t pixelWidth = 1; - uint32_t pixelHeight = 0; // >0 for 2d - uint32_t pixelDepth = 0; // >0 for 3d + uint32_t pixelHeight = 0; // >0 for 2d + uint32_t pixelDepth = 0; // >0 for 3d uint32_t numberOfArrayElements = 0; uint32_t numberOfFaces = 1; - uint32_t numberOfMipmapLevels = 1; // 0 means auto mip + uint32_t numberOfMipmapLevels = 1; // 0 means auto mip uint32_t bytesOfKeyValueData = 0; @@ -201,7 +202,7 @@ class KTX2Header { // '«', 'K', 'T', 'X', ' ', '2', '0', '»', '\r', '\n', '\x1A', '\n' }; - uint32_t vkFormat = 0; // invalid format + uint32_t vkFormat = 0; // invalid format uint32_t typeSize = 1; uint32_t pixelWidth = 1; @@ -240,16 +241,16 @@ class KTX2Header { // and the offsts include a 4 byte length at the start of each level. class KTXImageLevel { public: - uint64_t offset = 0; // differ in ordering - ktx largest first, ktx2 smallest first - uint64_t lengthCompressed = 0; // set to 0 if not compresseds - uint64_t length = 0; // numChunks * mipSize when written for non cube on KTX1 or all KTX2, internally only stores mipSize + uint64_t offset = 0; // differ in ordering - ktx largest first, ktx2 smallest first + uint64_t lengthCompressed = 0; // set to 0 if not compresseds + uint64_t length = 0; // numChunks * mipSize when written for non cube on KTX1 or all KTX2, internally only stores mipSize }; enum KTX2Supercompression { KTX2SupercompressionNone = 0, - KTX2SupercompressionBasisLZ = 1, // can transcode, but can't gen from KTX file using ktxsc, uses sgdByteLength - KTX2SupercompressionZstd = 2, // faster deflate, ktxsc support - KTX2SupercompressionZlib = 3, // deflate, no ktxsc support (use miniz) + KTX2SupercompressionBasisLZ = 1, // can transcode, but can't gen from KTX file using ktxsc, uses sgdByteLength + KTX2SupercompressionZstd = 2, // faster deflate, ktxsc support + KTX2SupercompressionZlib = 3, // deflate, no ktxsc support (use miniz) // TODO: Need LZFSE? 
// TODO: need Kraken for PS4 // TODO: need Xbox format @@ -257,7 +258,7 @@ enum KTX2Supercompression { struct KTX2Compressor { KTX2Supercompression compressorType = KTX2SupercompressionNone; - float compressorLevel = 0.0f; // 0.0 is default + float compressorLevel = 0.0f; // 0.0 is default bool isCompressed() const { return compressorType != KTX2SupercompressionNone; } }; @@ -311,7 +312,7 @@ class KTXImage { // determine if image stores rgb * a bool isPremul() const; - + // can use on ktx1/2 files, does a decompress if needed bool unpackLevel(uint32_t mipNumber, const uint8_t* srcData, uint8_t* dstData) const; @@ -324,7 +325,7 @@ class KTXImage { uint32_t mipLengthCalc(uint32_t mipNumber) const; size_t mipLengthLargest() const { return mipLevels[0].length; } size_t mipLength(uint32_t mipNumber) const { return mipLevels[mipNumber].length; } - + // level size_t levelLength(uint32_t mipNumber) const { return mipLevels[mipNumber].length * totalChunks(); } size_t levelLengthCompressed(uint32_t mipNumber) const { return mipLevels[mipNumber].lengthCompressed; } @@ -335,16 +336,16 @@ class KTXImage { // trying to bury access to KTX1 header, since this supports KTX2 now uint32_t arrayCount() const { return std::max(1u, header.numberOfArrayElements); } - uint32_t mipCount() const { return std::max(1u, header.numberOfMipmapLevels); } - uint32_t faceCount() const { return std::max(1u, header.numberOfFaces); } - + uint32_t mipCount() const { return std::max(1u, header.numberOfMipmapLevels); } + uint32_t faceCount() const { return std::max(1u, header.numberOfFaces); } + private: bool openKTX2(const uint8_t* imageData, size_t imageDataLength, bool isInfoOnly); // ktx2 mips are uncompressed to convert back to ktx1, but without the image offset vector _imageData; -public: // TODO: bury this +public: // TODO: bury this MyMTLTextureType textureType = MyMTLTextureType2D; MyMTLPixelFormat pixelFormat = MyMTLPixelFormatInvalid; @@ -358,16 +359,16 @@ class KTXImage { bool skipImageLength = false; KTX2Supercompression supercompressionType = KTX2SupercompressionNone; - KTXHeader header; // copy of KTXHeader from KTX1, so can be modified and then written back + KTXHeader header; // copy of KTXHeader from KTX1, so can be modified and then written back // write out only string/string props, for easy of viewing vector > props; - vector mipLevels; // offsets into fileData + vector mipLevels; // offsets into fileData // this only holds data for mipLevels size_t fileDataLength = 0; - const uint8_t* fileData = nullptr; // mmap data + const uint8_t* fileData = nullptr; // mmap data }; // GL/D3D hobbled non-pow2 mips by only supporting round down, not round up @@ -435,17 +436,17 @@ const char* formatTypeName(MyMTLPixelFormat format); // metal const char* metalTypeName(MyMTLPixelFormat format); -uint32_t metalType(MyMTLPixelFormat format); // really MTLPixelFormat +uint32_t metalType(MyMTLPixelFormat format); // really MTLPixelFormat // directx const char* directxTypeName(MyMTLPixelFormat format); -uint32_t directxType(MyMTLPixelFormat format); // really DXFormat -MyMTLPixelFormat directxToMetalFormat(uint32_t format); // really DXFormat +uint32_t directxType(MyMTLPixelFormat format); // really DXFormat +MyMTLPixelFormat directxToMetalFormat(uint32_t format); // really DXFormat // vuklan const char* vulkanTypeName(MyMTLPixelFormat format); -uint32_t vulkanType(MyMTLPixelFormat format); // really VKFormat -MyMTLPixelFormat vulkanToMetalFormat(uint32_t format); // really VKFormat +uint32_t vulkanType(MyMTLPixelFormat format); 
// really VKFormat +MyMTLPixelFormat vulkanToMetalFormat(uint32_t format); // really VKFormat // gl const char* glTypeName(MyMTLPixelFormat format); @@ -457,4 +458,4 @@ const char* textureTypeName(MyMTLTextureType textureType); // find a corresponding srgb/non-srgb format for a given format MyMTLPixelFormat toggleSrgbFormat(MyMTLPixelFormat format); -} // namespace kram +} // namespace kram diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index f31e300b..5afd9069 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -18,15 +18,23 @@ #include "KTXImage.h" #include "KramDDSHelper.h" #include "KramFileHelper.h" -#include "KramImage.h" // has config defines, move them out +#include "KramImage.h" // has config defines, move them out #include "KramMmapHelper.h" #include "KramTimer.h" -#define KRAM_VERSION "1.0" -//#include "KramVersion.h" +//#define KRAM_VERSION "1.0" +#include "KramVersion.h" #include "TaskSystem.h" #include "lodepng.h" #include "miniz.h" +#ifndef USE_LIBCOMPRESSION +#define USE_LIBCOMPRESSION 0 // KRAM_APPLE +#endif + +#if USE_LIBCOMPRESSION +#include +#endif + // one .cpp must supply these new overrides #if USE_EASTL void* __cdecl operator new[](size_t size, const char* name, int flags, unsigned debugFlags, const char* file, int line) @@ -36,7 +44,7 @@ void* __cdecl operator new[](size_t size, const char* name, int flags, unsigned void* operator new[](size_t size, size_t alignment, size_t alignmentOffset, const char* pName, int flags, unsigned debugFlags, const char* file, int line) { - return new uint8_t[size]; // TODO: honor alignment + return new uint8_t[size]; // TODO: honor alignment } #endif @@ -50,10 +58,12 @@ void* operator new[](size_t size, size_t alignment, size_t alignmentOffset, cons namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; -// lodepng iccp decode is failing when setting this for some reason, find out why -// Must set it with LODEPNG_NO_COMPILE_ZLIB in lodepng.h if true +// This fails with libCompression (see inter-a.png) +// and with miniZ for the ICCP block (see inter-a.png) +// lodepng passes 16K to the custom zlib decompress, but +// the data read isn't that big. static bool useMiniZ = false; template @@ -76,7 +86,8 @@ bool isKTX2Filename(const char* filename) bool isDDSFilename(const char* filename) { // should really look at first 4 bytes of data - return endsWithExtension(filename, ".dds"); + return endsWithExtension(filename, ".dds") || + endsWithExtension(filename, ".DDS"); // allow uppercase } bool isPNGFilename(const char* filename) { @@ -137,11 +148,11 @@ bool KTXImageData::open(const char* filename, KTXImage& image, bool isInfoOnly_) _name = toFilenameShort(filename); if (isPNGFilename(filename)) { - bool success = openPNG(filename, image); - + bool success = openPNG(filename, image); + if (success) fixPixelFormat(image, filename); - + return success; } @@ -254,10 +265,10 @@ bool KTXImageData::openPNG(const uint8_t* data, size_t dataSize, KTXImage& image // This is returned by LoadPng. Note that many png have this set // by default and not controllable by artists. 
bool isSrgb = false; - + Image singleImage; bool isLoaded = LoadPng(data, dataSize, false, false, isSrgb, singleImage); - + // don't need png data anymore close(); @@ -287,7 +298,7 @@ bool KTXImageData::openPNG(const uint8_t* data, size_t dataSize, KTXImage& image // TODO: png has 16u format useful for heights - image.initMipLevels(sizeof(KTXHeader)); // TODO: could also make this ktx2 with zstd compress + image.initMipLevels(sizeof(KTXHeader)); // TODO: could also make this ktx2 with zstd compress image.reserveImageData(); memcpy((uint8_t*)image.fileData, &image.header, sizeof(KTXHeader)); @@ -303,7 +314,7 @@ bool KTXImageData::open(const uint8_t* data, size_t dataSize, KTXImage& image, b if (isPNGFile(data, dataSize)) { // data stored in image - return openPNG(data, dataSize, image); // TODO: pass isInfoOnly + return openPNG(data, dataSize, image); // TODO: pass isInfoOnly } else if (isDDSFile(data, dataSize)) { // converts dds to ktx, data stored in image @@ -374,13 +385,13 @@ class MortonOrder { // https://en.wikipedia.org/wiki/Grayscale inline Color toGrayscaleRec709(Color c, const Mipper& mipper) { - const float4 kRec709Conversion = float4m(0.2126f, 0.7152f, 0.0722f, 0.0f); // really a float3 + const float4 kRec709Conversion = float4m(0.2126f, 0.7152f, 0.0722f, 0.0f); // really a float3 // convert to linear, do luminance, then back to srgb primary float4 clin = mipper.toLinear(c); float luminance = dot(clin, kRec709Conversion); - luminance = std::min(luminance, 1.0f); // to avoid assert if math goes above 1.0 + luminance = std::min(luminance, 1.0f); // to avoid assert if math goes above 1.0 c.r = (uint8_t)(roundf(linearToSRGBFunc(luminance) * 255.0f)); @@ -412,12 +423,40 @@ unsigned LodepngDecompressUsingMiniz( const LodePNGDecompressSettings* settings) { // mz_ulong doesn't line up with size_t on Windows, but does on macOS - mz_ulong dstDataSizeUL = *dstDataSize; + KASSERT(*dstDataSize != 0); + +#if USE_LIBCOMPRESSION + // this returns 121 dstSize instead of 16448 on 126 srcSize. + // Open src dir to see this. Have to advance by 2 to fix this. + if (srcDataSize <= 2) { + return MZ_DATA_ERROR; + } + + char scratchBuffer[compression_decode_scratch_buffer_size(COMPRESSION_ZLIB)]; + size_t bytesDecoded = compression_decode_buffer( + (uint8_t*)*dstData, *dstDataSize, + (const uint8_t*)srcData + 2, srcDataSize - 2, + scratchBuffer, + COMPRESSION_ZLIB); - int result = mz_uncompress(*dstData, &dstDataSizeUL, + int result = MZ_OK; + if (bytesDecoded != *dstDataSize) { + result = MZ_DATA_ERROR; + *dstDataSize = 0; + } +#else + // This works. + mz_ulong bytesDecoded = *dstDataSize; + int result = mz_uncompress(*dstData, &bytesDecoded, srcData, srcDataSize); - *dstDataSize = dstDataSizeUL; + if (result != MZ_OK || bytesDecoded != *dstDataSize) { + *dstDataSize = 0; + } + else { + *dstDataSize = bytesDecoded; + } +#endif return result; } @@ -428,18 +467,20 @@ unsigned LodepngCompressUsingMiniz( const unsigned char* srcData, size_t srcDataSize, const LodePNGCompressSettings* settings) { + // TODO: no setting for compression level in settings? 
+ // TODO: libCompression can only encode zlib to quality 5 + // mz_ulong doesn't line up with size_t on Windows, but does on macOS mz_ulong dstDataSizeUL = *dstDataSize; - int result = mz_compress(*dstData, &dstDataSizeUL, - srcData, srcDataSize); + int result = mz_compress2(*dstData, &dstDataSizeUL, + srcData, srcDataSize, MZ_DEFAULT_COMPRESSION); *dstDataSize = dstDataSizeUL; return result; } - //----------------------- // TODO: fix this to identify srgb, otherwise will skip GAMA block @@ -447,33 +488,34 @@ unsigned LodepngCompressUsingMiniz( // have code for this.˜ static const bool doParseIccProfile = false; -struct IccProfileTag -{ +struct IccProfileTag { uint32_t type, offset, size; }; -static void swapEndianUint32(uint32_t& x) { - x = ((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff ); +static void swapEndianUint32(uint32_t& x) +{ + x = ((x << 24) & 0xff000000) | + ((x << 8) & 0x00ff0000) | + ((x >> 8) & 0x0000ff00) | + ((x >> 24) & 0x000000ff); } // https://github.com/lvandeve/lodepng/blob/master/pngdetail.cpp -static int getICCInt32(const unsigned char* icc, size_t size, size_t pos) { - if (pos + 4 > size) return 0; - - // this is just swapEndianUint32 in byte form - return (int)((icc[pos] << 24) | (icc[pos + 1] << 16) | (icc[pos + 2] << 8) | (icc[pos + 3] << 0)); +static int getICCInt32(const unsigned char* icc, size_t size, size_t pos) +{ + if (pos + 4 > size) return 0; + + // this is just swapEndianUint32 in byte form + return (int)((icc[pos] << 24) | (icc[pos + 1] << 16) | (icc[pos + 2] << 8) | (icc[pos + 3] << 0)); } -static float getICC15Fixed16(const unsigned char* icc, size_t size, size_t pos) { - return getICCInt32(icc, size, pos) / 65536.0; +static float getICC15Fixed16(const unsigned char* icc, size_t size, size_t pos) +{ + return getICCInt32(icc, size, pos) / 65536.0; } // this is all big-endian, so needs swapped, 132 bytes total -struct IccProfileHeader -{ +struct IccProfileHeader { uint32_t size; // 0 uint32_t cmmType; // 4 - 'appl' uint32_t version; // 8 @@ -494,29 +536,28 @@ struct IccProfileHeader uint32_t padding[7]; // 100 uint32_t numTags; // 128 }; -static_assert( sizeof(IccProfileHeader) == 132, "invalid IccProfileHeader"); - -#define MAKEFOURCC(str) \ - ((uint32_t)(uint8_t)(str[0]) | ((uint32_t)(uint8_t)(str[1]) << 8) | \ - ((uint32_t)(uint8_t)(str[2]) << 16) | ((uint32_t)(uint8_t)(str[3]) << 24 )) +static_assert(sizeof(IccProfileHeader) == 132, "invalid IccProfileHeader"); +#define MAKEFOURCC(str) \ + ((uint32_t)(uint8_t)(str[0]) | ((uint32_t)(uint8_t)(str[1]) << 8) | \ + ((uint32_t)(uint8_t)(str[2]) << 16) | ((uint32_t)(uint8_t)(str[3]) << 24)) // this must be run after deflate if profile is compressed bool parseIccProfile(const uint8_t* data, uint32_t dataSize, bool& isSrgb) { isSrgb = false; - + // should look at other blocks if this is false if (dataSize < sizeof(IccProfileHeader)) { return false; } - + // copy header so can endianSwap it IccProfileHeader header = *(const IccProfileHeader*)data; // convert big to little endian swapEndianUint32(header.size); swapEndianUint32(header.numTags); - + if (header.signature != MAKEFOURCC("acsp")) { return false; } @@ -525,20 +566,20 @@ bool parseIccProfile(const uint8_t* data, uint32_t dataSize, bool& isSrgb) isSrgb = true; return true; } - + IccProfileTag* tags = (IccProfileTag*)(data + sizeof(IccProfileHeader)); for (uint32_t i = 0; i < header.numTags; ++i) { IccProfileTag tag = tags[i]; swapEndianUint32(tag.offset); swapEndianUint32(tag.size); - 
+ // There's also tag.name which is 'wtpt' and others. // Open a .icc profile to see all these names - + uint32_t datatype = *(const uint32_t*)(data + tag.offset); - - switch(datatype) { + + switch (datatype) { case MAKEFOURCC("XYZ "): { if (tag.type == MAKEFOURCC("wtpt")) { float x = getICC15Fixed16(data, dataSize, tag.offset + 8); @@ -578,14 +619,14 @@ bool parseIccProfile(const uint8_t* data, uint32_t dataSize, bool& isSrgb) case MAKEFOURCC("sf32"): // chad - chromatic adaptation matrix break; - + case MAKEFOURCC("mAB "): // A2B0, A2B1 - Intent-0/1, device to PCS table case MAKEFOURCC("mBA "): // B2A0, B2A1 - Intent-0/1, PCS to device table case MAKEFOURCC("sig "): // rig0 - + case MAKEFOURCC("text"): case MAKEFOURCC("mluc"): // muti-localizaed description strings @@ -596,11 +637,12 @@ bool parseIccProfile(const uint8_t* data, uint32_t dataSize, bool& isSrgb) break; } } - + return true; } - -bool isIccProfileSrgb(const uint8_t* data, uint32_t dataSize) { + +bool isIccProfileSrgb(const uint8_t* data, uint32_t dataSize) +{ bool isSrgb = false; parseIccProfile(data, dataSize, isSrgb); return isSrgb; @@ -631,12 +673,12 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray } isSrgb = false; - + // Stop at the idat, or if not present the end of the file const uint8_t* end = lodepng_chunk_find_const(data, data + dataSize, "IDAT"); if (!end) end = data + dataSize; - + bool hasNonSrgbBlocks = false; bool hasSrgbBlock = false; { @@ -645,46 +687,44 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray lodepng_chunk_find_const(data, end, "iCCP") != nullptr || lodepng_chunk_find_const(data, end, "gAMA") != nullptr || lodepng_chunk_find_const(data, end, "cHRM") != nullptr; - + // Apps like Figma always set this hasSrgbBlock = lodepng_chunk_find_const(data, end, "sRGB") != nullptr; } - + const uint8_t* chunkData = lodepng_chunk_find_const(data, end, "sRGB"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); isSrgb = state.info_png.srgb_defined; //state.info_png.srgb_intent; // 0-3 } - + if (doParseIccProfile && !chunkData) { chunkData = lodepng_chunk_find_const(data, end, "iCCP"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.iccp_defined) { if (!isSrgb) isSrgb = isIccProfileSrgb(state.info_png.iccp_profile, state.info_png.iccp_profile_size); } - } } - + if (!chunkData) { chunkData = lodepng_chunk_find_const(data, end, "gAMA"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.gama_defined) { if (!isSrgb) isSrgb = state.info_png.gama_gamma == 45455; // 1/2.2 x 100000 } - } } - + if (!chunkData) { chunkData = lodepng_chunk_find_const(data, end, "cHRM"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.chrm_defined) { if (!isSrgb) isSrgb = @@ -699,7 +739,7 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray } } } - + // because Apple finder thumbnails can't be overridden with custom thumbanailer // and defaults to white bkgd (making white icons impossible to see). // track the bkgd block, and set/re-define as all black. Maybe will honor that. 
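LoadPng above only scans the chunks ahead of the first IDAT to decide whether to treat the file as sRGB. Below is a minimal standalone sketch of that probe using the lodepng calls shown in this hunk; the wrapper name isPngMarkedSrgb is illustrative, and the iCCP/gAMA/cHRM fallbacks that kram also checks are omitted.

#include <cstddef>
#include <cstdint>
#include "lodepng.h"

// Returns true when an explicit sRGB chunk appears before the first IDAT.
static bool isPngMarkedSrgb(const uint8_t* data, size_t dataSize)
{
    lodepng::State state;

    // Stop at the first IDAT, or at the end of the file if none is present.
    const uint8_t* end = lodepng_chunk_find_const(data, data + dataSize, "IDAT");
    if (!end) end = data + dataSize;

    const uint8_t* chunkData = lodepng_chunk_find_const(data, end, "sRGB");
    if (!chunkData)
        return false;

    lodepng_inspect_chunk(&state, chunkData - data, data, end - data);
    return state.info_png.srgb_defined != 0;
}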
@@ -707,7 +747,7 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray bool hasBlackBackground = false; chunkData = lodepng_chunk_find_const(data, data + dataSize, "bKGD"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.background_defined) { hasBackground = true; hasBlackBackground = @@ -716,7 +756,7 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray state.info_png.background_b == 0; } } - + // don't convert png bit depths, but can convert pallete data // if (state.info_png.color.bitdepth != 8) { // return false; @@ -733,7 +773,7 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray case LCT_MAX_OCTET_VALUE: case LCT_RGB: case LCT_RGBA: - case LCT_PALETTE: // ? + case LCT_PALETTE: // ? hasColor = true; break; } @@ -746,12 +786,11 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray case LCT_MAX_OCTET_VALUE: case LCT_RGBA: case LCT_GREY_ALPHA: - case LCT_PALETTE: // ? + case LCT_PALETTE: // ? hasAlpha = true; break; } - - + // this inserts onto end of array, it doesn't resize vector pixelsPNG; pixelsPNG.clear(); @@ -792,7 +831,7 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray sourceImage.setSrgbState(isSrgb, hasSrgbBlock, hasNonSrgbBlocks); sourceImage.setBackgroundState(hasBlackBackground); - + return sourceImage.loadImageFromPixels(pixels, width, height, hasColor, hasAlpha); } @@ -809,7 +848,7 @@ bool SavePNG(Image& image, const char* filename) // Then if srgb, see if that matches content type srgb state below. TexContentType contentType = findContentTypeFromFilename(filename); bool isSrgb = contentType == TexContentTypeAlbedo; - + // Skip file if it has srgb block, and none of the other block types. // This code will also strip the sRGB block from apps like Figma that always set it. if (image.hasBlackBackground()) { @@ -828,7 +867,7 @@ bool SavePNG(Image& image, const char* filename) state.info_png.srgb_defined = 1; state.info_png.srgb_intent = 0; } - + // always redefine background to black, so Finder thumbnails are not white // this makes viewing any white icons nearly impossible. Make suer lodepng // ignores this background on import, want the stored pixels not ones composited. @@ -838,13 +877,13 @@ bool SavePNG(Image& image, const char* filename) state.info_png.background_r = 0; state.info_png.background_g = 0; state.info_png.background_b = 0; - + // TODO: could write other data into Txt block // or try to preserve those - + // TODO: image converted to 32-bit, so will save out large ? // Can we write out L, LA, RGB, RGBA based on image state? 
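Before encoding, SavePNG stamps the desired metadata onto the lodepng state: an sRGB chunk when the content type calls for it, and a black bKGD so Finder thumbnails stop compositing onto white. A condensed sketch of that flow (the function name and the assumption of 8-bit RGBA input are illustrative; kram's miniz hookup, temp-file handling, and error logging are omitted):

#include <vector>
#include "lodepng.h"

// Encode 8-bit RGBA pixels to PNG with an sRGB chunk and a black background.
static bool savePngSrgbBlackBkgd(const unsigned char* rgba, unsigned width, unsigned height,
                                 std::vector<unsigned char>& outPng)
{
    lodepng::State state;

    // mark the output as sRGB, perceptual intent
    state.info_png.srgb_defined = 1;
    state.info_png.srgb_intent = 0;

    // always redefine the background as black so Finder thumbnails aren't white
    state.info_png.background_defined = 1;
    state.info_png.background_r = 0;
    state.info_png.background_g = 0;
    state.info_png.background_b = 0;

    unsigned error = lodepng::encode(outPng, rgba, width, height, state);
    return error == 0;
}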
- + // use miniz as the encoder auto& settings = lodepng_default_compress_settings; if (useMiniZ) @@ -853,31 +892,31 @@ bool SavePNG(Image& image, const char* filename) // encode to png vector outputData; unsigned error = lodepng::encode(outputData, (const uint8_t*)(image.pixels().data()), image.width(), image.height(), state); - + if (error) { return false; } - + FileHelper fileHelper; - if (!fileHelper.open(filename, "wb+")) { + if (!fileHelper.open(filename, "w+b")) { return false; } - + // this is overrwriting the source file currently // TODO: could use tmp file, and then replace existing // this could destroy original png on failure otherwise if (!fileHelper.write((const uint8_t*)outputData.data(), outputData.size())) { return false; } - + KLOGI("Kram", "saved %s %s sRGB block", filename, isSrgb ? "with" : "without"); - + return true; } bool SetupTmpFile(FileHelper& tmpFileHelper, const char* suffix) { - return tmpFileHelper.openTemporaryFile(suffix, "w+b"); + return tmpFileHelper.openTemporaryFile("kramimage-", suffix, "w+b"); } bool SetupSourceImage(const string& srcFilename, Image& sourceImage, @@ -943,12 +982,12 @@ bool SetupSourceImage(const string& srcFilename, Image& sourceImage, if (isPNG) { bool isSrgb = false; if (!LoadPng(data, dataSize, isPremulSrgb, isGray, isSrgb, sourceImage)) { - return false; // error + return false; // error } } else { if (!LoadKtx(data, dataSize, sourceImage)) { - return false; // error + return false; // error } } @@ -1119,7 +1158,7 @@ static const char* formatFormat(MyMTLPixelFormat format) break; default: - assert(false); // unknown format + assert(false); // unknown format break; } @@ -1229,7 +1268,7 @@ string formatInputAndOutput(int32_t testNumber, const char* srcFilename, MyMTLPi assert(extSeparatorStr != nullptr); size_t extSeparator = extSeparatorStr - dst.c_str(); dst.erase(extSeparator); - dst.append(".ktx"); // TODO: test ktx2 too + dst.append(".ktx"); // TODO: test ktx2 too cmd += dst; @@ -1250,9 +1289,9 @@ bool kramTestCommand(int32_t testNumber, //#define SwizzleA " -swizzle 000r" //#define SwizzleLA " -swizzle rrrg" -#define ASTCSwizzle2nm " -swizzle gggr" // store as L+A, decode to snorm with .ag * 2 - 1 -#define ASTCSwizzleL1 " -swizzle rrr1" // store as L -#define ASTCSwizzle2 " -swizzle gggr" // store as L+A, decode to snorm with .ag +#define ASTCSwizzle2nm " -swizzle gggr" // store as L+A, decode to snorm with .ag * 2 - 1 +#define ASTCSwizzleL1 " -swizzle rrr1" // store as L +#define ASTCSwizzle2 " -swizzle gggr" // store as L+A, decode to snorm with .ag // TODO: these are all run at default quality bool isNotPremul = true; @@ -1425,7 +1464,7 @@ bool kramTestCommand(int32_t testNumber, testNumber = 3003; encoder = kTexEncoderEtcenc; cmd += " -sdf"; - cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatEAC_R11Unorm, encoder); + cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatEAC_R11Unorm, encoder); break; @@ -1433,7 +1472,7 @@ bool kramTestCommand(int32_t testNumber, testNumber = 3004; encoder = kTexEncoderATE; cmd += " -sdf" ASTCSwizzleL1; - cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); + cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); break; default: @@ -1608,7 +1647,6 @@ void kramFixupUsage(bool showVersion = true) showVersion ? 
usageName : ""); } - void kramInfoUsage(bool showVersion = true) { KLOGI("Kram", @@ -1708,22 +1746,22 @@ void kramEncodeUsage(bool showVersion = true) // can force an encoder when there is overlap "\t-encoder squish" - "\tbc[1,3,4,5] %s\n" // can be disabled + "\tbc[1,3,4,5] %s\n" // can be disabled "\t-encoder bcenc" - "\tbc[1,3,4,5,7] %s\n" // can be disabled + "\tbc[1,3,4,5,7] %s\n" // can be disabled "\t-encoder ate" - "\tbc[1,4,5,7] %s\n" // can be disabled + "\tbc[1,4,5,7] %s\n" // can be disabled "\t-encoder ate" - "\tastc[4x4,8x8] %s\n" // can be disabled + "\tastc[4x4,8x8] %s\n" // can be disabled "\t-encoder astcenc" - "\tastc[4x4,5x5,6x6,8x8] ldr/hdr support %s\n" // can be disabled + "\tastc[4x4,5x5,6x6,8x8] ldr/hdr support %s\n" // can be disabled "\t-encoder etcenc" - "\tetc2[r,rg,rgb,rgba] %s\n" // can be disabled + "\tetc2[r,rg,rgb,rgba] %s\n" // can be disabled "\t-encoder explicit" "\tr|rg|rgba[8|16f|32f]\n" @@ -1764,19 +1802,19 @@ void kramEncodeUsage(bool showVersion = true) "\tsrc set to linear\n" "\t-srcsrgbimage" "\tsrc set to png flag (unreliable) or container format\n" - + // normals and snorm data "\t-signed" "\tSigned r or rg for etc/bc formats, astc doesn't have signed format.\n" "\t-normal" "\tNormal map rg storage signed for etc/bc (rg01), only unsigned astc L+A (gggr).\n" - + // sdf "\t-sdf" "\tGenerate single-channel SDF from a bitmap, can mip and drop large mips. Encode to r8, bc4, etc2r, astc4x4 (Unorm LLL1) to encode\n" "\t-sdfThreshold 120" "\tSDF generation uses bitmap converted from 8-bit red channel\n" - + "\t-gray" "\tConvert to grayscale before premul\n" @@ -1929,7 +1967,7 @@ static int32_t kramAppInfo(vector& args) FileHelper dstFileHelper; if (!dstFilename.empty()) { - if (!dstFileHelper.open(dstFilename.c_str(), "wb+")) { + if (!dstFileHelper.open(dstFilename.c_str(), "w+b")) { KLOGE("Kram", "info couldn't open output file"); return -1; } @@ -2039,51 +2077,49 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 KLOGE("Kram", "info couldn't open png file"); return ""; } - + // TODO: also gama 2.2 block sometimes used in older files bool isSrgb = false; - + const uint8_t* end = lodepng_chunk_find_const(data, data + dataSize, "IDAT"); if (!end) end = data + dataSize; - + const uint8_t* chunkData = lodepng_chunk_find_const(data, end, "sRGB"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); isSrgb = state.info_png.srgb_defined; //state.info_png.srgb_intent; // 0-3 } - + // Adobe Photoshop 2022 only sets iccp + gama instead of sRGB flag, but iccp takes // priority to gama block. 
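Both LoadPng above and kramInfoPNGToString below fall back to the gAMA chunk and compare against 45455. That constant is just 1/2.2 scaled by PNG's fixed-point factor of 100000: 100000 / 2.2 = 45454.54..., rounded to 45455. A purely illustrative compile-time check:

#include <cstdint>

// PNG gAMA stores (1 / displayGamma) * 100000 as an integer.
constexpr uint32_t kGamaForGamma22 = 45455;
static_assert(kGamaForGamma22 == (uint32_t)(100000.0 / 2.2 + 0.5),
              "gAMA value that kram treats as sRGB-like 2.2 gamma");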
if (doParseIccProfile && !chunkData) { chunkData = lodepng_chunk_find_const(data, end, "iCCP"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.iccp_defined) { if (!isSrgb) isSrgb = isIccProfileSrgb(state.info_png.iccp_profile, state.info_png.iccp_profile_size); } - } } - + if (!chunkData) { chunkData = lodepng_chunk_find_const(data, end, "gAMA"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.gama_defined) { if (!isSrgb) isSrgb = state.info_png.gama_gamma == 45455; // 1/2.2 x 100000 } - } } - + if (!chunkData) { chunkData = lodepng_chunk_find_const(data, data + dataSize, "cHRM"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.chrm_defined) { if (!isSrgb) isSrgb = @@ -2098,7 +2134,7 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 } } } - + // because Apple finder thumbnails can't be overridden with custom thumbanailer // and defaults to white bkgd (making white icons impossible to see). // track the bkgd block, and set/re-define as all black. Maybe will honor that. @@ -2106,7 +2142,7 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 bool hasBlackBackground = false; chunkData = lodepng_chunk_find_const(data, data + dataSize, "bKGD"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); if (state.info_png.background_defined) { hasBackground = true; hasBlackBackground = @@ -2115,7 +2151,7 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 state.info_png.background_b == 0; } } - + string info; bool hasColor = true; @@ -2130,7 +2166,7 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 case LCT_MAX_OCTET_VALUE: case LCT_RGB: case LCT_RGBA: - case LCT_PALETTE: // ? + case LCT_PALETTE: // ? hasColor = true; break; } @@ -2143,7 +2179,7 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 case LCT_MAX_OCTET_VALUE: case LCT_RGBA: case LCT_GREY_ALPHA: - case LCT_PALETTE: // ? + case LCT_PALETTE: // ? hasAlpha = true; break; } @@ -2179,15 +2215,14 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 hasAlpha ? "y" : "n", hasPalette ? "y" : "n", isSrgb ? "y" : "n", - hasBackground ? "y" : "n" - ); + hasBackground ? 
"y" : "n"); info += tmp; // optional block with ppi chunkData = lodepng_chunk_find_const(data, end, "pHYs"); if (chunkData) { - lodepng_inspect_chunk(&state, chunkData - data, data, end-data); - + lodepng_inspect_chunk(&state, chunkData - data, data, end - data); + if (state.info_png.phys_defined && state.info_png.phys_unit == 1) { float metersToInches = 39.37; // TODO: there is info_pgn.phys_unit (0 - unknown, 1 - meters) @@ -2255,7 +2290,7 @@ string kramInfoKTXToString(const string& srcFilename, const KTXImage& srcImage, numPixels *= (float)numChunks; if (srcImage.mipCount() > 1) { - numPixels *= 4.0 / 3.0f; // TODO: estimate for now + numPixels *= 4.0 / 3.0f; // TODO: estimate for now } // to megapixels @@ -2363,7 +2398,7 @@ string kramInfoKTXToString(const string& srcFilename, const KTXImage& srcImage, append_sprintf(info, "%" PRIu64 ",%" PRIu64 "\n", mip.offset, - mip.length // only size of one mip right now, not mip * numChunks + mip.length // only size of one mip right now, not mip * numChunks ); } } @@ -2501,10 +2536,10 @@ static int32_t kramAppDecode(vector& args) } const char* dstExt = ".ktx"; - // if (isDstKTX2) - // dstExt = ".ktx2"; - // if (isDstDDS) - // dstExt = ".dds"; + // if (isDstKTX2) + // dstExt = ".ktx2"; + // if (isDstDDS) + // dstExt = ".dds"; KTXImage srcImage; KTXImageData srcImageData; @@ -2536,7 +2571,7 @@ static int32_t kramAppDecode(vector& args) params.decoder = textureDecoder; params.swizzleText = swizzleText; - KramDecoder decoder; // just to call decode + KramDecoder decoder; // just to call decode success = decoder.decode(srcImage, tmpFileHelper.pointer(), params); // rename to dest filepath, note this only occurs if above succeeded @@ -2559,7 +2594,7 @@ int32_t kramAppFixup(vector& args) string srcFilename; bool doFixupSrgb = false; bool error = false; - + for (int32_t i = 0; i < argc; ++i) { // check for options const char* word = args[i]; @@ -2569,9 +2604,9 @@ int32_t kramAppFixup(vector& args) error = true; break; } - + // TDOO: may want to add output command too - + if (isStringEqual(word, "-srgb")) { doFixupSrgb = true; } @@ -2593,12 +2628,12 @@ int32_t kramAppFixup(vector& args) break; } } - + if (srcFilename.empty()) { KLOGE("Kram", "no input file given\n"); error = true; } - + if (doFixupSrgb) { bool isPNG = isPNGFilename(srcFilename); @@ -2606,11 +2641,11 @@ int32_t kramAppFixup(vector& args) KLOGE("Kram", "fixup srgb only supports png input"); error = true; } - + bool success = !error; - + Image srcImage; - + // load the png, this doesn't return srgb state of original png if (success) success = SetupSourceImage(srcFilename, srcImage); @@ -2618,16 +2653,16 @@ int32_t kramAppFixup(vector& args) // stuff srgb block based on filename to content conversion for now if (success) { success = SavePNG(srcImage, srcFilename.c_str()); - + if (!success) { KLOGE("Kram", "fixup srgb could not save to file"); } } - + if (!success) error = true; } - + return error ? -1 : 0; } @@ -2671,7 +2706,7 @@ static int32_t kramAppEncode(vector& args) error = true; break; } - + infoArgs.sdfThreshold = StringToInt32(args[i]); if (infoArgs.sdfThreshold < 1 || infoArgs.sdfThreshold > 255) { KLOGE("Kram", "sdfThreshold arg invalid"); @@ -2887,12 +2922,12 @@ static int32_t kramAppEncode(vector& args) else if (isStringEqual(word, "-srgb")) { // not validating format for whether it's srgb or not infoArgs.isSRGBSrc = true; - + // The format may override this setting. Not all formats // have an srgb varient. 
infoArgs.isSRGBDst = true; } - + // This means ignore the srgb state on the src image // This has to be specified after -srgb else if (isStringEqual(word, "-srclin")) { @@ -3229,9 +3264,6 @@ static int32_t kramAppEncode(vector& args) return success ? 0 : -1; } - - - int32_t kramAppScript(vector& args) { // this is help @@ -3324,7 +3356,7 @@ int32_t kramAppScript(vector& args) // as a global this auto allocates 16 threads, and don't want that unless actually // using scripting. And even then want control over the number of threads. - std::atomic errorCounter(0); // doesn't initialize to 0 otherwise + std::atomic errorCounter(0); // doesn't initialize to 0 otherwise std::atomic skippedCounter(0); int32_t commandCounter = 0; @@ -3364,7 +3396,7 @@ int32_t kramAppScript(vector& args) // stop any new work when not "continue on error" if (isHaltedOnError && int32_t(errorCounter) > 0) { skippedCounter++; - return 0; // not really success, just skipping command + return 0; // not really success, just skipping command } Timer commandTimer; @@ -3591,12 +3623,13 @@ int32_t kramAppMain(int32_t argc, char* argv[]) return kramAppCommand(args); } -bool isSupportedFilename(const char* filename) { +bool isSupportedFilename(const char* filename) +{ if (isPNGFilename(filename) || isKTXFilename(filename) || isKTX2Filename(filename) || isDDSFilename(filename)) { - return true; + return true; } return false; } @@ -3610,13 +3643,13 @@ void fixPixelFormat(KTXImage& image, const char* filename) static bool doReplacePixelFormatFromContentType = true; if (!doReplacePixelFormatFromContentType) return; - + bool isPNG = isPNGFilename(filename); if (!isPNG) return; - + TexContentType contentType = findContentTypeFromFilename(filename); - + bool isSrgb = contentType == TexContentTypeAlbedo; image.pixelFormat = isSrgb ? 
MyMTLPixelFormatRGBA8Unorm_sRGB : MyMTLPixelFormatRGBA8Unorm; } @@ -3626,12 +3659,12 @@ void fixPixelFormat(KTXImage& image, const char* filename) TexContentType findContentTypeFromFilename(const char* filename) { string filenameShort = filename; - + const char* dotPosStr = strrchr(filenameShort.c_str(), '.'); if (dotPosStr == nullptr) return TexContentTypeUnknown; auto dotPos = dotPosStr - filenameShort.c_str(); - + // now chop off the extension filenameShort = filenameShort.substr(0, dotPos); @@ -3644,36 +3677,28 @@ TexContentType findContentTypeFromFilename(const char* filename) } else if (endsWith(filenameShort, "-n") || endsWith(filenameShort, "_normal") || - endsWith(filenameShort, "_Normal") - ) - { + endsWith(filenameShort, "_Normal")) { return TexContentTypeNormal; } else if (endsWith(filenameShort, "-a") || endsWith(filenameShort, "-d") || endsWith(filenameShort, "_baseColor") || - endsWith(filenameShort, "_Color") - ) - { + endsWith(filenameShort, "_Color")) { return TexContentTypeAlbedo; } else if (endsWith(filenameShort, "-ao") || - endsWith(filenameShort, "_AO") - ) - { + endsWith(filenameShort, "_AO")) { return TexContentTypeAO; } else if (endsWith(filenameShort, "-mr") || endsWith(filenameShort, "_Metallic") || endsWith(filenameShort, "_Roughness") || - endsWith(filenameShort, "_MetaliicRoughness") - ) - { + endsWith(filenameShort, "_MetaliicRoughness")) { return TexContentTypeMetallicRoughness; } - + // fallback to albedo for now return TexContentTypeAlbedo; } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/Kram.h b/libkram/kram/Kram.h index e8cb6299..a48f8f1c 100644 --- a/libkram/kram/Kram.h +++ b/libkram/kram/Kram.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -8,7 +8,7 @@ #include "KramMmapHelper.h" namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; class Image; class KTXImage; @@ -74,8 +74,7 @@ string kramInfoToString(const string& srcFilename, bool isVerbose); // this is entry point to library for cli app int32_t kramAppMain(int32_t argc, char* argv[]); -enum TexContentType -{ +enum TexContentType { TexContentTypeUnknown = 0, TexContentTypeAlbedo, TexContentTypeNormal, @@ -92,5 +91,4 @@ void fixPixelFormat(KTXImage& image, const char* filename); // This is using naming conventions on filenames, but KTX/KTX2 hold channel props TexContentType findContentTypeFromFilename(const char* filename); - -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index 7432e61f..3f16af7e 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
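findContentTypeFromFilename above keys entirely off suffixes ahead of the file extension. A few illustrative inputs and the types they map to, based only on the suffix branches visible in this hunk (the filenames are hypothetical):

// Examples of the filename convention:
//   "rock-n.png"          -> TexContentTypeNormal             (ends with "-n")
//   "rock_Normal.png"     -> TexContentTypeNormal
//   "rock_baseColor.png"  -> TexContentTypeAlbedo
//   "rock-ao.ktx"         -> TexContentTypeAO
//   "rock_Roughness.png"  -> TexContentTypeMetallicRoughness
//   "rock.png"            -> TexContentTypeAlbedo             (fallback)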
@@ -10,10 +10,21 @@ #include #if TARGET_OS_OSX #define KRAM_MAC 1 -#else +#elif TARGET_OS_IOS #define KRAM_IOS 1 -#endif -#elif __unix__ +#elif TARGET_OS_VISION +#define KRAM_VISION 1 +#endif +// taken from eaplatform.h, need PS5/XboxX +// make sure to def 0 case after if adding these +// just for reference +//#elif defined(__ORBIS__) +//#define KRAM_PS4 1 +//#elif defined(_XBOX_ONE) +//#define KRAM_XBOX_ONE +#elif defined(__ANDROID__) +#define KRAM_ANDROID 1 +#elif defined(__unix__) #define KRAM_LINUX 1 #endif @@ -30,6 +41,12 @@ #ifndef KRAM_IOS #define KRAM_IOS 0 #endif +#ifndef KRAM_ANDROID +#define KRAM_ANDROID 0 +#endif +#ifndef KRAM_VISION +#define KRAM_VISION 0 +#endif // TODO: add Profile build (rename RelWithDbgInfo) @@ -41,6 +58,10 @@ #define KRAM_DEBUG 1 #endif +// Don't really need 3 libs. This means can build one +// but can't use availability or platforms specifics then. +#define KRAM_APPLE (KRAM_MAC || KRAM_IOS || KRAM_VISION) + //------------------------ #if KRAM_WIN @@ -59,26 +80,6 @@ 4305 // '*=': truncation from 'double' to 'float' */ -#endif - -//------------------------ -#if KRAM_MAC - -#if TARGET_CPU_X86_64 -#define USE_SSE 1 -#elif TARGET_CPU_ARM64 -#define USE_NEON 1 -#endif - -#endif - -#if KRAM_IOS -#define USE_NEON 1 -#endif - -//------------------------ -#if KRAM_WIN - // avoid conflicts with min/max macros, use std instead #define NOMINMAX @@ -88,47 +89,14 @@ #define _CRT_SECURE_NO_WARNINGS 1 #include -// For now assume Intel on Win -#define USE_SSE 1 - #endif -//------------------------ -// TODO: unix - //------------------------ -// one of these must be set -#ifndef USE_SSE -#define USE_SSE 0 -#endif -#ifndef USE_NEON -#define USE_NEON 0 -#endif - -// clang can compile simd/simd.h code on other platforms -// this provides vector extensions from gcc that were setup for OpenCL shaders -#ifndef USE_SIMDLIB -// TODO: bring over simd for Win -#if !KRAM_WIN -#define USE_SIMDLIB 1 -#else -#define USE_SIMDLIB 0 -#endif -#endif - -// use _Float16/_fp16 vs. 
other -#if KRAM_MAC || KRAM_IOS -#define USE_FLOAT16 1 -#else -#define USE_FLOAT16 0 -#endif - - -// can override from build system +// SIMD_WORKSPACE is set // can't have ATE defined to 1 on other platforms -#if !(KRAM_MAC || KRAM_IOS) +#if !KRAM_APPLE #undef COMPILE_ATE #endif @@ -152,11 +120,6 @@ #define COMPILE_EASTL 0 #endif -// eliminate this -//#ifndef COMPILE_FASTL -//#define COMPILE_FASTL 0 -//#endif - // basis transcoder only (read not writes) #ifndef COMPILE_BASIS #define COMPILE_BASIS 0 @@ -171,122 +134,73 @@ // EASTL only seems to define that for Visual Studio natvis, and not lldb #define USE_EASTL COMPILE_EASTL +// some code not compiling with size_t otherwise +#include + +// linux need this for memcpy +#include + #if USE_EASTL -#define NAMESPACE_STL eastl +#define STL_NAMESPACE eastl // this probably breaks all STL debugging -#include // for max +#include // for max //#include "EASTL/atomic.h" -#include - -#include -#include // for copy_if on Win -#include -#include - #include +#include +#include +#include +#include +#include // for copy_if on Win #include +#include // includes thread/mutex +#include +#include +#include #include #include -#include // includes thread/mutex -#include -#include - // std - simpler than using eastl version #include - -/* This library just doesn't work, but was an interesting idea -#define USE_FASTL COMPILE_FASTL -#elif USE_FASTL - -#define NAMESPACE_STL fastl - -// these are all vector based -#include "../fastl/falgorithm.h" -#include "../fastl/vector.h" - -// These don't really work. They are constantly shifting the key-value pairs on add/revmoe -#include "../fastl/map.h" -#include "../fastl/set.h" -#include "../fastl/unordered_map.h" -#include "../fastl/unordered_set.h" - -// still too many holes in this (rfind, insert, back, pop_back, find_last_of, substr) - -#include "../fastl/fstring.h" - -// This was to fallback on sso of basic_string -//#include -//namespace NAMESPACE_STL -//{ -// using string = std::string; -//} - -// std - for missing functionality -#include -#include -#include // for unique_ptr/shared_ptr -//#include -#include // for copy_if and back_inserter on Win - -// threads -#include -#include - -// On macOS, mutex, codition_variable, thread pull in system_error which pulls in std::string -// when then instantiates 5 versions of basic_string into all files -//#include -//#include -//#include -*/ - #else -/* -// seems that Modules have "partial" support in Xcode, whatever that means -// these imports are taken from MSVC which has a full implementation - -import std.memory; -import std.threading; -import std.core; -import std.filesystem; -import std.regex; -*/ +// in Xcode 14, C++20 Modules have "partial" support... whatever that means. +// These imports are taken from MSVC which has a full implementation. 
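The STL_NAMESPACE define above lets every kram file pull in either EASTL or the standard library with a single using-directive. Reduced to its core (the gMipSizes variable is only there to show the effect; kram derives USE_EASTL from COMPILE_EASTL):

#ifndef USE_EASTL
#define USE_EASTL 0
#endif

#if USE_EASTL
#define STL_NAMESPACE eastl
#include <EASTL/vector.h>
#else
#define STL_NAMESPACE std
#include <vector>
#endif

namespace kram {
using namespace STL_NAMESPACE; // code below can just say vector<T>, string, ...

vector<int> gMipSizes; // resolves to eastl::vector or std::vector
} // namespace kram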
+//import std.memory; +//import std.threading; +//import std.core; +//import std.filesystem; +//import std.regex; -#define NAMESPACE_STL std +#define STL_NAMESPACE std // all std -#include // for max -#include - -#include -#include // for copy_if on Win -#include - +#include // for max +#include #include -#include // for shared_ptr +#include +#include +#include +#include +#include #include - -#include +#include // for copy_if on Win #include -#include -#include - +#include // for shared_ptr #include -#include -#include -#include #include - -#include +#include +#include +#include +#include +#include //#include +#include #include #include #include -#include #include #endif @@ -299,223 +213,31 @@ import std.regex; // includes that are usable across all files #include "KramLog.h" -// this has every intrinsic header in it -#if USE_SSE -// to keep astcenc compiling -#include // AVX1 -#elif USE_NEON -#include "sse2neon.h" -#endif - -// TODO: move half4 to it's own file, but always include it -// Apple's files don't have a half4 type. -namespace simd { - -// This has spotty support on Android. They left out hw support -// for _Float16 on many of the devices. So there would need this fallback. - -#if USE_FLOAT16 -using half = _Float16; -#else -// for lack of a better type -using half = uint16_t; -#endif +//------------------------- +// simd -// Really just a storage format and wrapper for half, math work done in float4. -class half4 { -public: -#if USE_SSE - // for lack of a better type, not __m128i since that's 2x bigger - using tType = uint64_t; -#elif USE_NEON - using tType = float16x4_t; -#endif - - union { - tType reg; - half v[4]; - struct { - half x, y, z, w; - }; - struct { - half r, g, b, a; - }; - }; - - half4() {} - explicit half4(half val) : x(val), y(val), z(val), w(val) {} // xyzw = val - explicit half4(tType val) { reg = val; } - half4(half xx, half yy, half zz, half ww) : x(xx), y(yy), z(zz), w(ww) {} - half4(const half4& val) { reg = val.reg; } - - // no real ops here, althought Neon does have sevearal - // use of these pull data out of simd registers - half& operator[](int32_t index) - { - return v[index]; - } - const half& operator[](int32_t index) const - { - return v[index]; - } -}; - -} // namespace simd - -#if !USE_EASTL - -namespace NAMESPACE_STL { - -// scalar ops -#if USE_FASTL -template -inline T min(T x, T minValue) { return x < minValue ? x : minValue; } -template -inline T max(T x, T maxValue) { return x > maxValue ? 
x : maxValue; } +// This is now all in kram.xcconfig for KRAM_APPLE +#if !KRAM_APPLE +//have to use simdk on non-Apple platforms +#define USE_SIMDLIB 1 +#define USE_SIMDLIBMODULE 0 #endif -// already defined in C++17 -//template -//inline T clamp(T x, T minValue, T maxValue) { return min(max(x, minValue), maxValue); } - - -} // namespace std +#if USE_SIMDLIB -#endif +// new vector math +//#if USE_SIMDLIBMODULE +// import this as a clang module now +//import vectormath234; +//#else +#include "vectormath234.h" +//#endif -#if USE_SIMDLIB -#include "simd/simd.h" #else -// emulate float4 +// old vector math, using simd/simd.h #include "float4a.h" #endif -namespace simd { - -#if USE_SIMDLIB - -// functional ctor -inline float4 float4m(float3 v, float w) -{ - return vector4(v, w); -} - -inline float2 float2m(float x, float y) -{ - return {x, y}; -} -inline float3 float3m(float x, float y, float z) -{ - return {x, y, z}; -} -inline float4 float4m(float x, float y, float z, float w) -{ - return {x, y, z, w}; -} - -inline float2 float2m(float x) -{ - return x; -} - -inline float3 float3m(float x) -{ - return x; -} - -inline float4 float4m(float x) -{ - return x; -} - -inline float saturate(float v) -{ - return std::clamp(v, 0.0f, 1.0f); -} -inline double saturate(double v) -{ - return std::clamp(v, 0.0, 1.0); -} -inline float2 saturate(const float2& v) -{ - return simd_clamp(v, 0.0f, 1.0f); -} -inline float3 saturate(const float3& v) -{ - return simd_clamp(v, 0.0f, 1.0f); -} -inline float4 saturate(const float4& v) -{ - return simd_clamp(v, 0.0f, 1.0f); -} - -#endif - -#if USE_FLOAT16 - -inline float4 toFloat4(const half4& vv) -{ - // https://patchwork.ozlabs.org/project/gcc/patch/559BC75A.1080606@arm.com/ - // https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gcc/Half-Precision.html - // https://developer.arm.com/documentation/dui0491/i/Using-NEON-Support/Converting-vectors - return float4m((float)vv.x, (float)vv.y, (float)vv.z, (float)vv.w); -} -inline half4 toHalf4(const float4& vv) -{ - return half4((_Float16)vv.x, (_Float16)vv.y, (_Float16)vv.z, (_Float16)vv.w); -} - -#elif USE_SSE - -// using casts instead of vv.reg, so these calls work with Apple SIMD too - -inline float4 toFloat4(const half4& vv) -{ - // https://patchwork.ozlabs.org/project/gcc/patch/559BC75A.1080606@arm.com/ - // https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gcc/Half-Precision.html - // https://developer.arm.com/documentation/dui0491/i/Using-NEON-Support/Converting-vectors - __m128i reg16 = _mm_setzero_si128(); - - // TODO: switch to load low 64-bits, but don't know which one _mm_cvtsi32_si128(&vv.reg); ? - // want 0 extend here, sse overuses int32_t when really unsigned and zero extended value - reg16 = _mm_insert_epi16(reg16, vv[0], 0); - reg16 = _mm_insert_epi16(reg16, vv[1], 1); - reg16 = _mm_insert_epi16(reg16, vv[2], 2); - reg16 = _mm_insert_epi16(reg16, vv[3], 3); - - return float4(_mm_cvtph_ps(reg16)); -} -inline half4 toHalf4(const float4& vv) -{ - __m128i reg16 = _mm_cvtps_ph(*(const __m128*)&vv, 0); // 4xfp32-> 4xfp16, round to nearest-even - - // TODO: switch to store/steam, but don't know which one _mm_storeu_epi16 ? 
- half4 val; // = 0; - - // 0 extended - val[0] = (half)_mm_extract_epi16(reg16, 0); - val[1] = (half)_mm_extract_epi16(reg16, 1); - val[2] = (half)_mm_extract_epi16(reg16, 2); - val[3] = (half)_mm_extract_epi16(reg16, 3); - return val; -} - -#elif USE_NEON - -// using casts intead of vv.reg, so these calls work with Apple SIMD too - -inline float4 toFloat4(const half4& vv) -{ - return float4(vcvt_f32_f16(*(const float32x4_t*)&vv)); -} -inline half4 toHalf4(const float4& vv) -{ - return half4(vcvt_f16_f32(*(const float32x4_t*)&vv)); -} -#endif - -} // namespace simd - //--------------------------------------- // this just strips args @@ -527,7 +249,7 @@ inline half4 toHalf4(const float4& vv) //--------------------------------------- namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // Use this on vectors template @@ -535,4 +257,22 @@ inline size_t vsizeof(const vector& v) { return sizeof(T) * v.size(); } -} // namespace kram + +template +inline size_t velemsizeof(const vector& v) +{ + return sizeof(T); +} + +// TODO: make sure these don't conflict with std:: versions +template +inline constexpr const span make_span(const vector& v) { + return span(const_cast(v.data()), v.size()); +} + +template +inline constexpr span make_span(vector& v) { + return span(v.data(), v.size()); +} + +} // namespace kram diff --git a/libkram/kram/KramDDSHelper.cpp b/libkram/kram/KramDDSHelper.cpp index 82488f5b..c00b1424 100644 --- a/libkram/kram/KramDDSHelper.cpp +++ b/libkram/kram/KramDDSHelper.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
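KramConfig.h above also adds vsizeof, velemsizeof, and make_span helpers over vector. A small usage sketch; it re-declares the helpers against std types so it stands alone, and assumes C++20 std::span:

#include <cstddef>
#include <cstdio>
#include <span>
#include <vector>

template <typename T>
inline size_t vsizeof(const std::vector<T>& v) { return sizeof(T) * v.size(); }

template <typename T>
inline constexpr std::span<T> make_span(std::vector<T>& v) { return std::span<T>(v.data(), v.size()); }

int main()
{
    std::vector<float> weights = {0.25f, 0.5f, 0.25f};

    // total byte size of the payload, handy for write() style calls
    printf("bytes: %zu\n", vsizeof(weights)); // 12 on typical platforms

    // non-owning view over the same storage
    std::span<float> view = make_span(weights);
    view[1] = 0.75f;
    return 0;
}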
@@ -8,153 +8,151 @@ #include "KramFileHelper.h" namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; const uint32_t DDS_MAGIC = 0x20534444; // "DDS " -#define MAKEFOURCC(str) \ - ((uint32_t)(uint8_t)(str[0]) | ((uint32_t)(uint8_t)(str[1]) << 8) | \ - ((uint32_t)(uint8_t)(str[2]) << 16) | ((uint32_t)(uint8_t)(str[3]) << 24 )) +#define MAKEFOURCC(str) \ + ((uint32_t)(uint8_t)(str[0]) | ((uint32_t)(uint8_t)(str[1]) << 8) | \ + ((uint32_t)(uint8_t)(str[2]) << 16) | ((uint32_t)(uint8_t)(str[3]) << 24)) // DX9 era formats, only for reading dds files in dx9 style -enum D3DFORMAT : uint32_t -{ - D3DFMT_UNKNOWN = 0, - -// D3DFMT_R8G8B8 = 20, - D3DFMT_A8R8G8B8 = 21, -// D3DFMT_X8R8G8B8 = 22, -// D3DFMT_R5G6B5 = 23, -// D3DFMT_X1R5G5B5 = 24, -// D3DFMT_A1R5G5B5 = 25, -// D3DFMT_A4R4G4B4 = 26, -// D3DFMT_R3G3B2 = 27, -// D3DFMT_A8 = 28, -// D3DFMT_A8R3G3B2 = 29, -// D3DFMT_X4R4G4B4 = 30, -// D3DFMT_A2B10G10R10 = 31, - D3DFMT_A8B8G8R8 = 32, -// D3DFMT_X8B8G8R8 = 33, -// D3DFMT_G16R16 = 34, -// D3DFMT_A2R10G10B10 = 35, -// D3DFMT_A16B16G16R16 = 36, - -// D3DFMT_A8P8 = 40, -// D3DFMT_P8 = 41, - -// D3DFMT_L8 = 50, -// D3DFMT_A8L8 = 51, -// D3DFMT_A4L4 = 52, - -// D3DFMT_V8U8 = 60, -// D3DFMT_L6V5U5 = 61, -// D3DFMT_X8L8V8U8 = 62, -// D3DFMT_Q8W8V8U8 = 63, -// D3DFMT_V16U16 = 64, -// D3DFMT_A2W10V10U10 = 67, - -// D3DFMT_UYVY = MAKEFOURCC("UYVY"), -// D3DFMT_R8G8_B8G8 = MAKEFOURCC("RGBG"), -// D3DFMT_YUY2 = MAKEFOURCC("YUY2"), -// D3DFMT_G8R8_G8B8 = MAKEFOURCC("GRGB"), - - D3DFMT_DXT1 = MAKEFOURCC("DXT1"), - D3DFMT_DXT2 = MAKEFOURCC("DXT2"), - D3DFMT_DXT3 = MAKEFOURCC("DXT3"), - D3DFMT_DXT4 = MAKEFOURCC("DXT4"), - D3DFMT_DXT5 = MAKEFOURCC("DXT5"), +enum D3DFORMAT : uint32_t { + D3DFMT_UNKNOWN = 0, + + // D3DFMT_R8G8B8 = 20, + D3DFMT_A8R8G8B8 = 21, + // D3DFMT_X8R8G8B8 = 22, + // D3DFMT_R5G6B5 = 23, + // D3DFMT_X1R5G5B5 = 24, + // D3DFMT_A1R5G5B5 = 25, + // D3DFMT_A4R4G4B4 = 26, + // D3DFMT_R3G3B2 = 27, + // D3DFMT_A8 = 28, + // D3DFMT_A8R3G3B2 = 29, + // D3DFMT_X4R4G4B4 = 30, + // D3DFMT_A2B10G10R10 = 31, + D3DFMT_A8B8G8R8 = 32, + // D3DFMT_X8B8G8R8 = 33, + // D3DFMT_G16R16 = 34, + // D3DFMT_A2R10G10B10 = 35, + // D3DFMT_A16B16G16R16 = 36, + + // D3DFMT_A8P8 = 40, + // D3DFMT_P8 = 41, + + // D3DFMT_L8 = 50, + // D3DFMT_A8L8 = 51, + // D3DFMT_A4L4 = 52, + + // D3DFMT_V8U8 = 60, + // D3DFMT_L6V5U5 = 61, + // D3DFMT_X8L8V8U8 = 62, + // D3DFMT_Q8W8V8U8 = 63, + // D3DFMT_V16U16 = 64, + // D3DFMT_A2W10V10U10 = 67, + + // D3DFMT_UYVY = MAKEFOURCC("UYVY"), + // D3DFMT_R8G8_B8G8 = MAKEFOURCC("RGBG"), + // D3DFMT_YUY2 = MAKEFOURCC("YUY2"), + // D3DFMT_G8R8_G8B8 = MAKEFOURCC("GRGB"), + + D3DFMT_DXT1 = MAKEFOURCC("DXT1"), + D3DFMT_DXT2 = MAKEFOURCC("DXT2"), + D3DFMT_DXT3 = MAKEFOURCC("DXT3"), + D3DFMT_DXT4 = MAKEFOURCC("DXT4"), + D3DFMT_DXT5 = MAKEFOURCC("DXT5"), // Not worth support dx9-style files, these don't even hold srgb state D3DFMT_ATI1 = MAKEFOURCC("ATI1"), D3DFMT_BC4U = MAKEFOURCC("BC4U"), D3DFMT_BC4S = MAKEFOURCC("BC4S"), - + D3DFMT_ATI2 = MAKEFOURCC("ATI2"), D3DFMT_BC5U = MAKEFOURCC("BC5U"), D3DFMT_BC5S = MAKEFOURCC("BC5S"), - -// D3DFMT_D16_LOCKABLE = 70, -// D3DFMT_D32 = 71, -// D3DFMT_D15S1 = 73, -// D3DFMT_D24S8 = 75, -// D3DFMT_D24X8 = 77, -// D3DFMT_D24X4S4 = 79, -// D3DFMT_D16 = 80, -// -// D3DFMT_D32F_LOCKABLE = 82, -// D3DFMT_D24FS8 = 83, + + // D3DFMT_D16_LOCKABLE = 70, + // D3DFMT_D32 = 71, + // D3DFMT_D15S1 = 73, + // D3DFMT_D24S8 = 75, + // D3DFMT_D24X8 = 77, + // D3DFMT_D24X4S4 = 79, + // D3DFMT_D16 = 80, + // + // D3DFMT_D32F_LOCKABLE = 82, + // D3DFMT_D24FS8 = 
83, //D3DFMT_D32_LOCKABLE = 84, //D3DFMT_S8_LOCKABLE = 85, -// D3DFMT_L16 = 81, -// -// D3DFMT_VERTEXDATA =100, -// D3DFMT_INDEX16 =101, -// D3DFMT_INDEX32 =102, + // D3DFMT_L16 = 81, + // + // D3DFMT_VERTEXDATA =100, + // D3DFMT_INDEX16 =101, + // D3DFMT_INDEX32 =102, //D3DFMT_Q16W16V16U16 =110, //D3DFMT_MULTI2_ARGB8 = MAKEFOURCC("MET1"), - D3DFMT_R16F = 111, - D3DFMT_G16R16F = 112, - D3DFMT_A16B16G16R16F = 113, + D3DFMT_R16F = 111, + D3DFMT_G16R16F = 112, + D3DFMT_A16B16G16R16F = 113, - D3DFMT_R32F = 114, - D3DFMT_G32R32F = 115, - D3DFMT_A32B32G32R32F = 116, + D3DFMT_R32F = 114, + D3DFMT_G32R32F = 115, + D3DFMT_A32B32G32R32F = 116, -// D3DFMT_CxV8U8 = 117, + // D3DFMT_CxV8U8 = 117, //D3DFMT_A1 = 118, //D3DFMT_A2B10G10R10_XR_BIAS = 119, //D3DFMT_BINARYBUFFER = 199, - D3DFMT_FORCE_DWORD =0x7fffffff + D3DFMT_FORCE_DWORD = 0x7fffffff }; -enum DDS_FLAGS : uint32_t -{ - +enum DDS_FLAGS : uint32_t { + DDSD_HEIGHT = 0x00000002, - DDSD_DEPTH = 0x00800000, + DDSD_DEPTH = 0x00800000, DDSD_WIDTH = 0x00000004, DDSD_LINEARSIZE = 0x00080000, DDSD_PITCH = 0x00000008, - DDSD_CAPS = 0x00000001, + DDSD_CAPS = 0x00000001, DDSD_PIXELFORMAT = 0x00001000, DDSD_MIPMAPCOUNT = 0x00020000, // ddspf DDSPF_ALPHAPIXELS = 0x00000001, - DDSPF_FOURCC = 0x00000004, - DDSPF_RGB = 0x00000040, - DDSPF_LUMINANCE = 0x00020000, // dx9 - DDSPF_ALPHA = 0x00000002, // dx9 + DDSPF_FOURCC = 0x00000004, + DDSPF_RGB = 0x00000040, + DDSPF_LUMINANCE = 0x00020000, // dx9 + DDSPF_ALPHA = 0x00000002, // dx9 //DDSPF_BUMPDUDV = 0x00080000, - + // caps DDSCAPS_TEXTURE = 0x00001000, - DDSCAPS_MIPMAP = 0x00400000, + DDSCAPS_MIPMAP = 0x00400000, DDSCAPS_COMPLEX = 0x00000008, - + // caps2 DDSCAPS2_VOLUME = 0x200000, DDSCAPS2_CUBEMAP_ALLFACES = 0x0000FA00, // DDSCAPS2_CUBEMAP | all faces DDSCAPS2_CUBEMAP = 0x00000200, // DDSCAPS2_CUBEMAP - + DDS_RESOURCE_MISC_TEXTURECUBE = 0x4, - + // resourceDimension DDS_DIMENSION_TEXTURE1D = 2, DDS_DIMENSION_TEXTURE2D = 3, DDS_DIMENSION_TEXTURE3D = 4, - + FOURCC_DX10 = MAKEFOURCC("DX10"), - + // dx10 misc2 flags DDS_ALPHA_MODE_UNKNOWN = 0, DDS_ALPHA_MODE_STRAIGHT = 1, @@ -163,164 +161,161 @@ enum DDS_FLAGS : uint32_t DDS_ALPHA_MODE_CUSTOM = 4, }; -struct DDS_PIXELFORMAT -{ - uint32_t size; - uint32_t flags; - uint32_t fourCC; - uint32_t RGBBitCount; - uint32_t RBitMask; - uint32_t GBitMask; - uint32_t BBitMask; - uint32_t ABitMask; +struct DDS_PIXELFORMAT { + uint32_t size; + uint32_t flags; + uint32_t fourCC; + uint32_t RGBBitCount; + uint32_t RBitMask; + uint32_t GBitMask; + uint32_t BBitMask; + uint32_t ABitMask; }; -struct DDS_HEADER -{ - uint32_t size; - uint32_t flags; - uint32_t height; - uint32_t width; - uint32_t pitchOrLinearSize; - uint32_t depth; // only if DDS_HEADER_FLAGS_VOLUME is set in flags - uint32_t mipMapCount; - uint32_t reserved1[11]; +struct DDS_HEADER { + uint32_t size; + uint32_t flags; + uint32_t height; + uint32_t width; + uint32_t pitchOrLinearSize; + uint32_t depth; // only if DDS_HEADER_FLAGS_VOLUME is set in flags + uint32_t mipMapCount; + uint32_t reserved1[11]; DDS_PIXELFORMAT ddspf; - uint32_t caps; - uint32_t caps2; - uint32_t caps3; - uint32_t caps4; - uint32_t reserved2; + uint32_t caps; + uint32_t caps2; + uint32_t caps3; + uint32_t caps4; + uint32_t reserved2; }; -struct DDS_HEADER_DXT10 -{ - uint32_t /*DXGI_FORMAT*/ dxgiFormat; - uint32_t resourceDimension; - uint32_t miscFlag; // see D3D11_RESOURCE_MISC_FLAG - uint32_t arraySize; - uint32_t miscFlags2; +struct DDS_HEADER_DXT10 { + uint32_t /*DXGI_FORMAT*/ dxgiFormat; + uint32_t resourceDimension; + uint32_t miscFlag; 
// see D3D11_RESOURCE_MISC_FLAG + uint32_t arraySize; + uint32_t miscFlags2; }; // DX9 bitmask parsing adapted from GetPixelFormat() call here https://github.com/microsoft/DirectXTex/blob/main/DDSTextureLoader/DDSTextureLoader12.cpp static MyMTLPixelFormat getMetalFormatFromDDS9(const DDS_PIXELFORMAT& ddpf) { - // Copyright (c) Microsoft Corporation. - // Licensed under the MIT License. - #define ISBITMASK( r,g,b,a ) ( ddpf.RBitMask == r && ddpf.GBitMask == g && ddpf.BBitMask == b && ddpf.ABitMask == a ) +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +#define ISBITMASK(r, g, b, a) (ddpf.RBitMask == r && ddpf.GBitMask == g && ddpf.BBitMask == b && ddpf.ABitMask == a) - if (ddpf.flags & DDSPF_RGB) - { + if (ddpf.flags & DDSPF_RGB) { // Note that sRGB formats are written using the "DX10" extended header // here would need to force the format to an srgb format from cli - switch (ddpf.RGBBitCount) - { - case 32: - if (ISBITMASK(0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000)) - { - return MyMTLPixelFormatRGBA8Unorm; - } + switch (ddpf.RGBBitCount) { + case 32: + if (ISBITMASK(0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000)) { + return MyMTLPixelFormatRGBA8Unorm; + } - if (ISBITMASK(0xffffffff, 0, 0, 0)) - { - // Only 32-bit color channel format in D3D9 was R32F - return MyMTLPixelFormatR32Float; // D3DX writes this out as a FourCC of 114 - } - break; + if (ISBITMASK(0xffffffff, 0, 0, 0)) { + // Only 32-bit color channel format in D3D9 was R32F + return MyMTLPixelFormatR32Float; // D3DX writes this out as a FourCC of 114 + } + break; - case 8: - // NVTT versions 1.x wrote this as RGB instead of LUMINANCE - if (ISBITMASK(0xff, 0, 0, 0)) - { - return MyMTLPixelFormatR8Unorm; - } + case 8: + // NVTT versions 1.x wrote this as RGB instead of LUMINANCE + if (ISBITMASK(0xff, 0, 0, 0)) { + return MyMTLPixelFormatR8Unorm; + } - // No 3:3:2 or paletted DXGI formats aka D3DFMT_R3G3B2, D3DFMT_P8 - break; + // No 3:3:2 or paletted DXGI formats aka D3DFMT_R3G3B2, D3DFMT_P8 + break; } } - else if (ddpf.flags & DDSPF_LUMINANCE) - { + else if (ddpf.flags & DDSPF_LUMINANCE) { // TODO: need rrrg swizzle on these - switch (ddpf.RGBBitCount) - { - case 16: - if (ISBITMASK(0x00ff, 0, 0, 0xff00)) - { - return MyMTLPixelFormatRG8Unorm; // D3DX10/11 writes this out as DX10 extension - } - break; + switch (ddpf.RGBBitCount) { + case 16: + if (ISBITMASK(0x00ff, 0, 0, 0xff00)) { + return MyMTLPixelFormatRG8Unorm; // D3DX10/11 writes this out as DX10 extension + } + break; - case 8: - if (ISBITMASK(0xff, 0, 0, 0)) - { - return MyMTLPixelFormatR8Unorm; // D3DX10/11 writes this out as DX10 extension - } + case 8: + if (ISBITMASK(0xff, 0, 0, 0)) { + return MyMTLPixelFormatR8Unorm; // D3DX10/11 writes this out as DX10 extension + } - // No DXGI format maps to ISBITMASK(0x0f,0,0,0xf0) aka D3DFMT_A4L4 + // No DXGI format maps to ISBITMASK(0x0f,0,0,0xf0) aka D3DFMT_A4L4 - if (ISBITMASK(0x00ff, 0, 0, 0xff00)) - { - return MyMTLPixelFormatRG8Unorm; // Some DDS writers assume the bitcount should be 8 instead of 16 - } - break; + if (ISBITMASK(0x00ff, 0, 0, 0xff00)) { + return MyMTLPixelFormatRG8Unorm; // Some DDS writers assume the bitcount should be 8 instead of 16 + } + break; } } - else if (ddpf.flags & DDSPF_ALPHA) - { - if (8 == ddpf.RGBBitCount) - { + else if (ddpf.flags & DDSPF_ALPHA) { + if (8 == ddpf.RGBBitCount) { // TODO: need rrrr swizzle return MyMTLPixelFormatR8Unorm; // really A8, but use a swizzle } } - else if (ddpf.flags & DDSPF_FOURCC) - { - switch (ddpf.fourCC) - { - case 
D3DFMT_DXT1: return MyMTLPixelFormatBC1_RGBA; + else if (ddpf.flags & DDSPF_FOURCC) { + switch (ddpf.fourCC) { + case D3DFMT_DXT1: + return MyMTLPixelFormatBC1_RGBA; //case D3DFMT_DXT2: return MyMTLPixelFormatBC2_RGBA; // isPremul //case D3DFMT_DXT3: return MyMTLPixelFormatBC2_RGBA; - case D3DFMT_DXT4: return MyMTLPixelFormatBC3_RGBA; // isPremul - case D3DFMT_DXT5: return MyMTLPixelFormatBC3_RGBA; - - case D3DFMT_ATI1: return MyMTLPixelFormatBC4_RUnorm; - case D3DFMT_BC4U: return MyMTLPixelFormatBC4_RUnorm; - case D3DFMT_BC4S: return MyMTLPixelFormatBC4_RSnorm; - - case D3DFMT_ATI2: return MyMTLPixelFormatBC5_RGUnorm; - case D3DFMT_BC5U: return MyMTLPixelFormatBC5_RGUnorm; - case D3DFMT_BC5S: return MyMTLPixelFormatBC5_RGSnorm; - - case D3DFMT_R16F: return MyMTLPixelFormatR16Float; - case D3DFMT_G16R16F: return MyMTLPixelFormatRG16Float; - case D3DFMT_A16B16G16R16F: return MyMTLPixelFormatRGBA16Float; - - case D3DFMT_R32F: return MyMTLPixelFormatR32Float; - case D3DFMT_G32R32F: return MyMTLPixelFormatRG32Float; - case D3DFMT_A32B32G32R32F: return MyMTLPixelFormatRGBA32Float; + case D3DFMT_DXT4: + return MyMTLPixelFormatBC3_RGBA; // isPremul + case D3DFMT_DXT5: + return MyMTLPixelFormatBC3_RGBA; + + case D3DFMT_ATI1: + return MyMTLPixelFormatBC4_RUnorm; + case D3DFMT_BC4U: + return MyMTLPixelFormatBC4_RUnorm; + case D3DFMT_BC4S: + return MyMTLPixelFormatBC4_RSnorm; + + case D3DFMT_ATI2: + return MyMTLPixelFormatBC5_RGUnorm; + case D3DFMT_BC5U: + return MyMTLPixelFormatBC5_RGUnorm; + case D3DFMT_BC5S: + return MyMTLPixelFormatBC5_RGSnorm; + + case D3DFMT_R16F: + return MyMTLPixelFormatR16Float; + case D3DFMT_G16R16F: + return MyMTLPixelFormatRG16Float; + case D3DFMT_A16B16G16R16F: + return MyMTLPixelFormatRGBA16Float; + + case D3DFMT_R32F: + return MyMTLPixelFormatR32Float; + case D3DFMT_G32R32F: + return MyMTLPixelFormatRG32Float; + case D3DFMT_A32B32G32R32F: + return MyMTLPixelFormatRGBA32Float; } - } return MyMTLPixelFormatInvalid; - #undef ISBITMASK +#undef ISBITMASK } bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool isInfoOnly) { const uint32_t magicSize = sizeof(uint32_t); uint32_t mipDataOffset = magicSize + sizeof(DDS_HEADER); - + if (dataSize <= mipDataOffset) { KLOGE("kram", "bad dataSize too small %zu <= %d", dataSize, mipDataOffset); return false; } - + const uint32_t& magic = *(const uint32_t*)data; const DDS_HEADER& hdr = *(const DDS_HEADER*)(data + magicSize); const DDS_PIXELFORMAT& format = hdr.ddspf; - + if (magic != DDS_MAGIC) { KLOGE("kram", "bad magic number 0x%08X", magic); return false; @@ -334,16 +329,16 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool KLOGE("kram", "bad format size %d", format.size); return false; } - + // this flag must be set even though just using fourcc to indicate DX10 if ((format.flags & DDSPF_FOURCC) == 0) { KLOGE("kram", "missing format.fourCC flag"); return false; } - + bool isDDS10 = format.fourCC == FOURCC_DX10; const DDS_HEADER_DXT10& hdr10 = *(const DDS_HEADER_DXT10*)(data + magicSize + sizeof(DDS_HEADER)); - + MyMTLPixelFormat pixelFormat = MyMTLPixelFormatInvalid; if (isDDS10) { mipDataOffset += sizeof(DDS_HEADER_DXT10); @@ -352,30 +347,30 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool else { pixelFormat = getMetalFormatFromDDS9(format); } - + // Kram only supports a subset of DDS formats if (pixelFormat == MyMTLPixelFormatInvalid) { KLOGE("kram", "unsupported dds format"); return false; } - + // make sure to copy mips/slices from 
DDS array-ordered to mip-ordered for KTX uint32_t width = (hdr.flags & DDSD_WIDTH) ? hdr.width : 1; uint32_t height = (hdr.flags & DDSD_HEIGHT) ? hdr.height : 1; uint32_t depth = (hdr.flags & DDSD_DEPTH) ? hdr.depth : 1; - + uint32_t mipCount = (hdr.flags & DDSD_MIPMAPCOUNT) ? hdr.mipMapCount : 1; uint32_t arrayCount = 1; - + if (isDDS10) { arrayCount = hdr10.arraySize; } - + // make sure that counts are reasonable const uint32_t kMaxMipCount = 16; const uint32_t kMaxTextureSize = 1u << (kMaxMipCount - 1); // 32K - const uint32_t kMaxArrayCount = 2*1024; - + const uint32_t kMaxArrayCount = 2 * 1024; + if (width > kMaxTextureSize) { KLOGE("kram", "bad dimension width %d", width); return false; @@ -396,7 +391,7 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool KLOGE("kram", "bad dimension height %d", arrayCount); return false; } - + // does mipCount = 0 mean automip? if (width == 0) width = 1; @@ -404,20 +399,20 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool height = 1; if (depth == 0) depth = 1; - + if (mipCount == 0) mipCount = 1; if (arrayCount == 0) arrayCount = 1; - + bool isCube = false; bool isArray = arrayCount > 1; bool isPremul = false; - + if (isDDS10) { isCube = (hdr10.miscFlag & DDS_RESOURCE_MISC_TEXTURECUBE); - - switch(hdr10.resourceDimension) { + + switch (hdr10.resourceDimension) { case DDS_DIMENSION_TEXTURE1D: image.textureType = MyMTLTextureType1DArray; isArray = true; // kram doesn't support 1d @@ -439,7 +434,7 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool } else { isArray = false; - + if (hdr.flags & DDSD_DEPTH) { image.textureType = MyMTLTextureType3D; } @@ -447,29 +442,29 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool image.textureType = MyMTLTextureTypeCube; } } - + // transfer premul setting, would like to not depend on "info" to carry this if (isPremul) image.addChannelProps("Alb.ra,Alb.ga,Alb.ba,Alb.a"); - + //------------- - + // TODO: may need to fix these to KTX conventions first image.width = width; image.height = height; image.depth = depth; - + auto& ktxHdr = image.header; - ktxHdr.pixelWidth = image.width; + ktxHdr.pixelWidth = image.width; ktxHdr.pixelHeight = image.height; - ktxHdr.pixelDepth = image.depth; - + ktxHdr.pixelDepth = image.depth; + ktxHdr.initFormatGL(pixelFormat); - + ktxHdr.numberOfFaces = isCube ? 
6 : 1; ktxHdr.numberOfMipmapLevels = mipCount; ktxHdr.numberOfArrayElements = arrayCount; - + // fix up the values, so that can convert header properly to type in info // TODO: this means image and ktx header don't match if (!isArray) @@ -485,46 +480,46 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool return false; } } - + // make sure derived type lines up if (ktxHdr.metalTextureType() != image.textureType) { KLOGE("kram", "unsupported textureType"); return false; } - + image.pixelFormat = pixelFormat; - + // allocate data image.initMipLevels(mipDataOffset); - + // Skip allocating the pixels if (!isInfoOnly) { image.reserveImageData(); - + uint8_t* dstImageData = image.imageData().data(); const uint8_t* srcImageData = data + mipDataOffset; - + size_t srcOffset = 0; for (uint32_t chunkNum = 0; chunkNum < image.totalChunks(); ++chunkNum) { for (uint32_t mipNum = 0; mipNum < image.mipCount(); ++mipNum) { // memcpy from src to dst size_t dstOffset = image.chunkOffset(mipNum, chunkNum); size_t mipLength = image.mipLevels[mipNum].length; - + if ((mipDataOffset + srcOffset + mipLength) > dataSize) { KLOGE("kram", "source image data incomplete"); return false; } - + memcpy(dstImageData + dstOffset, srcImageData + srcOffset, mipLength); - + srcOffset += mipLength; } } } - + // Now have a valid KTX or KTX2 file from the DDS - + return true; } @@ -535,46 +530,46 @@ bool DDSHelper::save(const KTXImage& image, FileHelper& fileHelper) // be compressed when writing to DDS, so KTX conversion is simpler. if (image.isSupercompressed()) return false; - + // Can only write out if matching format in DDS if (directxType(image.pixelFormat) == MyMTLPixelFormatInvalid) return false; - + // https://docs.microsoft.com/en-us/windows/win32/direct3ddds/dds-header - + // lots of headers, this is newer dx10 style dds DDS_HEADER hdr = {}; DDS_PIXELFORMAT& format = hdr.ddspf; DDS_HEADER_DXT10 hdr10 = {}; - + hdr.size = sizeof(DDS_HEADER); format.size = sizeof(DDS_PIXELFORMAT); - + hdr.width = image.width; hdr.height = image.height; hdr.depth = image.depth; - + hdr.mipMapCount = image.mipCount(); - + hdr.caps |= DDSCAPS_TEXTURE; if (image.mipCount() > 1) { hdr.caps |= DDSCAPS_MIPMAP; hdr.flags |= DDSD_MIPMAPCOUNT; } - + // indicate this is newer dds file with pixelFormat // important to set FOURCC flag format.fourCC = FOURCC_DX10; format.flags |= DDSPF_FOURCC; - + hdr.flags |= DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT; - + if (hdr.depth > 1) hdr.flags |= DDSD_DEPTH; - + if (isBlockFormat(image.pixelFormat)) { hdr.flags |= DDSD_LINEARSIZE; - + // This is assuming BC 4x4 blocks hdr.pitchOrLinearSize = image.blockDims().x * blockSizeOfFormat(image.pixelFormat); } @@ -582,21 +577,21 @@ bool DDSHelper::save(const KTXImage& image, FileHelper& fileHelper) hdr.flags |= DDSD_PITCH; hdr.pitchOrLinearSize = image.blockDims().x * blockSizeOfFormat(image.pixelFormat); } - + hdr10.arraySize = image.arrayCount(); hdr10.dxgiFormat = directxType(image.pixelFormat); - + switch (image.textureType) { case MyMTLTextureType1DArray: hdr.caps |= DDSCAPS_COMPLEX; - + hdr10.resourceDimension = DDS_DIMENSION_TEXTURE1D; break; case MyMTLTextureTypeCube: case MyMTLTextureTypeCubeArray: hdr.caps |= DDSCAPS_COMPLEX; hdr.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALLFACES; - + hdr10.miscFlag = DDS_RESOURCE_MISC_TEXTURECUBE; hdr10.resourceDimension = DDS_DIMENSION_TEXTURE2D; break; @@ -612,17 +607,17 @@ bool DDSHelper::save(const KTXImage& image, FileHelper& fileHelper) case MyMTLTextureType3D: hdr.caps |= 
DDSCAPS_COMPLEX; hdr.caps2 = DDSCAPS2_VOLUME; - + hdr10.resourceDimension = DDS_DIMENSION_TEXTURE3D; break; } - + // fill out in the format fields if (!isBlockFormat(image.pixelFormat)) { if (isColorFormat(image.pixelFormat)) { bool hasG = numChannelsOfFormat(image.pixelFormat) >= 2; bool hasB = numChannelsOfFormat(image.pixelFormat) >= 3; - + format.flags |= DDSPF_RGB; // supposed to include alpha bits too format.RGBBitCount = blockSizeOfFormat(image.pixelFormat) * 8; @@ -635,7 +630,7 @@ bool DDSHelper::save(const KTXImage& image, FileHelper& fileHelper) format.ABitMask = 0xff000000; } } - + // set premul state // The legacy D3DX 10 and D3DX 11 utility libraries will fail to load any .DDS file with miscFlags2 not equal to zero. if (image.isPremul()) { @@ -646,38 +641,35 @@ bool DDSHelper::save(const KTXImage& image, FileHelper& fileHelper) } // TODO: also hdr10.miscFlags2 |= DDS_ALPHA_MODE_OPAQUE (alpha full opaque) // TODO: also hdr10.miscFlags2 |= DDS_ALPHA_MODE_CUSTOM (raw data in alpha) - + bool success = true; - + success = success && fileHelper.write((const uint8_t*)&DDS_MAGIC, sizeof(DDS_MAGIC)); success = success && fileHelper.write((const uint8_t*)&hdr, sizeof(hdr)); success = success && fileHelper.write((const uint8_t*)&hdr10, sizeof(hdr10)); - + if (success) { // Now write the mip data out in the order dds expects // Ugh, dds stores each array item mips, then the next array item mips. const uint8_t* imageData = image.fileData; for (uint32_t chunkNum = 0; chunkNum < image.totalChunks(); ++chunkNum) { - for (uint32_t mipNum = 0; mipNum < image.mipCount(); ++mipNum) { size_t offset = image.chunkOffset(mipNum, chunkNum); size_t mipLength = image.mipLevels[mipNum].length; - + success = fileHelper.write(imageData + offset, mipLength); if (!success) { break; } } - + if (!success) { break; } } } - + return success; } - - -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramDDSHelper.h b/libkram/kram/KramDDSHelper.h index 0038e891..6f25f273 100644 --- a/libkram/kram/KramDDSHelper.h +++ b/libkram/kram/KramDDSHelper.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -10,7 +10,7 @@ //#include "KramConfig.h" namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; class KTXImage; class FileHelper; @@ -31,4 +31,4 @@ class DDSHelper { bool save(const KTXImage& image, FileHelper& fileHelper); }; -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramFileHelper.cpp b/libkram/kram/KramFileHelper.cpp index a1b87126..fbc52f1a 100644 --- a/libkram/kram/KramFileHelper.cpp +++ b/libkram/kram/KramFileHelper.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
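A quick size check on the DDS structures above clarifies the mipDataOffset math in DDSHelper::load: the magic word is 4 bytes, the legacy DDS_HEADER is 124 bytes, and the optional DX10 extension adds 20 more. Illustrative static asserts against field-for-field copies of the layouts shown in this file:

#include <cstdint>

struct DDS_PIXELFORMAT { uint32_t size, flags, fourCC, RGBBitCount, RBitMask, GBitMask, BBitMask, ABitMask; };
struct DDS_HEADER {
    uint32_t size, flags, height, width, pitchOrLinearSize, depth, mipMapCount;
    uint32_t reserved1[11];
    DDS_PIXELFORMAT ddspf;
    uint32_t caps, caps2, caps3, caps4, reserved2;
};
struct DDS_HEADER_DXT10 { uint32_t dxgiFormat, resourceDimension, miscFlag, arraySize, miscFlags2; };

static_assert(sizeof(DDS_PIXELFORMAT) == 32, "pixel format block is 32 bytes");
static_assert(sizeof(DDS_HEADER) == 124, "legacy header is 124 bytes");
static_assert(sizeof(DDS_HEADER_DXT10) == 20, "DX10 extension is 20 bytes");

// So mip data starts at 4 + 124 = 128 bytes in DX9-style files,
// or at 4 + 124 + 20 = 148 bytes when the DX10 header is present.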
@@ -15,13 +15,13 @@ #include "tmpfileplus.h" -#if KRAM_MAC || KRAM_IOS || KRAM_LINUX -#include // for getpagesize() +#if KRAM_APPLE || KRAM_LINUX +#include // for getpagesize() #endif #if KRAM_WIN -#include // direct-ory for _mkdir, _rmdir -#include // for GetNativeSystemInfo() +#include // direct-ory for _mkdir, _rmdir +#include // for GetNativeSystemInfo() // Windows mkdir doesn't take permission #define mkdir(fname, permission) _mkdir(fname) @@ -29,7 +29,7 @@ #endif namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; #define nl "\n" @@ -45,7 +45,7 @@ static void mkdirRecursive(char* path) if (*path != '\0' && mkdir(path, 0755) && errno != EEXIST) { KLOGE("kram", "error while trying to create '%s'" nl "%s" nl, - path, strerror(errno)); // same as %m + path, strerror(errno)); // same as %m } } @@ -65,7 +65,7 @@ static FILE* fopen_mkdir(const char* path, const char* mode) FileHelper::~FileHelper() { close(); } // no current extension -bool FileHelper::openTemporaryFile(const char* suffix, const char* access) +bool FileHelper::openTemporaryFile(const char* prefix, const char* suffix, const char* access) { close(); @@ -82,7 +82,7 @@ bool FileHelper::openTemporaryFile(const char* suffix, const char* access) int keep = 0; // Note: can't pass . either, always opened as rw - _fp = tmpfileplus("/tmp/", "kramimage-", suffix, &pathname, keep); + _fp = tmpfileplus("/tmp/", prefix, suffix, &pathname, keep); if (!_fp) { return false; } @@ -124,14 +124,16 @@ size_t FileHelper::pagesize() { static size_t pagesize = 0; if (pagesize == 0) { -#if KRAM_MAC || KRAM_IOS || KRAM_LINUX +#if KRAM_APPLE || KRAM_LINUX pagesize = getpagesize(); #elif KRAM_WIN + // win has mostly 4k, then 1MB/2MB large page size SYSTEM_INFO systemInfo; GetNativeSystemInfo(&systemInfo); pagesize = systemInfo.dwPageSize; #else - pagesize = 4 * 1024; // how to determine on Win/Linux? + // TODO: Android 15 has variable page size (16K and 4K) + pagesize = 4 * 1024; // how to determine on Win/Android? 
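Aside: callers of FileHelper::pagesize() typically round buffer or mapping lengths up to a page boundary. A small standalone sketch of that round-up, assuming the page size is a power of two (true for the 4K/16K/1MB/2MB sizes mentioned above); the helper name is illustrative, not part of kram:

#include <cstddef>
#include <cstdio>

// Round length up to the next multiple of pageSize.
// Valid only when pageSize is a power of two.
static size_t alignToPage(size_t length, size_t pageSize) {
    return (length + pageSize - 1) & ~(pageSize - 1);
}

int main() {
    size_t pageSize = 4 * 1024; // stand-in for FileHelper::pagesize()
    printf("%zu\n", alignToPage(1, pageSize));    // 4096
    printf("%zu\n", alignToPage(4096, pageSize)); // 4096
    printf("%zu\n", alignToPage(4097, pageSize)); // 8192
    return 0;
}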
#endif } return pagesize; @@ -139,8 +141,10 @@ size_t FileHelper::pagesize() bool FileHelper::copyTemporaryFileTo(const char* dstFilename) { - if (!_fp) return false; - if (_filename.empty()) return false; + if (!_fp) + return false; + if (_filename.empty()) + return false; // since we're not closing, need to flush output fflush(_fp); @@ -196,6 +200,8 @@ bool FileHelper::open(const char* filename, const char* access) { close(); + _filename = filename; + if (strstr(access, "w") != nullptr) { _fp = fopen_mkdir(filename, access); } @@ -207,7 +213,6 @@ bool FileHelper::open(const char* filename, const char* access) return false; } - _filename = filename; return true; } @@ -217,7 +222,7 @@ void FileHelper::close() return; } - // temp files are auto-deleted on fclose, since they've been "keep" is 0 + // temp files are auto-deleted on fclose when "keep" is 0 fclose(_fp); _isTmpFile = false; @@ -255,9 +260,8 @@ bool FileHelper::exists(const char* filename) const bool FileHelper::isDirectory(const char* filename) const { struct stat stats; - if( stat(filename,&stats) == 0 ) - { - if( stats.st_mode & S_IFDIR ) + if (stat(filename, &stats) == 0) { + if (stats.st_mode & S_IFDIR) return true; } return false; @@ -290,4 +294,4 @@ uint64_t FileHelper::modificationTimestamp(const char* filename) return stats.st_mtime; } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramFileHelper.h b/libkram/kram/KramFileHelper.h index 84e6902f..9d076937 100644 --- a/libkram/kram/KramFileHelper.h +++ b/libkram/kram/KramFileHelper.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -12,23 +12,24 @@ //#include "KramConfig.h" namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // Use this to help open/close files, since dtor is scoped, or caller can close() -// Also allows write to temp file, then rename over the destination file. This -// avoids leaving unfinished files around when +// Also allows write to temp file, then rename over the destination file. +// This avoids leaving unfinished files around. class FileHelper { public: ~FileHelper(); + bool isOpen() const { return _fp != nullptr; } bool isDirectory(const char* filename) const; - + bool exists(const char* filename) const; bool open(const char* filename, const char* access); // this file is auto-deleted by close(), is that okay with renameFile use? - bool openTemporaryFile(const char* suffix, const char* access); + bool openTemporaryFile(const char* prefix, const char* suffix, const char* access); // mainly for tmp files, file can be closed, but this does rename of tmp file. // may fail if tmp file and dst are different volumes. @@ -54,10 +55,13 @@ class FileHelper { static size_t pagesize(); + // Can retreive if open called (even on failure) + const string& filename() const { return _filename; } + private: FILE* _fp = nullptr; string _filename; bool _isTmpFile = false; }; -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramFileIO.cpp b/libkram/kram/KramFileIO.cpp new file mode 100644 index 00000000..fa43fd14 --- /dev/null +++ b/libkram/kram/KramFileIO.cpp @@ -0,0 +1,162 @@ +// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. 
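Aside: a hedged usage sketch of the safe-save pattern the FileHelper class comment describes (write to a temp file, then copy/rename over the destination so a crash never leaves a truncated output file). Only calls visible in this diff are used; the prefix, suffix, destination path, and the "w+b" access string are illustrative assumptions:

#include "KramFileHelper.h"

using namespace kram;

// Write to a temp file first, then copy it over the destination.
static bool safeSave(const uint8_t* data, size_t dataSize, const char* dstPath)
{
    FileHelper helper;

    // prefix/suffix/access strings here are made up for illustration
    if (!helper.openTemporaryFile("kram-example-", ".bin", "w+b"))
        return false;

    if (!helper.write(data, dataSize))
        return false;

    // copies (or renames) the temp file onto dstPath; temp file is
    // auto-deleted when the helper closes
    return helper.copyTemporaryFileTo(dstPath);
}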
+ +#include "KramFileIO.h" + +//#include +//#include "KramFileHelper.h" + +//#include + +// TODO: move to common header +#define nl "\n" + +namespace kram { +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; + +void FileIO::writePad(int paddingSize) +{ + if (paddingSize <= 1) { + return; + } + + constexpr int maxPaddingSize = 16; + constexpr uint8_t padding[maxPaddingSize] = {0}; + + paddingSize = std::max(paddingSize, maxPaddingSize); + + size_t dataSize = tell(); + + // pad to 16 byte alignment + size_t valuePadding = (paddingSize - 1) - ((dataSize + (paddingSize - 1)) % paddingSize); + + // write padding out + writeArray8u(padding, (int)valuePadding); +} + +void FileIO::readPad(int paddingSize) +{ + if (paddingSize <= 1) { + return; + } + constexpr int maxPaddingSize = 16; + uint8_t padding[maxPaddingSize] = {0}; + + paddingSize = std::max(paddingSize, maxPaddingSize); + + size_t dataSize = tell(); + + // pad to paddingSize + size_t valuePadding = (paddingSize - 1) - ((dataSize + (paddingSize - 1)) % paddingSize); + + // skip padding + readArray8u(padding, (int)valuePadding); +} + +int FileIO::tell() +{ + if (isFile() && fp) { + return (int)ftell(fp); + } + else if (isData() && _data) { + return dataLocation; + } + else if (isMemory() && mem) { + return dataLocation; + } + else { + KASSERT(false); + return 0; + } +} +void FileIO::seek(int tell_) +{ + if (tell_ < 0) { + KASSERT(false); + return; + } + + if (isFile() && fp) { + fseek(fp, (size_t)tell_, SEEK_SET); + } + else if (isData() && _data) { + dataLocation = STL_NAMESPACE::clamp(tell_, 0, dataLength); + } + else if (isMemory() && mem) { + dataLocation = STL_NAMESPACE::clamp(tell_, 0, dataLength); + } + else { + KASSERT(false); + } +} + +void FileIO::read(void* data_, int size, int count) +{ + size_t numberOfBytes = size * count; + if (isFile() && fp) { + size_t readBytes = fread(data_, 1, numberOfBytes, fp); + if (readBytes != numberOfBytes) { + _isFailed = true; + } + } + else if (isData() && _data) { + if (dataLocation + numberOfBytes <= dataLength) { + memcpy(data_, _data + dataLocation, numberOfBytes); + dataLocation += numberOfBytes; + } + else { + _isFailed = true; + } + } + else if (isMemory() && mem) { + if (dataLocation + numberOfBytes <= dataLength) { + memcpy(data_, _data + dataLocation, numberOfBytes); + dataLocation += numberOfBytes; + } + else { + _isFailed = true; + } + } +} + +void FileIO::write(const void* data_, int size, int count) +{ + if (_isReadOnly) { + KASSERT(false); + return; + } + + int numberOfBytes = size * count; + if (isFile() && fp) { + size_t writeBytes = fwrite(data_, 1, numberOfBytes, fp); + if (writeBytes != numberOfBytes) { + _isFailed = true; + } + } + else if (isData() && _data) { + if (dataLocation + numberOfBytes <= dataLength) { + memcpy(const_cast(_data) + dataLocation, data_, numberOfBytes); + dataLocation += numberOfBytes; + } + else { + _isFailed = true; + } + } + else if (isMemory() && mem) { + int totalBytes = dataLocation + numberOfBytes; + if (totalBytes <= dataLength) { + mem->resize(totalBytes); + _data = mem->data(); + dataLength = totalBytes; + } + + // TOOD: handle resize failure? + + memcpy(const_cast(_data) + dataLocation, data_, numberOfBytes); + dataLocation += numberOfBytes; + } +} + +} //namespace kram diff --git a/libkram/kram/KramFileIO.h b/libkram/kram/KramFileIO.h new file mode 100644 index 00000000..be5d4e91 --- /dev/null +++ b/libkram/kram/KramFileIO.h @@ -0,0 +1,171 @@ +// kram - Copyright 2020-2023 by Alec Miller. 
- MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +#include +#include // for FILE + +namespace kram { +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; + +// Unifies binary reads/writes from/to a mmap, buffer, or file pointer. +struct FileIO { +private: + FILE* fp = nullptr; + + // can point mmap to this + const uint8_t* _data = nullptr; + int dataLength = 0; + int dataLocation = 0; + + bool _isReadOnly = false; + bool _isResizeable = false; + bool _isFailed = false; + + // dynamic vector + vector* mem = nullptr; + +public: + // FileIO doesn't deal with lifetime of the incoming data + // may eventually have helpers return a FileIO object + + // read/write to file + FileIO(FILE* fp_, bool isReadOnly = false) + : fp(fp_), _isReadOnly(isReadOnly), _isResizeable(!isReadOnly) + { + } + + // fixed data area for reads/writes, reads are for mmap + FileIO(const uint8_t* data_, int dataLength_) + : _data(data_), dataLength(dataLength_), _isReadOnly(true) + { + } + FileIO(uint8_t* data_, int dataLength_) + : _data(data_), dataLength(dataLength_), _isReadOnly(false) + { + } + + // read/write and resizable memory + FileIO(const vector* mem_) + : _data(mem_->data()), dataLength((int)mem_->size()), _isReadOnly(false), _isResizeable(true) + { + } + FileIO(vector* mem_) + : mem(mem_), dataLength((int)mem_->size()), _isReadOnly(false), _isResizeable(true) + { + } + + bool isFile() const { return fp != nullptr; } + bool isData() const { return _data != nullptr; } + bool isMemory() const { return mem != nullptr; } + bool isFailed() const { return _isFailed; } + + void writeArray32u(const uint32_t* data, int count) { write(data, sizeof(uint32_t), count); } + void writeArray16u(const uint16_t* data, int count) { write(data, sizeof(uint16_t), count); } + void writeArray8u(const uint8_t* data, int count) { write(data, sizeof(uint8_t), count); } + + void writeArray32i(const int32_t* data, int count) { write(data, sizeof(int32_t), count); } + void writeArray16i(const int16_t* data, int count) { write(data, sizeof(int16_t), count); } + void writeArray8i(const int8_t* data, int count) { write(data, sizeof(int8_t), count); } + + // API has to be explicit on writes about type - signed vs. unsigned due to promotion + // could use const & instead but then can't enforce types written out + // might switch to writeArray8 + void write32u(uint32_t data) { writeArray32u(&data, 1); } + void write16u(uint16_t data) { writeArray16u(&data, 1); } + void write8u(uint8_t data) { writeArray8u(&data, 1); } + + void write32i(int32_t data) { writeArray32i(&data, 1); } + void write16i(int16_t data) { writeArray16i(&data, 1); } + void write8i(int8_t data) { writeArray8i(&data, 1); } + + void writeArray32f(const float* data, int count) { write(data, sizeof(float), count); } + void write32f(float data) { writeArray32f(&data, 1); } + + void writePad(int paddingSize); + + void readPad(int paddingSize); + + // simd too? 
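Aside: a hedged round-trip sketch of the FileIO API declared above, using the fixed-buffer constructors shown in this diff. The writePad/readPad formula in the .cpp, pad = (align - 1) - ((tell() + align - 1) % align), yields 0..align-1 bytes and lands the offset on the next multiple of align; the concrete values in the comments below follow from that formula:

#include "KramFileIO.h"

using namespace kram;

void fileIORoundTrip()
{
    uint8_t storage[64] = {};

    // fixed-size write target (non-const constructor)
    FileIO out(storage, (int)sizeof(storage));
    out.write32u(0x12345678u); // tell() == 4
    out.write16u(42);          // tell() == 6
    out.writePad(16);          // writes 10 zero bytes, tell() == 16

    // read back through the read-only constructor
    FileIO in((const uint8_t*)storage, (int)sizeof(storage));
    uint32_t a = 0;
    uint16_t b = 0;
    in.read32u(a);
    in.read16u(b);
    in.readPad(16);
    // a == 0x12345678, b == 42, in.tell() == 16, !in.isFailed()
}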
+#if USE_SIMDLIB +#if SIMD_FLOAT + void write32fx2(float2 v) { writeArray32fx2(&v, 1); } + void write32fx3(float3 v) { writeArray32fx3(&v, 1); } + void write32fx4(float4 v) { writeArray32fx4(&v, 1); } + + void writeArray32fx2(const float2* v, int count) { writeArray32f((const float*)v, 2*count); } + void writeArray32fx3(const float3p* v, int count) { writeArray32f((const float*)v, 3); } + void writeArray32fx4(const float4* v, int count) { writeArray32f((const float*)v, 4*count); } + + // TODO: add read calls + // TODO: handle float3 to float3p +#endif + +#if SIMD_INT + void write32ix2(int2 v) { writeArray32ix2(&v, 1); } + void write32ix3(int3 v) { writeArray32ix3(&v, 1); } + void write32ix4(int4 v) { writeArray32ix4(&v, 1); } + + void writeArray32ix2(const int2* v, int count) { writeArray32i((const int32_t*)v, 2*count); } + void writeArray32ix3(const int3p* v, int count) { writeArray32i((const int32_t*)v, 3*count); } + void writeArray32ix4(const int4* v, int count) { writeArray32i((const int32_t*)v, 4*count); } + +#endif + +#if SIMD_SHORT + void write16ix2(short2 v) { writeArray16ix2(&v, 1); } + void write16ix3(short3 v) { writeArray16ix3(&v, 1); } + void write16ix4(short4 v) { writeArray16ix4(&v, 1); } + + void writeArray16ix2(const short2* v, int count) { writeArray16i((const short*)v, 2*count); } + void writeArray16ix3(const short3p* v, int count) { writeArray16i((const short*)v, 3*count); } + void writeArray16ix4(const short4* v, int count) { writeArray16i((const short*)v, 4*count); } +#endif + +#if SIMD_CHAR + void write8ix2(char2 v) { writeArray8ix2(&v, 1); } + void write8ix3(char3 v) { writeArray8ix3(&v, 1); } + void write8ix4(char4 v) { writeArray8ix4(&v, 1); } + + void writeArray8ix2(const char2* v, int count) { writeArray8i((const int8_t*)v, 2*count); } + void writeArray8ix3(const char3p* v, int count) { writeArray8i((const int8_t*)v, 3*count); } + void writeArray8ix4(const char4* v, int count) { writeArray8i((const int8_t*)v, 4*count); } +#endif +#endif + + void readArray32f(float* data, int count) { read(data, sizeof(float), count); } + void read32f(float& data) { readArray32f(&data, 1); } + + void readArray32u(uint32_t* data, int count) { read(data, sizeof(uint32_t), count); } + void readArray16u(uint16_t* data, int count) { read(data, sizeof(uint16_t), count); } + void readArray8u(uint8_t* data, int count) { read(data, sizeof(uint8_t), count); } + + void read32u(uint32_t& data) { readArray32u(&data, 1); } + void read16u(uint16_t& data) { readArray16u(&data, 1); } + void read8u(uint8_t& data) { readArray8u(&data, 1); } + + void readArray32i(int32_t* data, int count) { read(data, sizeof(int32_t), count); } + void readArray16i(int16_t* data, int count) { read(data, sizeof(int16_t), count); } + void readArray8i(int8_t* data, int count) { read(data, sizeof(int8_t), count); } + + void read32i(int32_t& data) { readArray32i(&data, 1); } + void read16i(int16_t& data) { readArray16i(&data, 1); } + void read8i(int8_t& data) { readArray8i(&data, 1); } + + // seek/tell + int tell(); + void seek(int tell_); + +private: + // binary reads/writes + void read(void* data_, int size, int count); + void write(const void* data_, int size, int count); +}; + +// to better distinguish mmap/buffer io +using DataIO = FileIO; + +} //namespace kram diff --git a/libkram/kram/KramFmt.h b/libkram/kram/KramFmt.h index 8bb43060..45d0022a 100644 --- a/libkram/kram/KramFmt.h +++ b/libkram/kram/KramFmt.h @@ -1,7 +1,9 @@ -// kram - Copyright 2020-2023 by Alec Miller. 
- MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. +#if 1 // !KRAM_VISION // this is breaking link on visionOS + #pragma once #include @@ -10,7 +12,6 @@ //#include "KramConfig.h" #include "KramLog.h" - #include "core.h" // really fmt/core.h #include "format.h" // really fmt/format.h - for FMT_STRING @@ -20,8 +21,8 @@ namespace kram { int32_t logMessage(const char* group, int32_t logLevel, - const char* file, int32_t line, const char* func, - fmt::string_view format, fmt::format_args args); + const char* file, int32_t line, const char* func, + fmt::string_view format, fmt::format_args args); // This is a way to convert to single function call, so handling // can be buriend within that. @@ -66,4 +67,6 @@ int32_t append_sprintf_fmt(string& s, const S& format, Args&&... args) return append_sprintf_impl(s, format, fmt::make_format_args(args...)); } -} // namespace kram +} // namespace kram + +#endif diff --git a/libkram/kram/KramImage.cpp b/libkram/kram/KramImage.cpp index eeea2f2d..954315c7 100644 --- a/libkram/kram/KramImage.cpp +++ b/libkram/kram/KramImage.cpp @@ -1,25 +1,24 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #include "KramImage.h" - #if COMPILE_ATE -#include "ateencoder.h" // astc/bc encoder, apple only +#include "ateencoder.h" // astc/bc encoder, apple only #endif #if COMPILE_ETCENC -#include "EtcImage.h" // etc encoder +#include "EtcImage.h" // etc encoder #endif #if COMPILE_SQUISH -#include "squish.h" // bc encoder +#include "squish.h" // bc encoder #endif #if COMPILE_COMP -#include "bc6h_encode.h" // bc encoder -#include "bc6h_decode.h" // bc decoder +#include "bc6h_decode.h" // bc decoder +#include "bc6h_encode.h" // bc encoder #endif #if COMPILE_BCENC @@ -28,12 +27,12 @@ #define RGBCX_USE_SMALLER_TABLES 1 #include "bc7decomp.h" -#include "bc7enc.h" // bc encoder +#include "bc7enc.h" // bc encoder #include "rgbcx.h" #endif #if COMPILE_ASTCENC -#include "astcenc.h" // astc encoder +#include "astcenc.h" // astc encoder // hack to improve block generation on L1 and LA encoding //extern thread_local int32_t gAstcenc_UniqueChannelsInPartitioning; @@ -61,8 +60,8 @@ namespace kram { -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; template void pointFilterImage(int32_t w, int32_t h, const T* srcImage, @@ -139,7 +138,7 @@ bool Image::convertToFourChannel(const KTXImage& image, uint32_t mipNumber) { if (mipNumber >= image.mipLevels.size()) return false; - + const auto& srcMipLevel = image.mipLevels[mipNumber]; // copy the data into a contiguous array @@ -189,7 +188,7 @@ bool Image::convertToFourChannel(const KTXImage& image, uint32_t mipNumber) for (int32_t x = 0; x < _width; ++x) { int32_t srcX = (y0 + x) * numSrcChannels; - int32_t dstX = (y0 + x); // * numDstChannels; + int32_t dstX = (y0 + x); // * numDstChannels; for (int32_t i = 0; i < numSrcChannels; ++i) { *(&dstTemp.r + i) = srcPixels[srcX + i]; @@ -216,7 +215,7 @@ bool Image::convertToFourChannel(const KTXImage& image, uint32_t mipNumber) const half* srcPixels = (const half*)(srcLevelData + mipBaseOffset); - half4 dstTemp = toHalf4(float4m(0.0f, 0.0f, 0.0f, 1.0f)); + half4 dstTemp = half4m(float4m(0.0f, 0.0f, 0.0f, 1.0f)); for 
(int32_t y = 0; y < _height; ++y) { int32_t y0 = y * _width; @@ -227,11 +226,11 @@ bool Image::convertToFourChannel(const KTXImage& image, uint32_t mipNumber) // copy in available values for (int32_t i = 0; i < numSrcChannels; ++i) { - dstTemp.v[i] = srcPixels[srcX + i]; + dstTemp[i] = srcPixels[srcX + i]; } // use AVX to convert - dstPixels[dstX] = toFloat4(dstTemp); + dstPixels[dstX] = float4m(dstTemp); } } break; @@ -299,11 +298,11 @@ bool Image::convertToFourChannelForThumbnail(const KTXImage& image, uint32_t mip { if (mipNumber >= image.mipLevels.size()) return false; - + const auto& srcMipLevel = image.mipLevels[mipNumber]; - + uint32_t chunkCount = chunksY(); - + // copy the data into a contiguous array // a verticaly chunked image, will be converted to chunks in encode uint32_t width, height, depth; @@ -351,7 +350,7 @@ bool Image::convertToFourChannelForThumbnail(const KTXImage& image, uint32_t mip for (int32_t x = 0; x < _width; ++x) { int32_t srcX = (y0 + x) * numSrcChannels; - int32_t dstX = (y0 + x); // * numDstChannels; + int32_t dstX = (y0 + x); // * numDstChannels; for (int32_t i = 0; i < numSrcChannels; ++i) { *(&dstTemp.r + i) = srcPixels[srcX + i]; @@ -378,7 +377,7 @@ bool Image::convertToFourChannelForThumbnail(const KTXImage& image, uint32_t mip const half* srcPixels = (const half*)(srcLevelData + mipBaseOffset); - half4 dstTemp = toHalf4(float4m(0.0f, 0.0f, 0.0f, 1.0f)); + half4 dstTemp = half4m(float4m(0.0f, 0.0f, 0.0f, 1.0f)); for (int32_t y = 0; y < _height; ++y) { int32_t y0 = y * _width; @@ -389,12 +388,12 @@ bool Image::convertToFourChannelForThumbnail(const KTXImage& image, uint32_t mip // copy in available values for (int32_t i = 0; i < numSrcChannels; ++i) { - dstTemp.v[i] = srcPixels[srcX + i]; + dstTemp[i] = srcPixels[srcX + i]; } // use AVX to convert // This is a simple saturate to unorm8 - dstPixels[dstX] = ColorFromUnormFloat4(toFloat4(dstTemp)); + dstPixels[dstX] = ColorFromUnormFloat4(float4m(dstTemp)); } } break; @@ -451,9 +450,9 @@ bool Image::loadImageFromPixels(const vector& pixels, int32_t width, // if true, then don't actually know this unless walk the pixels. // Format can also affect this, since 1/2 channel don't have color or alpha. - _hasColor = hasColor; // grayscale or no rgb when false + _hasColor = hasColor; // grayscale or no rgb when false _hasAlpha = hasAlpha; - + // always assumes 4 rgba8 channels // _pixels.resize(4 * _width * _height); assert((int32_t)pixels.size() == (width * height)); @@ -477,8 +476,7 @@ void Image::setSrgbState(bool isSrgb, bool hasSrgbBlock, bool hasNonSrgbBlocks) // being set ot 0. This runs counter to ASTC L+A mode though which eliminates // the endpoint storage. void KramEncoder::averageChannelsInBlock( - const char* averageChannels, const KTXImage& image, ImageData& srcImage -) const // otherwise, it's BlueAlpha averaging + const char* averageChannels, const KTXImage& image, ImageData& srcImage) const // otherwise, it's BlueAlpha averaging { int32_t w = srcImage.width; int32_t h = srcImage.height; @@ -494,9 +492,9 @@ void KramEncoder::averageChannelsInBlock( // these then don't affect the fitting, but do affect endpoint storage (f.e. 
// RGB instead of L+A) must be done before the encode due to complexity of // BC6/7 and ASTC - + Int2 blockDims = image.blockDims(); - + for (int32_t yy = 0; yy < h; yy += blockDims.y) { for (int32_t xx = 0; xx < w; xx += blockDims.x) { // compute clamped blockDims @@ -566,7 +564,7 @@ static bool writeDataAtOffset(const uint8_t* data, size_t dataSize, size_t dataO bool KramDecoder::decode(const KTXImage& srcImage, FILE* dstFile, const KramDecoderParams& params) const { - KTXImage dstImage; // thrown out, data written to file + KTXImage dstImage; // thrown out, data written to file return decodeImpl(srcImage, dstFile, dstImage, params); } @@ -578,7 +576,7 @@ bool KramDecoder::decode(const KTXImage& srcImage, KTXImage& dstImage, const Kra bool KramDecoder::decodeBlocks( int32_t w, int32_t h, const uint8_t* blockData, uint32_t blockDataSize, MyMTLPixelFormat blockFormat, - vector& outputTexture, // currently Color + vector& outputTexture, // currently Color const KramDecoderParams& params) const { bool success = false; @@ -587,7 +585,7 @@ bool KramDecoder::decodeBlocks( // or may want to disable if decoders don't gen correct output TexEncoder decoder = params.decoder; MyMTLTextureType textureType = MyMTLTextureType2D; // Note: this is a lie to get decode to occur - + if (!validateFormatAndDecoder(textureType, blockFormat, decoder)) { KLOGE("Kram", "block decode only supports specific block types"); return false; @@ -625,9 +623,9 @@ bool KramDecoder::decodeBlocks( bool isVerbose = params.isVerbose; const string& swizzleText = params.swizzleText; bool isHDR = isHdrFormat(blockFormat); - + // start decoding after format pulled from KTX file - if (isExplicitFormat(blockFormat)) { + if (isExplicitFormat(blockFormat)) { // Could convert r/rg/rgb/rgba8 and 16f/32f to rgba8u image for png 8-bit output // for now just copying these to ktx format which supports these formats } @@ -656,21 +654,20 @@ bool KramDecoder::decodeBlocks( // Clear to 0001 // TODO: could only do for bc4/5 Color pixels[blockDim * blockDim] = {}; - for (uint32_t i = 0, iEnd = blockDim*blockDim; i < iEnd; ++i) - { + for (uint32_t i = 0, iEnd = blockDim * blockDim; i < iEnd; ++i) { pixels[i].a = 255; } - + // TODO: need this for bc4/5/6sn on other decoders (ate + squish) // but have to run through all blocks before passing. Here doing one block // at a time. EAC_R11/RG11sn already do this conversion on decode. - + // Switch from unorm to snorm if needed uint16_t* e0; uint16_t* e1; e0 = (uint16_t*)&srcBlock[0]; - + if (blockFormat == MyMTLPixelFormatBC4_RSnorm) { // 2 8-bit endpoints remapFromSignedBCEndpoint88(*e0); @@ -678,11 +675,11 @@ bool KramDecoder::decodeBlocks( else if (blockFormat == MyMTLPixelFormatBC5_RGSnorm) { // 4 8-bit endpoints remapFromSignedBCEndpoint88(*e0); - - e1 = (uint16_t*)&srcBlock[4*2]; + + e1 = (uint16_t*)&srcBlock[4 * 2]; remapFromSignedBCEndpoint88(*e1); } - + // decode into temp 4x4 pixels success = true; @@ -697,13 +694,13 @@ bool KramDecoder::decodeBlocks( // Returns false if the block uses 3 color punchthrough alpha mode. 
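Aside: the decode loop that follows walks 4x4 blocks over an image whose dimensions need not be multiples of 4 and clamps the copy-out at the right and bottom edges. A standalone sketch of that pattern (not kram code; the decoded block contents are left empty here):

#include <cstdint>
#include <vector>

struct RGBA8 { uint8_t r, g, b, a; };

// out must already hold w*h pixels.
void copyDecodedBlocks(int w, int h, std::vector<RGBA8>& out)
{
    const int blockDim = 4;
    int blocksX = (w + blockDim - 1) / blockDim;
    int blocksY = (h + blockDim - 1) / blockDim;

    for (int by = 0; by < blocksY; ++by) {
        for (int bx = 0; bx < blocksX; ++bx) {
            // a real decoder would fill this from the compressed block
            RGBA8 pixels[blockDim * blockDim] = {};

            for (int py = 0; py < blockDim; ++py) {
                int yy = by * blockDim + py;
                if (yy >= h) break; // clamp at bottom edge
                for (int px = 0; px < blockDim; ++px) {
                    int xx = bx * blockDim + px;
                    if (xx >= w) break; // clamp at right edge
                    out[yy * w + xx] = pixels[py * blockDim + px];
                }
            }
        }
    }
}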
rgbcx::unpack_bc3(srcBlock, pixels); break; - + // writes r packed case MyMTLPixelFormatBC4_RSnorm: case MyMTLPixelFormatBC4_RUnorm: rgbcx::unpack_bc4(srcBlock, (uint8_t*)pixels); break; - + // writes rg packed case MyMTLPixelFormatBC5_RGSnorm: case MyMTLPixelFormatBC5_RGUnorm: @@ -720,10 +717,10 @@ bool KramDecoder::decodeBlocks( for (uint32_t i = 0; i < 16; ++i) { srcBlockForDecompress[i] = srcBlock[i]; } - + BC6HBlockDecoder decoderCompressenator; decoderCompressenator.DecompressBlock(pixelsFloat, srcBlockForDecompress); - + // losing snorm and chopping to 8-bit for (uint32_t i = 0; i < 16; ++i) { pixels[i] = ColorFromUnormFloat4(*(const float4*)&pixelsFloat[i]); @@ -731,7 +728,7 @@ bool KramDecoder::decodeBlocks( break; } #endif - + case MyMTLPixelFormatBC7_RGBAUnorm: case MyMTLPixelFormatBC7_RGBAUnorm_sRGB: bc7decomp::unpack_bc7(srcBlock, (bc7decomp::color_rgba*)pixels); @@ -757,7 +754,7 @@ bool KramDecoder::decodeBlocks( for (int32_t bx = 0; bx < blockDim; ++bx) { int32_t xx = x + bx; if (xx >= w) { - break; // go to next y above + break; // go to next y above } const Color& c = pixels[by * blockDim + bx]; @@ -801,7 +798,7 @@ bool KramDecoder::decodeBlocks( // only handles bc1,3,4,5 // TODO: colors still don't look correct on rs, rgs. Above it always requests unorm. squish::DecompressImage(outputTexture.data(), w, h, srcData, format); - + success = true; } } @@ -809,7 +806,7 @@ bool KramDecoder::decodeBlocks( #if COMPILE_ATE else if (useATE) { ATEEncoder encoder; - + // TODO: colors still don't look correct on rs, rgs // docs mention needing to pass float pixels for snorm, but always using unorm decode format now success = encoder.Decode(blockFormat, blockDataSize, blockDims.y, @@ -877,7 +874,7 @@ bool KramDecoder::decodeBlocks( astcenc_image dstImageASTC; dstImageASTC.dim_x = w; dstImageASTC.dim_y = h; - dstImageASTC.dim_z = 1; // Not using 3D blocks, not supported on iOS + dstImageASTC.dim_z = 1; // Not using 3D blocks, not supported on iOS //dstImageASTC.dim_pad = 0; dstImageASTC.data_type = ASTCENC_TYPE_U8; @@ -888,9 +885,9 @@ bool KramDecoder::decodeBlocks( uint32_t srcDataLength = blockDataSize; astcenc_profile profile; - profile = ASTCENC_PRF_LDR; // isSrgb ? ASTCENC_PRF_LDR_SRGB : ASTCENC_PRF_LDR; + profile = ASTCENC_PRF_LDR; // isSrgb ? 
ASTCENC_PRF_LDR_SRGB : ASTCENC_PRF_LDR; if (isHDR) { - profile = ASTCENC_PRF_HDR; // TODO: also ASTCENC_PRF_HDR_RGB_LDR_A + profile = ASTCENC_PRF_HDR; // TODO: also ASTCENC_PRF_HDR_RGB_LDR_A } astcenc_config config; @@ -960,7 +957,7 @@ bool KramDecoder::decodeImpl(const KTXImage& srcImage, FILE* dstFile, KTXImage& // setup dstImage //KTXImage dstImage; - dstImage = srcImage; // copy src (name-value pairs copied too) + dstImage = srcImage; // copy src (name-value pairs copied too) // important otherwise offsets are wrong if src is ktx2 if (srcImage.skipImageLength) { @@ -985,7 +982,7 @@ bool KramDecoder::decodeImpl(const KTXImage& srcImage, FILE* dstFile, KTXImage& dstHeader.initFormatGL(dstPixelFormat); dstImage.pixelFormat = dstPixelFormat; - dstImage.addFormatProps(); // update format prop + dstImage.addFormatProps(); // update format prop vector propsData; dstImage.toPropsData(propsData); @@ -1030,7 +1027,7 @@ bool KramDecoder::decodeImpl(const KTXImage& srcImage, FILE* dstFile, KTXImage& bool success = true; vector mipStorage; - mipStorage.resize(srcImage.mipLengthLargest() * numChunks); // enough to hold biggest mip + mipStorage.resize(srcImage.mipLengthLargest() * numChunks); // enough to hold biggest mip for (uint32_t i = 0; i < srcImage.mipLevels.size(); ++i) { // DONE: to decode compressed KTX2 want to walk all chunks of a single level @@ -1229,7 +1226,7 @@ enum KHR_DF_CHANNEL { // ETC2 //KHR_DF_CHANNEL_ETC2_RED = 0, KHR_DF_CHANNEL_ETC2_GREEN = 1, - KHR_DF_CHANNEL_ETC2_COLOR = 2, // RGB + KHR_DF_CHANNEL_ETC2_COLOR = 2, // RGB KHR_DF_CHANNEL_ETC2_ALPHA = 15, // ASTC @@ -1241,7 +1238,7 @@ enum KHR_DF_PRIMARIES { }; enum KHR_DF_TRANSFER { - KHR_DF_TRANSFER_LINEAR = 1, // ? + KHR_DF_TRANSFER_LINEAR = 1, // ? KHR_DF_TRANSFER_SRGB = 2, }; @@ -1255,8 +1252,8 @@ struct KTX2DescriptorChannelBlock { // 32-bits uint16_t bitOffset = 0; uint8_t bitLength = 0; - uint8_t channelType : 4; // RED, GREEN, BLUE, RRR, GGG - uint8_t FSEL : 4; // L is low bit - Float, Signed, Exponent, Linear (used on Alpha) + uint8_t channelType : 4; // RED, GREEN, BLUE, RRR, GGG + uint8_t FSEL : 4; // L is low bit - Float, Signed, Exponent, Linear (used on Alpha) // 32-bits uint8_t samplePositions[4] = {0}; @@ -1269,12 +1266,12 @@ struct KTX2DescriptorChannelBlock { struct KTX2DescriptorFileBlock { KTX2DescriptorFileBlock(MyMTLPixelFormat format, bool isPremul, bool isCompressed); - uint32_t totalSize = 0; // descriptorBlockSize + 4 + uint32_t totalSize = 0; // descriptorBlockSize + 4 uint32_t vendorID : 18; uint32_t descriptorType : 14; uint16_t versionNumber = 2; - uint16_t descriptorBlockSize = 0; // 24B + channels (doesn't include totalSize) + uint16_t descriptorBlockSize = 0; // 24B + channels (doesn't include totalSize) uint8_t colorModel = 0; uint8_t colorPrimaries = 0; @@ -1285,7 +1282,7 @@ struct KTX2DescriptorFileBlock { uint8_t bytesPlane[8] = {0}; // now 16 bytes for each channel present - KTX2DescriptorChannelBlock channels[4]; // max channels + KTX2DescriptorChannelBlock channels[4]; // max channels }; KTX2DescriptorFileBlock::KTX2DescriptorFileBlock(MyMTLPixelFormat format, bool isPremul, bool isCompressed) @@ -1331,12 +1328,12 @@ KTX2DescriptorFileBlock::KTX2DescriptorFileBlock(MyMTLPixelFormat format, bool i if (isFloat) { // This is for BC6H, TODO: might be half only so test for isHalf? 
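Aside: the size bookkeeping implied by the struct comments above (each channel block is 16 bytes, the descriptor block is 24 bytes plus the channel blocks, and totalSize adds the 4-byte totalSize field itself). Standalone arithmetic only, not kram code:

#include <cstdint>
#include <cassert>

int main()
{
    const uint32_t channelBlockSize = 16; // "now 16 bytes for each channel present"
    const uint32_t dfdHeaderSize = 24;    // "24B + channels (doesn't include totalSize)"

    for (uint32_t numChannels = 1; numChannels <= 4; ++numChannels) {
        uint32_t descriptorBlockSize = dfdHeaderSize + numChannels * channelBlockSize;
        uint32_t totalSize = descriptorBlockSize + 4; // "descriptorBlockSize + 4"
        assert(totalSize == 28 + numChannels * 16);
    }
    return 0;
}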
if (isSigned) { - c.sampleLower = 0xBF800000U; // -1.0f; - c.sampleUpper = 0x7F800000U; //  1.0f; + c.sampleLower = 0xBF800000U; // -1.0f; + c.sampleUpper = 0x7F800000U; //  1.0f; } else { - c.sampleLower = 0xBF800000U; //  -1.0f; - c.sampleUpper = 0x7F800000U; //   1.0f; + c.sampleLower = 0xBF800000U; //  -1.0f; + c.sampleUpper = 0x7F800000U; //   1.0f; } } else if (isSigned) { @@ -1347,7 +1344,7 @@ KTX2DescriptorFileBlock::KTX2DescriptorFileBlock(MyMTLPixelFormat format, bool i // set this since it applies to so many block formats channels[0].bitOffset = 0; - channels[0].bitLength = blockSize * 8 - 1; // needs to be split of channel bits + channels[0].bitLength = blockSize * 8 - 1; // needs to be split of channel bits switch (format) { case MyMTLPixelFormatBC1_RGBA: @@ -1446,9 +1443,9 @@ void KramEncoder::addBaseProps(const ImageInfo& info, KTXImage& dstImage) const if (info.swizzleText == "gggr") postSwizzleText = "ag01"; else if (info.swizzleText == "rrrg") - postSwizzleText = "ga01"; // or ra01 + postSwizzleText = "ga01"; // or ra01 else if (info.swizzleText == "rrr1") - postSwizzleText = "r001"; // to match up with BC4/EAC_R11 + postSwizzleText = "r001"; // to match up with BC4/EAC_R11 dstImage.addSwizzleProps(info.swizzleText.c_str(), postSwizzleText.c_str()); @@ -1466,11 +1463,10 @@ void KramEncoder::addBaseProps(const ImageInfo& info, KTXImage& dstImage) const dstImage.addChannelProps("Alb.r,Alb.g,Alb.b,Alb.a"); } } - else if (info.isSourcePremultiplied) - { + else if (info.isSourcePremultiplied) { dstImage.addChannelProps("Alb.ra,Alb.ga,Alb.ba,Alb.a"); } - + // TODO: texture encode can depend on wrap vs. clamp state (f.e. normal map gen, sdf) // and formsts like PVRTC must know wrap/clamp before encode // address: Wrap, Clamp, MirrorWrap, MirrorClamp, BorderClamp, BorderClamp0 @@ -1487,10 +1483,10 @@ void KramEncoder::addBaseProps(const ImageInfo& info, KTXImage& dstImage) const } if (info.doMipmaps) { - dstImage.addFilterProps("Lin,Lin,Lin"); // min,mag,mip + dstImage.addFilterProps("Lin,Lin,Lin"); // min,mag,mip } else { - dstImage.addFilterProps("Lin,Lin,X"); // min,mag,mip + dstImage.addFilterProps("Lin,Lin,X"); // min,mag,mip } // This is hash of source png/ktx file (use xxhash32 or crc32) @@ -1540,7 +1536,7 @@ bool KramEncoder::encodeImpl(ImageInfo& info, Image& singleImage, FILE* dstFile, // whd might be changed by initMipLevels based on min/max mip size dstImage.width = w; dstImage.height = h; - dstImage.depth = header.pixelDepth; // from validate above + dstImage.depth = header.pixelDepth; // from validate above dstImage.initMipLevels(info.doMipmaps, info.mipMinSize, info.mipMaxSize, info.mipSkip, mipConstructData.numSkippedMips); @@ -1577,12 +1573,12 @@ bool KramEncoder::saveKTX2(const KTXImage& srcImage, const KTX2Compressor& compr // TODO: move this propsData into KTXImage vector propsData; srcImage.toPropsData(propsData); - + // now convert from ktx1 to ktx2 const KTXHeader& header = srcImage.header; - + KTXImage dummyImage; // unused, just passed to reference - + KTX2Header header2; header2.vkFormat = vulkanType(srcImage.pixelFormat); @@ -1688,7 +1684,7 @@ bool KramEncoder::saveKTX2(const KTXImage& srcImage, const KTX2Compressor& compr // allocate big enough to hold entire uncompressed level vector compressedData; - compressedData.resize(mz_compressBound(ktx2Levels[0].length)); // largest mip + compressedData.resize(mz_compressBound(ktx2Levels[0].length)); // largest mip size_t compressedDataSize = 0; // reuse a context here @@ -1784,7 +1780,7 @@ bool 
KramEncoder::saveKTX2(const KTXImage& srcImage, const KTX2Compressor& compr return false; } } - + return true; } @@ -1817,7 +1813,7 @@ bool KramEncoder::writeKTX1FileOrImage( for (int32_t i = 0; i < (int32_t)dstImage.mipLevels.size(); ++i) { auto& level = dstImage.mipLevels[i]; - level.offset = lastMipOffset + 4; // offset by length + level.offset = lastMipOffset + 4; // offset by length lastMipOffset = level.offset + level.length * numChunks; } @@ -1850,10 +1846,11 @@ bool KramEncoder::writeKTX1FileOrImage( return true; } -bool KramEncoder::saveKTX1(const KTXImage& image, FILE* dstFile) const { +bool KramEncoder::saveKTX1(const KTXImage& image, FILE* dstFile) const +{ // write the header out KTXHeader headerCopy = image.header; - + // fixup header for 1d array if (image.textureType == MyMTLTextureType1DArray) { headerCopy.pixelHeight = 0; @@ -1862,59 +1859,59 @@ bool KramEncoder::saveKTX1(const KTXImage& image, FILE* dstFile) const { // This is unused KTXImage dummyImage; - + vector propsData; image.toPropsData(propsData); headerCopy.bytesOfKeyValueData = (uint32_t)vsizeof(propsData); - + uint32_t dstOffset = 0; - + if (!writeDataAtOffset((const uint8_t*)&headerCopy, sizeof(KTXHeader), 0, dstFile, dummyImage)) { return false; } dstOffset += sizeof(KTXHeader); - + // write out the props if (!writeDataAtOffset(propsData.data(), headerCopy.bytesOfKeyValueData, sizeof(KTXHeader), dstFile, dummyImage)) { return false; } dstOffset += headerCopy.bytesOfKeyValueData; - + // build and write out the mip data - + // This may not have been allocated, might be aliasing original const uint8_t* mipLevelData = image.fileData; const auto& mipLevels = image.mipLevels; - + // KTX writes largest mips first - + uint32_t numChunks = image.totalChunks(); for (uint32_t mipNum = 0; mipNum < image.mipCount(); ++mipNum) { // ktx weirdly writes size differently for cube, but not cube array // also this completely throws off block alignment uint32_t mipStorageSize = mipLevels[mipNum].length; uint32_t levelDataSize = mipStorageSize * numChunks; - + // cube stores size of one face, ugh if (image.textureType != MyMTLTextureTypeCube) { mipStorageSize *= numChunks; } - + size_t chunkOffset = image.chunkOffset(mipNum, 0); - + // write length of mip if (!writeDataAtOffset((const uint8_t*)&mipStorageSize, sizeof(uint32_t), dstOffset, dstFile, dummyImage)) { return false; } dstOffset += sizeof(uint32_t); - + // write the level pixels if (!writeDataAtOffset(mipLevelData + chunkOffset, levelDataSize, dstOffset, dstFile, dummyImage)) { return false; } dstOffset += levelDataSize; } - + return true; } @@ -1994,7 +1991,7 @@ bool KramEncoder::createMipsFromChunks( KTXImage& dstImage) const { Timer totalTimer; - + // ---------------------------------------------------- // set the structure fields and allocate it, only need enough to hold single @@ -2089,7 +2086,7 @@ bool KramEncoder::createMipsFromChunks( // so large mips even if clamped with -mipmax allocate to largest mip size (2k x 2k @16 = 64MB) // have to build the mips off that. srgb and premul is why fp32 is // needed, and need to downsample in linear space. 
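Aside: a standalone sketch of the KTX1 level layout math used in writeKTX1FileOrImage/saveKTX1 above: each level is preceded by a 4-byte imageSize field, and a level holds mipLength bytes for every chunk (array element/face/slice). The cube-map "size of one face" quirk noted above is ignored, and the mip lengths are made-up numbers for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    uint64_t numChunks = 6; // e.g. a cube map
    std::vector<uint64_t> mipLengths = {32768, 8192, 2048, 512, 128, 32, 8};

    uint64_t offset = 0; // relative to the start of the mip data
    for (size_t mip = 0; mip < mipLengths.size(); ++mip) {
        offset += 4; // imageSize field written before each level
        printf("mip %zu at offset %llu, levelSize %llu\n",
               mip, (unsigned long long)offset,
               (unsigned long long)(mipLengths[mip] * numChunks));
        offset += mipLengths[mip] * numChunks; // level data for all chunks
    }
    return 0;
}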
- + srcImage.pixelsHalf = halfImage.data(); } } @@ -2102,9 +2099,12 @@ bool KramEncoder::createMipsFromChunks( int32_t srcTopMipWidth = srcImage.width; int32_t srcTopMipHeight = srcImage.height; + // Need this to restore the pointers after mip gen + const ImageData srcImageSaved = srcImage; + for (int32_t chunk = 0; chunk < numChunks; ++chunk) { Timer timerBuildMips; - + // this needs to append before chunkOffset copy below w = srcTopMipWidth; h = srcTopMipHeight; @@ -2115,10 +2115,15 @@ bool KramEncoder::createMipsFromChunks( // reset these dimensions, or the mip mapping drops them to 1x1 srcImage.width = w; srcImage.height = h; - + + // restore the pointers + srcImage.pixels = srcImageSaved.pixels; + srcImage.pixelsHalf = srcImageSaved.pixelsHalf; + srcImage.pixelsFloat = srcImageSaved.pixelsFloat; + if (info.isHDR) { // TODO: should this support halfImage too? - + if (isMultichunk) { const float4* srcPixels = (const float4*)singleImage.pixelsFloat().data(); for (int32_t y = 0; y < h; ++y) { @@ -2171,21 +2176,21 @@ bool KramEncoder::createMipsFromChunks( // mipgen and encoding are separated. This simplifies mipFlood and // channel averaging. const int32_t numMipLevels = (int32_t)dstMipLevels.size(); - + vector dstMipImages; dstMipImages.resize(numMipLevels); - + // mip1...n are held here vector mipPixels; vector mipPixelsHalf; vector mipPixelsFloat; - + { ImageData dstImageData = srcImage; dstImageData.isSRGB = isSrgbFormat(info.pixelFormat); - + int32_t numSkippedMips = data.numSkippedMips; - + if (info.doSDF) { // count up pixels needed for all mips of this chunk uint32_t numPixels = 0; @@ -2196,23 +2201,23 @@ bool KramEncoder::createMipsFromChunks( mipDown(w, h, d, mipLevel + numSkippedMips); numPixels += w * h; } - + // now allocate enough memory to hold all the mips mipPixels.resize(numPixels); - + size_t pixelOffset = 0; for (int32_t mipLevel = 0; mipLevel < numMipLevels; ++mipLevel) { //const auto& dstMipLevel = dstMipLevels[mipLevel]; ImageData& dstMipImage = dstMipImages[mipLevel]; - + dstMipImage = dstImageData; // settings replaced in mipmap call dstMipImage.pixels = mipPixels.data() + pixelOffset; - + // sdf mipper has to build from largest sourceImage // but it can in-place write to the same dstImage // But not doing in-place mips anymore. 
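Aside: a standalone sketch of the pixel-count pass used above, assuming mipDown() halves each dimension per level and clamps at 1 (that behavior is an assumption, not shown in this diff). It also illustrates the "submips are only 1/3rd the memory of the main mip" comment:

#include <cstdint>
#include <cstdio>
#include <algorithm>

// Stand-in for kram's mipDown(): halve per level, clamp at 1.
static void mipDownSketch(int32_t& w, int32_t& h, int32_t level)
{
    w = std::max(1, w >> level);
    h = std::max(1, h >> level);
}

int main()
{
    int32_t topW = 256, topH = 128;
    int32_t numMipLevels = 9; // 256x128 down to 1x1

    uint32_t numPixels = 0;
    for (int32_t mipLevel = 1; mipLevel < numMipLevels; ++mipLevel) {
        int32_t w = topW, h = topH;
        mipDownSketch(w, h, mipLevel);
        numPixels += w * h;
    }
    // sub mips together need roughly 1/3 of the top mip's pixels
    printf("top mip %d px, sub mips %u px\n", topW * topH, numPixels);
    return 0;
}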
sdfMipper.mipmap(dstMipImage, mipLevel + numSkippedMips); - + // assumes depth = 1 pixelOffset += dstMipImage.width * dstMipImage.height; } @@ -2223,15 +2228,15 @@ bool KramEncoder::createMipsFromChunks( for (int32_t i = 0; i < numSkippedMips; ++i) { // have to build the submips even with skipMip mipper.mipmap(srcImage, dstImageData); - + // dst becomes src for next in-place mipmap srcImage = dstImageData; } } - + // allocate memory for mips dstMipImages[0] = dstImageData; - + // count up pixels needed for all sub mips of this chunk uint32_t numPixels = 0; for (int32_t mipLevel = 1; mipLevel < numMipLevels; ++mipLevel) { @@ -2241,7 +2246,7 @@ bool KramEncoder::createMipsFromChunks( mipDown(w, h, d, mipLevel + numSkippedMips); numPixels += w * h; } - + // This is more memory than in-place, but the submips // are only 1/3rd the memory of the main mip mipPixels.resize(numPixels); @@ -2249,40 +2254,40 @@ bool KramEncoder::createMipsFromChunks( mipPixelsFloat.resize(numPixels); else if (srcImage.pixelsHalf) mipPixelsHalf.resize(numPixels); - + size_t pixelOffset = 0; for (int32_t mipLevel = 1; mipLevel < numMipLevels; ++mipLevel) { ImageData& dstMipImage = dstMipImages[mipLevel]; dstMipImage.isSRGB = dstImageData.isSRGB; - + dstMipImage.pixels = mipPixels.data() + pixelOffset; if (srcImage.pixelsFloat) dstMipImage.pixelsFloat = mipPixelsFloat.data() + pixelOffset; else if (srcImage.pixelsHalf) dstMipImage.pixelsHalf = mipPixelsHalf.data() + pixelOffset; - + mipper.mipmap(srcImage, dstMipImage); - + // dst becomes src for next mipmap // preserve the isSRGB state bool isSRGBSrc = srcImage.isSRGB; - srcImage = dstMipImage; + srcImage = dstMipImage; // this is changing srcImage.pixels srcImage.isSRGB = isSRGBSrc; - + pixelOffset += dstMipImage.width * dstMipImage.height; } - + // Now can run mip flooding on image if (info.doMipflood) { mipper.mipflood(dstMipImages); } - + // apply average channels, now that unique mips bool isFloat = srcImage.pixelsHalf || srcImage.pixelsFloat; if (!info.averageChannels.empty() && !isFloat) { for (int32_t mipLevel = 0; mipLevel < numMipLevels; ++mipLevel) { ImageData& dstMipImage = dstMipImages[mipLevel]; - + // this isn't applied to srgb data (what about premul?) 
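Aside: a standalone sketch of the per-block channel averaging idea that averageChannelsInBlock applies here: replace a channel's values inside each block with their block average, so that channel no longer influences endpoint fitting but still encodes to something reasonable. Operates on one 4x4 block of RGBA8 pixels; illustrative only, not kram's implementation:

#include <cstdint>

struct RGBA8 { uint8_t r, g, b, a; };

void averageBlueInBlock(RGBA8 block[16])
{
    uint32_t sum = 0;
    for (int i = 0; i < 16; ++i)
        sum += block[i].b;

    uint8_t avg = (uint8_t)((sum + 8) / 16); // rounded average
    for (int i = 0; i < 16; ++i)
        block[i].b = avg;
}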
averageChannelsInBlock(info.averageChannels.c_str(), dstImage, dstMipImage); @@ -2290,15 +2295,15 @@ bool KramEncoder::createMipsFromChunks( } } } - + timerBuildMips.stop(); - + if (info.isVerbose) { KLOGI("Image", "Chunk %d source %d miplevels in %0.3fms\n", chunk, numMipLevels, - timerBuildMips.timeElapsedMillis() ); + timerBuildMips.timeElapsedMillis()); } - + //---------------------------------------------- for (int32_t mipLevel = 0; mipLevel < numMipLevels; ++mipLevel) { @@ -2307,7 +2312,7 @@ bool KramEncoder::createMipsFromChunks( w = dstImageData.width; h = dstImageData.height; - + // size of one mip, not levelSize = numChunks * mipStorageSize size_t mipStorageSize = dstMipLevel.length; @@ -2360,18 +2365,18 @@ bool KramEncoder::createMipsFromChunks( } } } - + if (info.isVerbose) { KLOGI("Image", "Total time in %0.3fms\n", - totalTimer.timeElapsedMillis() ); + totalTimer.timeElapsedMillis()); } - -// Timer test; -// test.stop(); -// -// KLOGI("Image", "Test time in %0.3fms\n", -// test.timeElapsedMillis() ); - + + // Timer test; + // test.stop(); + // + // KLOGI("Image", "Test time in %0.3fms\n", + // test.timeElapsedMillis() ); + return true; } @@ -2411,10 +2416,13 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, switch (count) { case 4: dst[count * i + 3] = src[i].a; + [[fallthrough]]; case 3: dst[count * i + 2] = src[i].b; + [[fallthrough]]; case 2: dst[count * i + 1] = src[i].g; + [[fallthrough]]; case 1: dst[count * i + 0] = src[i].r; } @@ -2435,15 +2443,18 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, // assumes we don't need to align r16f rows to 4 bytes for (int32_t i = 0, iEnd = w * h; i < iEnd; ++i) { - half4 src16 = toHalf4(src[0]); + half4 src16 = half4m(src[0]); switch (count) { case 4: dst[count * i + 3] = src16.w; + [[fallthrough]]; case 3: dst[count * i + 2] = src16.z; + [[fallthrough]]; case 2: dst[count * i + 1] = src16.y; + [[fallthrough]]; case 1: dst[count * i + 0] = src16.x; } @@ -2464,10 +2475,13 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, switch (count) { case 4: dst[count * i + 3] = src[i].w; + [[fallthrough]]; case 3: dst[count * i + 2] = src[i].z; + [[fallthrough]]; case 2: dst[count * i + 1] = src[i].y; + [[fallthrough]]; case 1: dst[count * i + 0] = src[i].x; } @@ -2502,7 +2516,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, // on color_grid-a.png to etc2rgb. So stop using it. The error calc is called // millions of times, so any work done there adds up. 
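Aside: the [[fallthrough]] annotations added above are intentional; the switch packs the highest present channel first and falls through to the lower ones, producing a tightly packed r/rg/rgba destination. A standalone sketch of the same pattern for an 8-bit destination:

#include <cstdint>

struct RGBA8 { uint8_t r, g, b, a; };

// count is the number of destination channels (1, 2, 3, or 4).
void packChannels(const RGBA8* src, uint8_t* dst, int count, int numPixels)
{
    for (int i = 0; i < numPixels; ++i) {
        switch (count) {
            case 4: dst[count * i + 3] = src[i].a; [[fallthrough]];
            case 3: dst[count * i + 2] = src[i].b; [[fallthrough]];
            case 2: dst[count * i + 1] = src[i].g; [[fallthrough]];
            case 1: dst[count * i + 0] = src[i].r;
        }
    }
}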
- bool useRec709 = false; // info.isColorWeighted; + bool useRec709 = false; // info.isColorWeighted; switch (info.pixelFormat) { case MyMTLPixelFormatEAC_R11Unorm: @@ -2556,7 +2570,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, Etc::Image::EncodingStatus status; // TODO: have encoder setting to enable multipass - bool doSinglepass = true; // || (effort == 100.0f); // problem is 100% quality also runs all passes + bool doSinglepass = true; // || (effort == 100.0f); // problem is 100% quality also runs all passes if (doSinglepass) { // single pass iterates each block until done status = imageEtc.EncodeSinglepass(effort, outputTexture.data.data()); @@ -2624,7 +2638,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, else if (info.quality <= 90) { uberLevel = 1; maxPartitions = 64; - bc7params.m_try_least_squares = true; // true = 0.7s on test case + bc7params.m_try_least_squares = true; // true = 0.7s on test case bc7params.m_mode17_partition_estimation_filterbank = true; } else { @@ -2693,17 +2707,17 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, // , so opaque textures repro as 254 alpha on Toof-a.png. // ate sets pbits on mode 6 for same block. Also fixed mip weights in non-pow2 mipper. - // bool doPrintBlock = false; - // if (bx == 8 && by == 1) { - // int32_t bp = 0; - // bp = bp; - // doPrintBlock = true; - // } + // bool doPrintBlock = false; + // if (bx == 8 && by == 1) { + // int32_t bp = 0; + // bp = bp; + // doPrintBlock = true; + // } // could tie to quality parameter, high quality uses the two // modes of bc3/4/5. bool useHighQuality = true; - + switch (info.pixelFormat) { case MyMTLPixelFormatBC1_RGBA: case MyMTLPixelFormatBC1_RGBA_sRGB: { @@ -2743,11 +2757,11 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, case MyMTLPixelFormatBC6H_RGBFloat: { CMP_BC6H_BLOCK_PARAMETERS options; options.isSigned = info.isSigned; - + BC6HBlockEncoder encoderCompressenator(options); - + // TODO: this needs HDR data - float srcPixelCopyFloat[16][4]; + float srcPixelCopyFloat[16][4]; for (int i = 0; i < 16; ++i) { srcPixelCopyFloat[i][0] = srcPixelCopy[i * 4 + 0]; srcPixelCopyFloat[i][1] = srcPixelCopy[i * 4 + 1]; @@ -2762,9 +2776,9 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, case MyMTLPixelFormatBC7_RGBAUnorm_sRGB: { bc7enc_compress_block(dstBlock, srcPixelCopy, &bc7params); - // if (doPrintBlock) { - // printBCBlock(dstBlock, info.pixelFormat); - // } + // if (doPrintBlock) { + // printBCBlock(dstBlock, info.pixelFormat); + // } break; } default: { @@ -2819,7 +2833,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, else if (info.useSquish) { static const float* noWeights = NULL; static const float perceptualWeights[3] = { - 0.2126f, 0.7152f, 0.0722f}; // weight g > r > b + 0.2126f, 0.7152f, 0.0722f}; // weight g > r > b const float* weights = info.isColorWeighted ? 
&perceptualWeights[0] : noWeights; @@ -2827,13 +2841,13 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, int32_t flags = 0; if (info.quality <= 10) - flags = squish::kColourRangeFit; // fast but inferior, only uses corner of color cube + flags = squish::kColourRangeFit; // fast but inferior, only uses corner of color cube else if (info.quality <= 90) - flags = squish::kColourClusterFit; // decent speed and quality, fits to best line + flags = squish::kColourClusterFit; // decent speed and quality, fits to best line else - flags = squish::kColourIterativeClusterFit; // very slow, but - // slighting better - // quality + flags = squish::kColourIterativeClusterFit; // very slow, but + // slighting better + // quality squish::TexFormat format = squish::kBC1; @@ -2939,7 +2953,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, enum ChannelType { kChannelTypeOneR1 = 1, kChannelTypeTwoAG = 2, - kChannelTypeTwoNormalAG = 3, // not channel count + kChannelTypeTwoNormalAG = 3, // not channel count kChannelTypeNormalFour = 4, }; @@ -2947,7 +2961,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, ChannelType channelType = kChannelTypeNormalFour; if (info.isNormal) { - channelType = kChannelTypeTwoNormalAG; // changes error metric + channelType = kChannelTypeTwoNormalAG; // changes error metric assert(info.swizzleText == "rrrg" || info.swizzleText == "gggr"); } else if (info.swizzleText == "rrrg" || info.swizzleText == "gggr") { @@ -2961,7 +2975,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, profile = info.isSRGBDst ? ASTCENC_PRF_LDR_SRGB : ASTCENC_PRF_LDR; if (info.isHDR) { profile = - ASTCENC_PRF_HDR; // TODO: also ASTCENC_PRF_HDR_RGB_LDR_A + ASTCENC_PRF_HDR; // TODO: also ASTCENC_PRF_HDR_RGB_LDR_A } // not generating 3d ASTC ever, even for 3D textures @@ -2971,7 +2985,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, uint32_t flags = 0; if (channelType == kChannelTypeTwoNormalAG) { - flags |= ASTCENC_FLG_MAP_NORMAL; // weights r and a channels only in error calc + flags |= ASTCENC_FLG_MAP_NORMAL; // weights r and a channels only in error calc } else if (info.isColorWeighted) { flags |= ASTCENC_FLG_USE_PERCEPTUAL; @@ -2995,23 +3009,23 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, // Using this option for compression give 10-20% more performance, depending on // block size, so is highly recommended. 
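Aside: a hedged sketch of the astcenc call sequence the encode path below relies on (config init, context alloc, compress, context free). Block size, quality, thread count, and the RGBA swizzle are illustrative choices; consult astcenc.h for the authoritative signatures:

#include <cstdint>
#include <vector>
#include "astcenc.h"

bool encodeOneMipASTC(uint8_t* rgba8, uint32_t w, uint32_t h,
                      std::vector<uint8_t>& outBlocks)
{
    astcenc_config config;
    if (astcenc_config_init(ASTCENC_PRF_LDR, 4, 4, 1, /*quality*/ 50.0f,
                            ASTCENC_FLG_SELF_DECOMPRESS_ONLY, &config) != ASTCENC_SUCCESS)
        return false;

    astcenc_context* context = nullptr;
    if (astcenc_context_alloc(&config, /*threadCount*/ 1, &context) != ASTCENC_SUCCESS)
        return false;

    astcenc_image image = {};
    image.dim_x = w;
    image.dim_y = h;
    image.dim_z = 1; // 2D blocks only, as in the encoder above
    image.data_type = ASTCENC_TYPE_U8;
    void* slices[1] = {rgba8}; // one 2D slice of RGBA8 data
    image.data = slices;

    astcenc_swizzle swizzle = {ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A};

    // 4x4 blocks, 16 bytes per block
    uint32_t blocksX = (w + 3) / 4, blocksY = (h + 3) / 4;
    outBlocks.resize(size_t(blocksX) * blocksY * 16);

    astcenc_error error = astcenc_compress_image(context, &image, &swizzle,
                                                 outBlocks.data(), outBlocks.size(),
                                                 /*threadIndex*/ 0);
    astcenc_context_free(context);
    return error == ASTCENC_SUCCESS;
}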
flags |= ASTCENC_FLG_SELF_DECOMPRESS_ONLY; - + // convert quality to present float quality = info.quality; - // ASTCENC_PRE_FAST; - // if (info.quality <= 10) { - // preset = ASTCENC_PRE_FAST; - // } - // else if (info.quality <= 50) { - // preset = ASTCENC_PRE_MEDIUM; - // } - // else if (info.quality < 90) { - // preset = ASTCENC_PRE_THOROUGH; - // } - // else { - // preset = ASTCENC_PRE_EXHAUSTIVE; - // } + // ASTCENC_PRE_FAST; + // if (info.quality <= 10) { + // preset = ASTCENC_PRE_FAST; + // } + // else if (info.quality <= 50) { + // preset = ASTCENC_PRE_MEDIUM; + // } + // else if (info.quality < 90) { + // preset = ASTCENC_PRE_THOROUGH; + // } + // else { + // preset = ASTCENC_PRE_EXHAUSTIVE; + // } astcenc_config config; astcenc_error error = astcenc_config_init( @@ -3024,19 +3038,19 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, // pull from rgb = y, a = x, to match up with astcenc internals if (channelType == kChannelTypeOneR1) { config.cw_r_weight = 1.0f; - config.cw_g_weight = 0.0f; // rgb same value + config.cw_g_weight = 0.0f; // rgb same value config.cw_b_weight = 0.0f; - config.cw_a_weight = 0.0f; // set to 0 to indicate alpha error doesn't matter (always 1) + config.cw_a_weight = 0.0f; // set to 0 to indicate alpha error doesn't matter (always 1) } else if (channelType == kChannelTypeTwoAG) { config.cw_r_weight = 1.0f; - config.cw_g_weight = 0.0f; // rgb same value + config.cw_g_weight = 0.0f; // rgb same value config.cw_b_weight = 0.0f; config.cw_a_weight = 1.0f; } else if (channelType == kChannelTypeTwoNormalAG) { config.cw_r_weight = 1.0f; - config.cw_g_weight = 0.0f; // rgb same value + config.cw_g_weight = 0.0f; // rgb same value config.cw_b_weight = 0.0f; config.cw_a_weight = 1.0f; } @@ -3060,7 +3074,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, astcenc_image srcImage; srcImage.dim_x = w; srcImage.dim_y = h; - srcImage.dim_z = 1; // Not using 3D blocks, not supported on iOS + srcImage.dim_z = 1; // Not using 3D blocks, not supported on iOS //srcImage.dim_pad = 0; // data is triple-pointer so it can work with 3d textures, but only @@ -3123,7 +3137,7 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, error = astcenc_compress_image( codec_context, &srcImage, &swizzleEncode, outputTexture.data.data(), mipStorageSize, - 0); // threadIndex + 0); // threadIndex #endif // Or should this context only be freed after all mips? @@ -3140,4 +3154,4 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, return false; } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramImage.h b/libkram/kram/KramImage.h index 7de5ea2d..05d8eb02 100644 --- a/libkram/kram/KramImage.h +++ b/libkram/kram/KramImage.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
@@ -7,15 +7,15 @@ //#include //#include -#include "KTXImage.h" // for MyMTLTextureType +#include "KTXImage.h" // for MyMTLTextureType //#include "KramConfig.h" #include "KramImageInfo.h" #include "KramMipper.h" namespace kram { -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; class Mipper; class KTXHeader; @@ -42,11 +42,11 @@ class Image { bool loadImageFromPixels(const vector& pixels, int32_t width, int32_t height, bool hasColor, bool hasAlpha); - + // set state off png blocks void setSrgbState(bool isSrgb, bool hasSrgbBlock, bool hasNonSrgbBlocks); void setBackgroundState(bool hasBlackBackground) { _hasBlackBackground = hasBlackBackground; } - + // convert mip level of explicit format to single-image bool loadImageFromKTX(const KTXImage& image, uint32_t mipNumber = 0); @@ -72,9 +72,9 @@ class Image { bool isSrgb() const { return _isSrgb; } bool hasSrgbBlock() const { return _hasSrgbBlock; } bool hasNonSrgbBlocks() const { return _hasNonSrgbBlocks; } - + bool hasBlackBackground() const { return _hasBlackBackground; } - + // if converted a KTX/2 image to Image, then this field will be non-zero uint32_t chunksY() const { return _chunksY; } void setChunksY(uint32_t chunksY) { _chunksY = chunksY; } @@ -100,10 +100,10 @@ class Image { bool _isSrgb = false; bool _hasNonSrgbBlocks = false; bool _hasSrgbBlock = false; - + // track to fix Apple Finder previews that are always white background bool _hasBlackBackground = false; - + // this is the entire strip data, float version can be passed for HDR // sources always 4 channels RGBA for 8 and 32f data. 16f promoted to 32f. vector _pixels; @@ -115,7 +115,7 @@ class Image { class KramDecoderParams { public: - TexEncoder decoder = kTexEncoderUnknown; // will pick best available from format + TexEncoder decoder = kTexEncoderUnknown; // will pick best available from format bool isVerbose = false; string swizzleText; }; @@ -132,7 +132,7 @@ class KramDecoder { bool decodeBlocks( int32_t w, int32_t h, const uint8_t* blockData, uint32_t numBlocks, MyMTLPixelFormat blockFormat, - vector& dstPixels, // currently Color + vector& dstPixels, // currently Color const KramDecoderParams& params) const; private: @@ -155,7 +155,7 @@ class KramEncoder { // can save out to ktx2 directly, this can supercompress mips bool saveKTX2(const KTXImage& srcImage, const KTX2Compressor& compressor, FILE* dstFile) const; - + private: bool encodeImpl(ImageInfo& info, Image& singleImage, FILE* dstFile, KTXImage& dstImage) const; @@ -187,4 +187,4 @@ class KramEncoder { void addBaseProps(const ImageInfo& info, KTXImage& dstImage) const; }; -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramImageInfo.cpp b/libkram/kram/KramImageInfo.cpp index 70c20bd0..cc8fdb1c 100644 --- a/libkram/kram/KramImageInfo.cpp +++ b/libkram/kram/KramImageInfo.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
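Aside: a hedged usage sketch of the KramDecoder block API declared above. The vector element type (Color) and the meaning of the size argument (byte length of the block data, as used in the decodeBlocks implementation earlier in this diff) are assumptions; the BC7 format choice is illustrative:

#include "KramImage.h"

using namespace kram;

bool decodeBC7Blocks(const uint8_t* blockData, uint32_t blockDataSize,
                     int32_t w, int32_t h, vector<Color>& pixels)
{
    KramDecoderParams params;
    params.isVerbose = false;
    // params.decoder stays kTexEncoderUnknown, letting kram pick the best available

    KramDecoder decoder;
    return decoder.decodeBlocks(w, h, blockData, blockDataSize,
                                MyMTLPixelFormatBC7_RGBAUnorm, pixels, params);
}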
@@ -12,8 +12,8 @@ #endif namespace kram { -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; #define isStringEqual(lhs, rhs) (strcmp(lhs, rhs) == 0) @@ -36,9 +36,9 @@ MyMTLTextureType parseTextureType(const char* typeName) else if (isStringEqual(typeName, "2d")) { type = MyMTLTextureType2D; } - // else if (isStringEqual(typeName, "1d")) { - // type = MyMTLTextureType1D; - // } + // else if (isStringEqual(typeName, "1d")) { + // type = MyMTLTextureType1D; + // } else if (isStringEqual(typeName, "cube")) { type = MyMTLTextureTypeCube; } @@ -62,11 +62,11 @@ TexEncoder parseEncoder(const char* encoder) textureEncoder = kTexEncoderBcenc; } else if (isStringEqual(encoder, - "ate")) { // platform specific, no sources + "ate")) { // platform specific, no sources textureEncoder = kTexEncoderATE; } else if (isStringEqual(encoder, - "astcenc")) { // platform specific, no sources + "astcenc")) { // platform specific, no sources textureEncoder = kTexEncoderAstcenc; } @@ -77,16 +77,16 @@ static MyMTLPixelFormat parseFormat(ImageInfoArgs& infoArgs) { MyMTLPixelFormat format = MyMTLPixelFormatInvalid; const char* formatString = infoArgs.formatString.c_str(); - + bool isSRGBDst = infoArgs.isSRGBDst; - + // bc if (isStringEqual(formatString, "bc1")) { format = isSRGBDst ? MyMTLPixelFormatBC1_RGBA_sRGB : MyMTLPixelFormatBC1_RGBA; } - // else if (isStringEqual(formatString, "bc2")) { - // format = MyMTLPixelFormatBC2_RGBA; - // } + // else if (isStringEqual(formatString, "bc2")) { + // format = MyMTLPixelFormatBC2_RGBA; + // } else if (isStringEqual(formatString, "bc3")) { format = isSRGBDst ? MyMTLPixelFormatBC3_RGBA_sRGB : MyMTLPixelFormatBC3_RGBA; } @@ -113,7 +113,7 @@ static MyMTLPixelFormat parseFormat(ImageInfoArgs& infoArgs) else if (isStringEqual(formatString, "etc2rgb")) { format = isSRGBDst ? MyMTLPixelFormatETC2_RGB8_sRGB : MyMTLPixelFormatETC2_RGB8; } - else if (isStringEqual(formatString, "etc2rgba")) { // for rgb/rgba + else if (isStringEqual(formatString, "etc2rgba")) { // for rgb/rgba format = isSRGBDst ? MyMTLPixelFormatEAC_RGBA8_sRGB : MyMTLPixelFormatEAC_RGBA8; } @@ -121,16 +121,20 @@ static MyMTLPixelFormat parseFormat(ImageInfoArgs& infoArgs) // or RGBA to save endpoint storage dual plane can occur for more than just // RGB+A, any one channel can be a plane to itself if encoder supports else if (isStringEqual(formatString, "astc4x4")) { - format = infoArgs.isHDR ? MyMTLPixelFormatASTC_4x4_HDR : isSRGBDst ? MyMTLPixelFormatASTC_4x4_sRGB : MyMTLPixelFormatASTC_4x4_LDR; + format = infoArgs.isHDR ? MyMTLPixelFormatASTC_4x4_HDR : isSRGBDst ? MyMTLPixelFormatASTC_4x4_sRGB + : MyMTLPixelFormatASTC_4x4_LDR; } else if (isStringEqual(formatString, "astc5x5")) { - format = infoArgs.isHDR ? MyMTLPixelFormatASTC_5x5_HDR : isSRGBDst ? MyMTLPixelFormatASTC_5x5_sRGB : MyMTLPixelFormatASTC_5x5_LDR; + format = infoArgs.isHDR ? MyMTLPixelFormatASTC_5x5_HDR : isSRGBDst ? MyMTLPixelFormatASTC_5x5_sRGB + : MyMTLPixelFormatASTC_5x5_LDR; } else if (isStringEqual(formatString, "astc6x6")) { - format = infoArgs.isHDR ? MyMTLPixelFormatASTC_6x6_HDR : isSRGBDst ? MyMTLPixelFormatASTC_6x6_sRGB : MyMTLPixelFormatASTC_6x6_LDR; + format = infoArgs.isHDR ? MyMTLPixelFormatASTC_6x6_HDR : isSRGBDst ? MyMTLPixelFormatASTC_6x6_sRGB + : MyMTLPixelFormatASTC_6x6_LDR; } else if (isStringEqual(formatString, "astc8x8")) { - format = infoArgs.isHDR ? MyMTLPixelFormatASTC_8x8_HDR : isSRGBDst ? 
MyMTLPixelFormatASTC_8x8_sRGB : MyMTLPixelFormatASTC_8x8_LDR; + format = infoArgs.isHDR ? MyMTLPixelFormatASTC_8x8_HDR : isSRGBDst ? MyMTLPixelFormatASTC_8x8_sRGB + : MyMTLPixelFormatASTC_8x8_LDR; } // explicit formats @@ -143,7 +147,7 @@ static MyMTLPixelFormat parseFormat(ImageInfoArgs& infoArgs) format = // isSRGBDst ? MyMTLPixelFormatRG8Unorm_sRGB : MyMTLPixelFormatRG8Unorm; } - else if (isStringEqual(formatString, "rgba8")) { // for rgb/rgba + else if (isStringEqual(formatString, "rgba8")) { // for rgb/rgba format = isSRGBDst ? MyMTLPixelFormatRGBA8Unorm_sRGB : MyMTLPixelFormatRGBA8Unorm; } @@ -153,7 +157,7 @@ static MyMTLPixelFormat parseFormat(ImageInfoArgs& infoArgs) else if (isStringEqual(formatString, "rg16f")) { format = MyMTLPixelFormatRG16Float; } - else if (isStringEqual(formatString, "rgba16f")) { // for rgb/rgba + else if (isStringEqual(formatString, "rgba16f")) { // for rgb/rgba format = MyMTLPixelFormatRGBA16Float; } @@ -163,7 +167,7 @@ static MyMTLPixelFormat parseFormat(ImageInfoArgs& infoArgs) else if (isStringEqual(formatString, "rg32f")) { format = MyMTLPixelFormatRG32Float; } - else if (isStringEqual(formatString, "rgba32f")) { // for rgb/rgba + else if (isStringEqual(formatString, "rgba32f")) { // for rgb/rgba format = MyMTLPixelFormatRGBA32Float; } @@ -345,7 +349,7 @@ static const MyMTLPixelFormat kEncodingFormatsBcenc[] = MyMTLPixelFormatBC6H_RGBUfloat, MyMTLPixelFormatBC6H_RGBFloat, #endif - + MyMTLPixelFormatBC7_RGBAUnorm, MyMTLPixelFormatBC7_RGBAUnorm_sRGB, }; @@ -564,11 +568,11 @@ bool validateFormatAndEncoder(ImageInfoArgs& infoArgs) // check arguments // flag unsupported formats -// if (format == MyMTLPixelFormatBC6H_RGBUfloat || -// format == MyMTLPixelFormatBC6H_RGBFloat) { -// KLOGE("ImageInfo", "bc6 not supported\n"); -// error = true; -// } + // if (format == MyMTLPixelFormatBC6H_RGBUfloat || + // format == MyMTLPixelFormatBC6H_RGBFloat) { + // KLOGE("ImageInfo", "bc6 not supported\n"); + // error = true; + // } infoArgs.pixelFormat = format; @@ -718,7 +722,7 @@ bool validateTextureType(MyMTLTextureType textureType, int32_t& w, int32_t& h, if (w != (int32_t)(h * numSlices)) { return false; } - w = h; // assume square + w = h; // assume square for (int32_t i = 0; i < (int32_t)numSlices; ++i) { Int2 chunkOffset = {w * i, 0}; @@ -731,7 +735,7 @@ bool validateTextureType(MyMTLTextureType textureType, int32_t& w, int32_t& h, if (h != (int32_t)(w * numSlices)) { return false; } - h = w; // assume square + h = w; // assume square for (int32_t i = 0; i < (int32_t)numSlices; ++i) { Int2 chunkOffset = {0, h * i}; @@ -788,7 +792,7 @@ bool validateTextureType(MyMTLTextureType textureType, int32_t& w, int32_t& h, return false; } - w = h; // assume square + w = h; // assume square for (int32_t i = 0; i < (int32_t)header.numberOfArrayElements; ++i) { Int2 chunkOffset = {w * i, 0}; chunkOffsets.push_back(chunkOffset); @@ -800,7 +804,7 @@ bool validateTextureType(MyMTLTextureType textureType, int32_t& w, int32_t& h, return false; } - h = w; // assume square + h = w; // assume square for (int32_t i = 0; i < (int32_t)header.numberOfArrayElements; ++i) { Int2 chunkOffset = {0, h * i}; chunkOffsets.push_back(chunkOffset); @@ -1011,19 +1015,19 @@ void ImageInfo::initWithArgs(const ImageInfoArgs& args) isPrezero = false; isPremultiplied = false; isSourcePremultiplied = false; - + if (args.isSourcePremultiplied) isSourcePremultiplied = true; else if (args.isPremultiplied) isPremultiplied = true; else if (args.isPrezero) isPrezero = true; - + isNormal = args.isNormal; doSDF = 
args.doSDF; sdfThreshold = args.sdfThreshold; - + //skipImageLength = args.skipImageLength; // mips @@ -1075,7 +1079,7 @@ void ImageInfo::initWithArgs(const ImageInfoArgs& args) isSRGBSrc = args.isSRGBSrc; isSRGBSrcFlag = args.isSRGBSrcFlag; isSRGBDst = isSrgbFormat(pixelFormat); - + hasAlpha = true; hasColor = true; if (!isAlphaFormat(pixelFormat)) @@ -1184,9 +1188,9 @@ void ImageInfo::initWithSourceImage(Image& sourceImage) // Note: srgb flags are unreliable in png since most tools use linear // RGBA8 blends and just write out the pixel as is (f.e. Photoshop, figma, etc). // TODO: offer mode to use srg image srgb state if author has fixed up - if (isSRGBSrcFlag) + if (isSRGBSrcFlag) isSRGBSrc = sourceImage.isSrgb(); - + // this implies color is stored in rgb if (isSRGBDst) { isColorWeighted = hasColor; @@ -1209,15 +1213,15 @@ void ImageInfo::initWithSourceImage(Image& sourceImage) // averaging all the values in 8-bit space, so only apply to lin. rgbs switch (pixelFormat) { - case MyMTLPixelFormatETC2_RGB8: // 3 channel - case MyMTLPixelFormatEAC_RGBA8: // 4 channel + case MyMTLPixelFormatETC2_RGB8: // 3 channel + case MyMTLPixelFormatEAC_RGBA8: // 4 channel case MyMTLPixelFormatASTC_4x4_LDR: case MyMTLPixelFormatASTC_5x5_LDR: case MyMTLPixelFormatASTC_6x6_LDR: case MyMTLPixelFormatASTC_8x8_LDR: - case MyMTLPixelFormatBC1_RGBA: // 3 channel RGB only + case MyMTLPixelFormatBC1_RGBA: // 3 channel RGB only case MyMTLPixelFormatBC3_RGBA: // case MyMTLPixelFormatBC6H_RGBFloat: // case MyMTLPixelFormatBC6H_RGBUfloat: @@ -1275,8 +1279,8 @@ void ImageInfo::heightToNormals(int32_t w, int32_t h, // 2.0 is distance betwen +1 and -1 // don't scale by this, want caller to be able to pass 1.0 as default scale not 2.0 - float scaleX = scale; // / 2.0; - float scaleY = scale; // / 2.0; + float scaleX = scale; // / 2.0; + float scaleY = scale; // / 2.0; if (!isFloat) { scaleX /= 255.0f; @@ -1366,7 +1370,7 @@ void ImageInfo::heightToNormals(int32_t w, int32_t h, else { // cross pattern // height channel is in x - uint8_t cN = srcPixels8[ym + x].r; // assumes first elem (.r) is height channel + uint8_t cN = srcPixels8[ym + x].r; // assumes first elem (.r) is height channel uint8_t cS = srcPixels8[yp + x].r; uint8_t cE = srcPixels8[y0 + xp].r; uint8_t cW = srcPixels8[y0 + xm].r; @@ -1418,8 +1422,8 @@ const char* encoderName(TexEncoder encoder) case kTexEncoderUnknown: return "Unknown"; default: - return "Unknown"; // to fix Visual Studio C4715 + return "Unknown"; // to fix Visual Studio C4715 } } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramImageInfo.h b/libkram/kram/KramImageInfo.h index fafe74b5..5ced8c8a 100644 --- a/libkram/kram/KramImageInfo.h +++ b/libkram/kram/KramImageInfo.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
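The validateTextureType() hunks above carve a single strip image into square chunks for arrays, cubes, and volumes: an N-slice horizontal strip must be exactly N squares wide, after which the width collapses to the chunk size and each chunk offset is simply i * h along x (the vertical-strip case mirrors this along y). The sketch below restates that layout rule in isolation; the Int2 struct and std::vector stand in for kram's own SIMD Int2 and STL_NAMESPACE vector.

```cpp
#include <cstdint>
#include <vector>

struct Int2 { int32_t x, y; }; // stand-in for kram's SIMD Int2

// Horizontal strip of numSlices square chunks: width must equal height * numSlices.
// On success, w is reduced to the per-chunk size and one offset is emitted per slice.
bool buildHorizontalStripOffsets(int32_t& w, int32_t h, int32_t numSlices,
                                 std::vector<Int2>& chunkOffsets)
{
    if (w != h * numSlices)
        return false;

    w = h; // each chunk is square
    for (int32_t i = 0; i < numSlices; ++i)
        chunkOffsets.push_back({w * i, 0});

    return true;
}
```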
@@ -9,28 +9,28 @@ //#include "KramConfig.h" #include "KTXImage.h" -#include "KramMipper.h" // for Color +#include "KramMipper.h" // for Color namespace kram { class Image; -using namespace simd; -using namespace NAMESPACE_STL; +using namespace SIMD_NAMESPACE; +using namespace STL_NAMESPACE; // each encoder has it's own set of outputs, can request encoder if overlap enum TexEncoder { - kTexEncoderUnknown = 0, // pick best encoder + kTexEncoderUnknown = 0, // pick best encoder - kTexEncoderExplicit, // r,rg,rgba 8|16f|32f + kTexEncoderExplicit, // r,rg,rgba 8|16f|32f - kTexEncoderATE, // bc1,3,4,5,7, and astc4x4,8x8 (macOS/iOS only), - // different lib versions and support based on OS version + kTexEncoderATE, // bc1,3,4,5,7, and astc4x4,8x8 (macOS/iOS only), + // different lib versions and support based on OS version - kTexEncoderSquish, // bc1,2,3,4,5 + kTexEncoderSquish, // bc1,2,3,4,5 - kTexEncoderBcenc, // bc1,3,4,5,7 + kTexEncoderBcenc, // bc1,3,4,5,7 - kTexEncoderEtcenc, // etc-r,rg11, etc2, no HDR format + kTexEncoderEtcenc, // etc-r,rg11, etc2, no HDR format kTexEncoderAstcenc, }; @@ -41,28 +41,28 @@ class ImageInfoArgs { MyMTLTextureType textureType = MyMTLTextureType2D; TexEncoder textureEncoder = kTexEncoderUnknown; MyMTLPixelFormat pixelFormat = MyMTLPixelFormatInvalid; - string formatString = ""; // will convert to pixelFormat + string formatString = ""; // will convert to pixelFormat int32_t mipMinSize = 1; int32_t mipMaxSize = 32 * 1024; int32_t mipSkip = 0; - int32_t quality = 49; // may want float + int32_t quality = 49; // may want float // ktx2 has a compression type and level KTX2Compressor compressor; bool isKTX2 = false; - bool doMipmaps = true; // default to mips on + bool doMipmaps = true; // default to mips on bool doMipflood = false; bool isVerbose = false; bool doSDF = false; - + bool isSourcePremultiplied = false; // skip further premul of src bool isPremultiplied = false; bool isPrezero = false; - - bool isNormal = false; // signed, but may be stored unorm and swizzled (f.e. astc/bc3nm gggr or rrrg) + + bool isNormal = false; // signed, but may be stored unorm and swizzled (f.e. astc/bc3nm gggr or rrrg) // can pick a smaller format if alpha = 1 (only for bc and etc) bool optimizeFormatForOpaque = false; @@ -73,10 +73,10 @@ class ImageInfoArgs { bool isSRGBSrc = false; bool isSRGBSrcFlag = false; bool isSRGBDst = false; - + // For dst. TODO: could have signed source passed in bool isSigned = false; - + // Applies to src. But also have hdr specific output formats. 
bool isHDR = false; @@ -92,7 +92,7 @@ class ImageInfoArgs { int32_t chunksX = 0; int32_t chunksY = 0; int32_t chunksCount = 0; - + int32_t sdfThreshold = 120; }; @@ -142,14 +142,14 @@ class ImageInfo { bool hasAlpha = false; bool isSRGBSrc = false; bool isSRGBSrcFlag = false; - + // output image state bool isSRGBDst = false; bool isSigned = false; bool isNormal = false; bool isColorWeighted = false; bool isSourcePremultiplied = false; - bool isPremultiplied = false; // don't premul + bool isPremultiplied = false; // don't premul bool isPrezero = false; bool isHDR = false; @@ -157,7 +157,7 @@ class ImageInfo { bool doMipmaps = false; bool doMipflood = false; bool optimizeFormatForOpaque = false; - + bool isVerbose = false; // compression format @@ -183,12 +183,12 @@ class ImageInfo { int32_t mipMinSize = 1; int32_t mipMaxSize = 32 * 1024; - int32_t mipSkip = 0; // count of large mips to skip + int32_t mipSkip = 0; // count of large mips to skip int32_t chunksX = 0; int32_t chunksY = 0; int32_t chunksCount = 0; - + // This converts incoming image channel to bitmap int32_t sdfThreshold = 120; }; @@ -215,4 +215,4 @@ bool isEncoderAvailable(TexEncoder encoder); const char* encoderName(TexEncoder encoder); -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramLib.h b/libkram/kram/KramLib.h index 6739d68b..7eb3dc8f 100644 --- a/libkram/kram/KramLib.h +++ b/libkram/kram/KramLib.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -13,6 +13,7 @@ #include "KTXImage.h" #include "Kram.h" #include "KramFileHelper.h" +#include "KramFileIO.h" #include "KramImage.h" #include "KramImageInfo.h" #include "KramLog.h" diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index c3227d91..fb0ccb7f 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
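The ImageInfoArgs fields above are the user-facing knobs that parseFormat() and validateFormatAndEncoder() consume before ImageInfo::initWithArgs()/initWithSourceImage() bake them into encoder state. The sketch below strings those calls together using only the names visible in this patch; the call site itself, the chosen format string, and the error handling are illustrative assumptions rather than code from the repo.

```cpp
#include "KramLib.h" // umbrella header; this patch also adds KramFileIO.h to it

using namespace kram;

// Sketch: configure an ASTC 4x4 sRGB encode with mips, assuming validateFormatAndEncoder()
// resolves formatString into pixelFormat as the KramImageInfo.cpp hunks above show.
bool setupEncode(Image& sourceImage, ImageInfo& info)
{
    ImageInfoArgs args;
    args.formatString = "astc4x4"; // parsed into pixelFormat
    args.isSRGBDst = true;         // picks the _sRGB variant during parsing
    args.doMipmaps = true;
    args.quality = 49;             // default shown above

    if (!validateFormatAndEncoder(args))
        return false;

    info.initWithArgs(args);
    info.initWithSourceImage(sourceImage);
    return true;
}
```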
@@ -6,7 +6,7 @@ //#include -#if KRAM_IOS || KRAM_MAC +#if KRAM_APPLE #define KRAM_LOG_STACKTRACE KRAM_DEBUG #elif KRAM_WIN // TODO: need to debug code before enabling @@ -21,9 +21,13 @@ #include #if KRAM_WIN -#include #include // for AddressOfReturnAdress, ReturnAddress +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif +#include + #if KRAM_LOG_STACKTRACE // There is a DbgHelp.lib that is redistributable #include @@ -33,84 +37,81 @@ #elif KRAM_ANDROID #include -#elif KRAM_IOS || KRAM_MAC -#include +#elif KRAM_APPLE #include // demangle -#include // address to symbol +#include // address to symbol #include +#include #endif #include "KramFmt.h" #include "KramTimer.h" +#include "TaskSystem.h" + +//#if !KRAM_VISION // this is breaking link on visionOS #include "format.h" // really fmt/format.h +//#endif namespace kram { -// Pulled in from TaskSystem.cpp -constexpr const uint32_t kMaxThreadName = 32; -extern void getCurrentThreadName(char name[kMaxThreadName]); - using mymutex = std::recursive_mutex; using mylock = std::unique_lock; -using namespace NAMESPACE_STL; - +using namespace STL_NAMESPACE; #if KRAM_WIN // https://stackoverflow.com/questions/18547251/when-i-use-strlcpy-function-in-c-the-compilor-give-me-an-error // '_cups_strlcat()' - Safely concatenate two strings. -size_t // O - Length of string -strlcat(char *dst, // O - Destination string - const char *src, // I - Source string - size_t size) // I - Size of destination string buffer +size_t // O - Length of string +strlcat(char* dst, // O - Destination string + const char* src, // I - Source string + size_t size) // I - Size of destination string buffer { - size_t srclen; // Length of source string - size_t dstlen; // Length of destination string + size_t srclen; // Length of source string + size_t dstlen; // Length of destination string + // Figure out how much room is left... + dstlen = strlen(dst); + size -= dstlen + 1; - // Figure out how much room is left... - dstlen = strlen(dst); - size -= dstlen + 1; + if (!size) + return (dstlen); // No room, return immediately... - if (!size) - return (dstlen); // No room, return immediately... + // Figure out how much room is needed... + srclen = strlen(src); - // Figure out how much room is needed... - srclen = strlen(src); + // Copy the appropriate amount... + if (srclen > size) + srclen = size; - // Copy the appropriate amount... - if (srclen > size) - srclen = size; + memcpy(dst + dstlen, src, srclen); + dst[dstlen + srclen] = '\0'; - memcpy(dst + dstlen, src, srclen); - dst[dstlen + srclen] = '\0'; - - return (dstlen + srclen); + return (dstlen + srclen); } // '_cups_strlcpy()' - Safely copy two strings. -size_t // O - Length of string -strlcpy(char *dst, // O - Destination string - const char *src, // I - Source string - size_t size) // I - Size of destination string buffer +size_t // O - Length of string +strlcpy(char* dst, // O - Destination string + const char* src, // I - Source string + size_t size) // I - Size of destination string buffer { - size_t srclen; // Length of source string - + size_t srclen; // Length of source string - // Figure out how much room is needed... - size --; + // Figure out how much room is needed... + size--; - srclen = strlen(src); + srclen = strlen(src); - // Copy the appropriate amount... - if (srclen > size) - srclen = size; + // Copy the appropriate amount... 
+ if (srclen > size) + srclen = size; - memcpy(dst, src, srclen); - dst[srclen] = '\0'; + memcpy(dst, src, srclen); + dst[srclen] = '\0'; - return (srclen); + return (srclen); } #endif @@ -118,41 +119,39 @@ strlcpy(char *dst, // O - Destination string #if KRAM_WIN // https://learn.microsoft.com/en-us/windows/win32/debug/retrieving-symbol-information-by-address?redirectedfrom=MSDN -class AddressHelper -{ +class AddressHelper { private: - HANDLE m_process = 0; - + public: AddressHelper() { m_process = GetCurrentProcess(); - + // produces line number and demangles name SymSetOptions(SYMOPT_LOAD_LINES | SYMOPT_UNDNAME); - + // load the symbols SymInitialize(m_process, NULL, TRUE); } - + ~AddressHelper() { SymCleanup(m_process); } - + bool isStackTraceSupported() const { return true; } - + bool getAddressInfo(const void* address, string& symbolName, string& filename, uint32_t& line) { string.clear(); filename.clear(); line = 0; - + IMAGEHLP_LINE64 loc = {}; loc.SizeOfStruct = sizeof(IMAGEHLP_LINE64); - DWORD displacement = 0; - + DWORD displacement = 0; + // This grabs the symbol name char buffer[sizeof(SYMBOL_INFO) + MAX_SYM_NAME * sizeof(TCHAR)] = {}; SYMBOL_INFO& symbol = *(SYMBOL_INFO*)buffer; @@ -160,38 +159,38 @@ class AddressHelper symbol.MaxNameLen = MAX_SYM_NAME; SymFromAddr(process, (ULONG64)address, &displacement, &symbol); symbolName = symbol.Name; - + // all demangle ops are single-threaded, so run this under log mutex if (!SymGetLineFromAddr64(m_process, (DWORD64)address, &displacement, &loc)) return false; - + filename = loc.Filename; line = loc.LineNumber; return true; } - + void getStackInfo(string& stackInfo, uint32_t skipStackLevels) { string symbolName, filename; uint32_t line = 0; - + const uint32_t kMaxStackTrace = 128; void* stacktrace[kMaxStackTrace] = {}; - + // Can use this hash to uniquely identify stacks that are the same ULONG stackTraceHash = 0; - + // This provides the symbols uint32_t frameCount = CaptureStackBackTrace(skipStackLevels, kMaxStackTrace, stacktrace, &stackTraceHash); - - for(uint32_t i = 0; i < frameCount; ++i) { + + for (uint32_t i = 0; i < frameCount; ++i) { if (getAddressInfo(stacktrace[i], symbolName, filename, line)) append_sprintf(stackInfo, "%s:%u: %s", filename.c_str(), line, symbolName.c_str()); - + // Note: can get Module with a different call if above fails } } - + // See here on using StackWalk64 to walk up the SEH context. // https://stackoverflow.com/questions/22467604/how-can-you-use-capturestackbacktrace-to-capture-the-exception-stack-not-the-ca }; @@ -211,7 +210,6 @@ class AddressHelper // here's nm on osx // https://opensource.apple.com/source/cctools/cctools-622.5.1/misc/nm.c - // The dladdr() function is available only in dynamically linked programs. 
// #include // int dladdr(const void *addr, Dl_info *info); @@ -220,52 +218,53 @@ class AddressHelper // Here's supposed to be code that deals with static libs too // https://stackoverflow.com/questions/19848567/how-to-get-the-build-uuid-in-runtime-and-the-image-base-address/19859516#19859516 -class AddressHelper -{ +class AddressHelper { private: - void substr(string& str, const char* start, const char* end) { + void substr(string& str, const char* start, const char* end) + { str = str.substr(start - str.c_str(), end - start); } - - const char* strrchr(const char* start, const char* end, char c) { + + const char* strrchr(const char* start, const char* end, char c) + { while (end > start) { end--; if (*end == c) return end; } - + return nullptr; } - + void demangleSymbol(string& symbolName) { size_t size = 0; int status = 0; - + // This one is getting chopped up incorrect // 10 AppKit 0x0000000193079730 __24-[NSViewController view]_block_invoke + 28 - + // Some other examples // 14 AppKit 0x0000000192c7b230 NSPerformVisuallyAtomicChange + 108 // 24 kramv 0x0000000104c6b4e0 main + 76 - + const char* text = symbolName.c_str(); // chop off the "+ 132" offset const char* plusOffsetEnd = strstr(text, " +"); const char* objCStart = strstr(text, " -"); if (!objCStart) objCStart = strstr(text, " __24-"); // block invoke - + const char* cppStart = strstr(text, " _ZN4"); const char* spaceStart = plusOffsetEnd ? strrchr(text, plusOffsetEnd, ' ') : nullptr; - + if (objCStart) - substr(symbolName, objCStart+1, plusOffsetEnd); + substr(symbolName, objCStart + 1, plusOffsetEnd); else if (cppStart) - substr(symbolName, cppStart+1, plusOffsetEnd); + substr(symbolName, cppStart + 1, plusOffsetEnd); else if (spaceStart) - substr(symbolName, spaceStart+1, plusOffsetEnd); - + substr(symbolName, spaceStart + 1, plusOffsetEnd); + // Note: some objC does need demangle if (cppStart) { // This allocates memory using malloc @@ -278,83 +277,82 @@ class AddressHelper } } } - + public: bool isStackTraceSupported() const { return true; } - + bool getAddressInfo(const void* address, string& symbolName, string& filename, uint32_t& line) { - void* callstack[1] = { (void*)address }; - + void* callstack[1] = {(void*)address}; + // this allocates memory char** strs = backtrace_symbols(callstack, 1); - + // may need -no_pie to turn off ASLR, also don't reuse stack-frame reg // Will have to parse symbolName, filename, line symbolName = strs[0]; - + free(strs); - + // TODO: figure out file/line lookup, don't want to fire nm/addr2line process each time // Those are GPL poison into codebases. But they no doubt do a ton // or work on each launch to then lookup 1+ symbols. - // Apple doesn't even have addr2line, and needs to use atos. + // Apple doesn't even have addr2line, and needs to use atos. // But atos doesn't exist except on dev systems. // atos goes to private framework CoreSymbolication. Ugh. // There is also boost::stack_trace which does gen a valid stack somehow. 
- + // CoreSymbolicate might have calls // https://opensource.apple.com/source/xnu/xnu-3789.21.4/tools/tests/darwintests/backtracing.c.auto.html - + // https://developer.apple.com/documentation/xcode/adding-identifiable-symbol-names-to-a-crash-report // https://developer.apple.com/documentation/xcode/analyzing-a-crash-report - + // Note: this can provide the file/line, but requires calling out to external process // also nm and addr2line // posix_spawn("atos -o kramv.app.dSYM/Contents/Resources/DWARF/kramv -arch arm64 -l %p", address); - + filename.clear(); line = 0; - + demangleSymbol(symbolName); - + return true; } - + void getStackInfo(string& stackInfo, uint32_t skipStackLevels) { void* callstack[128]; uint32_t frames = backtrace(callstack, 128); - + // Also this call, but can't use it to lookup a symbol, and it's ObjC. // but it just returns the same data as below (no file/line). // +[NSThread callStackSymbols] - + // backtrace_symbols() attempts to transform a call stack obtained by // backtrace() into an array of human-readable strings using dladdr(). char** strs = backtrace_symbols(callstack, frames); string symbolName; for (uint32_t i = skipStackLevels; i < frames; ++i) { symbolName = strs[i]; - + demangleSymbol(symbolName); - - append_sprintf(stackInfo, "[%2u] ", i-skipStackLevels); + + append_sprintf(stackInfo, "[%2u] ", i - skipStackLevels); stackInfo += symbolName; stackInfo += "\n"; } - + free(strs); } - + // nm is typically used to decode, but that's an executable }; #endif #else -class AddressHelper -{ +class AddressHelper { public: bool isStackTraceSupported() const { return false; } bool getAddressInfo(const void* address, string& symbolName, string& filename, uint32_t& line) { return false; } @@ -365,7 +363,6 @@ class AddressHelper static AddressHelper gAddressHelper; - // TODO: install assert handler to intercept, and also add a verify (assert that leaves source in) //void __assert(const char *expression, const char *file, int32_t line) { // @@ -435,8 +432,22 @@ int32_t append_sprintf(string& str, const char* format, ...) return len; } +string format(const char* format, ...) 
+{ + string str; + + va_list args; + va_start(args, format); + /*int32_t len = */ vsprintf(str, format, args); + va_end(args); + + return str; +} + //---------------------------------- +#if 1 // !KRAM_VISION // this is breaking link on visionOS + static size_t my_formatted_size(fmt::string_view format, fmt::format_args args) { auto buf = fmt::detail::counting_buffer<>(); @@ -448,14 +459,14 @@ static size_t my_formatted_size(fmt::string_view format, fmt::format_args args) int32_t append_sprintf_impl(string& str, fmt::string_view format, fmt::format_args args) { size_t size = my_formatted_size(format, args); - + // TODO: write directly to end of str string text = vformat(format, args); - + // this does all formatting work str.resize(str.size() + size); str.insert(str.back(), text); - + return size; // how many chars appended, no real failure case yet } @@ -466,6 +477,8 @@ int32_t sprintf_impl(string& str, fmt::string_view format, fmt::format_args args return append_sprintf_impl(str, format, args); } +#endif + //---------------------------------- bool startsWith(const char* str, const string& substring) @@ -484,12 +497,12 @@ bool endsWith(const string& value, const string& ending) if (value.size() < ending.size()) return false; uint32_t start = value.size() - ending.size(); - + for (uint32_t i = 0; i < ending.size(); ++i) { if (value[start + i] != ending[i]) return false; } - + return true; } @@ -513,32 +526,31 @@ inline void OutputDebugStringU(LPCSTR lpOutputString, uint32_t len8) { // empty string if (len8 == 0) return; - + // Run the conversion twice, first to get length, then to do the conversion int len16 = MultiByteToWideChar(CP_UTF8, 0, lpOutputString, (int)len8, nullptr, 0); - + // watch out for large len16 - if (len16 == 0 || len16 > 128*1024) return; - + if (len16 == 0 || len16 > 128 * 1024) return; + wchar_t* strWide = (wchar_t*)_malloca(len16 * sizeof(wchar_t)); - + // ran out of stack if (!strWide) return; - + MultiByteToWideChar(CP_UTF8, 0, lpOutputString, (int)len8, strWide, len16); - + ULONG_PTR args[4] = { (ULONG_PTR)len16 + 1, (ULONG_PTR)strWide, - (ULONG_PTR)len8 + 1, (ULONG_PTR)lpOutputString - }; - + (ULONG_PTR)len8 + 1, (ULONG_PTR)lpOutputString}; + // TODO: note that there is a limit to the length of this string // so may want to split up the string in a loop. - + RaiseException(0x4001000A, 0, 4, args); // DBG_PRINTEXCEPTION_WIDE_C - + _freea(strWide); - + // Can't use OutputDebugStringW. // OutputDebugStringW converts the specified string based on the current system // locale information and passes it to OutputDebugStringA to be displayed. 
As a @@ -548,14 +560,13 @@ inline void OutputDebugStringU(LPCSTR lpOutputString, uint32_t len8) #endif -struct LogState -{ +struct LogState { mymutex lock; string errorLogCaptureText; string buffer; bool isErrorLogCapture = false; uint32_t counter = 0; - + #if KRAM_WIN bool isWindowsGuiApp = false; // default isConsole bool isWindowsDebugger = false; @@ -586,29 +597,27 @@ void getErrorLogCaptureText(string& text) } } -struct LogMessage -{ +struct LogMessage { const char* group; int32_t logLevel; - + // from macro const char* file; int32_t line; const char* func; - + // embelished const char* threadName; double timestamp; - + void* dso; void* returnAddress; - + const char* msg; bool msgHasNewline; }; -enum DebuggerType -{ +enum DebuggerType { DebuggerOutputDebugString, DebuggerOSLog, DebuggerLogcat, @@ -617,42 +626,48 @@ enum DebuggerType constexpr const uint32_t kMaxTokens = 32; -static const char* getFormatTokens(char tokens[kMaxTokens], const LogMessage& msg, DebuggerType type) +static const char* getFormatTokens(char tokens[kMaxTokens], const LogMessage& msg, DebuggerType type) { #if KRAM_WIN if (msg.logLevel <= LogLevelInfo) { strlcpy(tokens, "m\n", kMaxTokens); } else if (msg.file) { - strlcpy(tokens, "[l] g m\n" "F: L: t u\n", kMaxTokens); + strlcpy(tokens, + "[l] g m\n" + "F: L: t u\n", + kMaxTokens); } else { strlcpy(tokens, "[l] g m\n", kMaxTokens); } #elif KRAM_ANDROID // Android logcat has level, tag, file/line passed in the mesasge - strlcpy(tokens, "m\n", kMaxTokens); + strlcpy(tokens, "m\n", kMaxTokens); #else // copy of formatters above if (msg.logLevel <= LogLevelInfo) { strlcpy(tokens, "m\n", kMaxTokens); } else if (msg.file) { - strlcpy(tokens, "[l] g m\n" "F: L: t u\n", kMaxTokens); + strlcpy(tokens, + "[l] g m\n" + "F: L: t u\n", + kMaxTokens); } else { strlcpy(tokens, "[l] g m\n", kMaxTokens); } - + bool printStacksForErrors = false; if (printStacksForErrors && gAddressHelper.isStackTraceSupported() && msg.logLevel >= LogLevelError) { - // can just report the caller, and not a full stack // already have function, so returnAddress printing is the same. /* if (msg.returnAddress) { strlcat(tokens, "s\n", kMaxTokens); } - else */ { + else */ + { strlcat(tokens, "S", kMaxTokens); } } @@ -660,14 +675,13 @@ static const char* getFormatTokens(char tokens[kMaxTokens], const LogMessage& ms return tokens; } - static void formatMessage(string& buffer, const LogMessage& msg, const char* tokens) { buffer.clear(); - + char c = 0; while ((c = *tokens++) != 0) { - switch(c) { + switch (c) { case ' ': case ':': case '[': @@ -675,12 +689,12 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok case '\n': buffer += c; break; - + case 'l': case 'L': { // level bool isVerbose = c == 'L'; const char* level = ""; - switch(msg.logLevel) { + switch (msg.logLevel) { case LogLevelDebug: level = isVerbose ? 
"debug" : "D"; break; @@ -701,7 +715,7 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok buffer += msg.group; break; } - + case 's': { // return address (1 line stack) if (msg.returnAddress) { string symbolName, filename; @@ -723,19 +737,19 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok if (msg.func) { buffer += msg.func; int32_t len = (int32_t)strlen(msg.func); - if (len > 1 && msg.func[len-1] != ']') + if (len > 1 && msg.func[len - 1] != ']') buffer += "()"; } break; } - + case 'd': { // date/timestamp if (msg.timestamp != 0.0) { append_sprintf(buffer, "%f", msg.timestamp); } break; } - + case 't': { // thread if (msg.threadName) { buffer += msg.threadName; @@ -751,7 +765,7 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok } break; } - + case 'f': // file:line case 'F': { if (msg.file) { @@ -761,9 +775,9 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok const char fileSeparator = '/'; #endif bool isVerbose = c == 'L'; - + const char* filename = msg.file; - + // shorten filename if (!isVerbose) { const char* shortFilename = strrchr(filename, fileSeparator); @@ -772,7 +786,7 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok filename = shortFilename; } } - + #if KRAM_WIN // format needed for Visual Studio to collect/clickthrough append_sprintf(buffer, "%s(%d)", filename, msg.line); @@ -787,8 +801,8 @@ static void formatMessage(string& buffer, const LogMessage& msg, const char* tok } } - -bool isMessageFiltered(const LogMessage& msg) { +bool isMessageFiltered(const LogMessage& msg) +{ #if KRAM_RELEASE if (msg.logLevel == LogLevelDebug) return true; @@ -798,12 +812,12 @@ bool isMessageFiltered(const LogMessage& msg) { void setMessageFields(LogMessage& msg, char threadName[kMaxThreadName]) { const char* text = msg.msg; - + msg.msgHasNewline = false; int32_t len = (int32_t)strlen(text); if (len >= 1 && text[len - 1] == '\n') msg.msgHasNewline = true; - + // Note: this could analyze the format tokens for all reporters. // Also may want a log file with own formatting/fields. #if KRAM_ANDROID @@ -814,12 +828,12 @@ void setMessageFields(LogMessage& msg, char threadName[kMaxThreadName]) if (msg.logLevel <= LogLevelInfo) return; #endif - + // fill out thread name getCurrentThreadName(threadName); if (threadName[0] != 0) msg.threadName = threadName; - + // retrieve timestamp msg.timestamp = currentTimestamp(); } @@ -827,49 +841,48 @@ void setMessageFields(LogMessage& msg, char threadName[kMaxThreadName]) static int32_t logMessageImpl(const LogMessage& msg) { // TODO: add any filtering up here, or before msg is built - + mylock lock(gLogState.lock); - + // this means caller needs to know all errors to display in the hud if (gLogState.isErrorLogCapture && msg.logLevel == LogLevelError) { gLogState.errorLogCaptureText += msg.msg; if (!msg.msgHasNewline) gLogState.errorLogCaptureText += "\n"; } - + // format into a buffer (it's under lock, so can use static) string& buffer = gLogState.buffer; - + gLogState.counter++; - + int32_t status = (msg.logLevel == LogLevelError) ? 1 : 0; - + #if KRAM_WIN - + // This is only needed for Window Gui. // Assumes gui app didn't call AllocConsole. 
if (gLogState.counter == 1) { bool hasConsole = ::GetStdHandle(STD_OUTPUT_HANDLE) != nullptr; - + // only way to debug a gui app without console is to attach debugger gLogState.isWindowsGuiApp = !hasConsole; } // TODO: test IsDebuggerPresent once per frame, not on every log gLogState.isWindowsDebugger = ::IsDebuggerPresent(); - + if (gLogState.isWindowsGuiApp && !gLogState.isWindowsDebugger) return status; - - + if (gLogState.isWindowsGuiApp) { char tokens[kMaxTokens] = {}; getFormatTokens(tokens, msg, DebuggerOutputDebugString); formatMessage(buffer, msg, tokens); - + // TODO: split string up into multiple logs // this is limited to 32K // OutputDebugString(buffer.c_str()); - + // This supports UTF8 strings by converting them to wide. // TODO: Wine doesn't handle. OutputDebugStringU(buffer.c_str(), buffer.size()); @@ -878,7 +891,7 @@ static int32_t logMessageImpl(const LogMessage& msg) char tokens[kMaxTokens] = {}; getFormatTokens(tokens, msg, Debugger); formatMessage(buffer, msg, tokens); - + // avoid double print to debugger FILE* fp = stdout; fwrite(buffer.c_str(), 1, buffer.size(), fp); @@ -896,7 +909,7 @@ static int32_t logMessageImpl(const LogMessage& msg) case LogLevelInfo: osLogLevel = ANDROID_LOG_INFO; break; - + case LogLevelWarning: osLogLevel = ANDROID_LOG_WARNING; break; @@ -904,39 +917,39 @@ static int32_t logMessageImpl(const LogMessage& msg) osLogLevel = ANDROID_LOG_ERROR; break; } - + if (!__android_log_is_loggable(osLogLevel, msg.group, __android_log_get_minimum_priority())) // will be default level if not set return status; - + char tokens[kMaxTokens] = {}; getFormatTokens(tokens, msg, DebuggerLogcat); formatMessage(buffer, msg, tokens); - + // TODO: split string up into multiple logs by /n // this can only write 4K - 80 chars at time, don't use print it's 1023 // API 30 __android_log_message msg = { - LOG_ID_MAIN, msg.file, msg.line, buffer.c_str(), osLogLevel, sizeof(__android_log_message), msg.group - }; + LOG_ID_MAIN, msg.file, msg.line, buffer.c_str(), osLogLevel, sizeof(__android_log_message), msg.group}; __android_log_write_log_message(msg); #else - -#if KRAM_IOS || KRAM_MAC + +#if KRAM_APPLE // test os_log - - static bool useOSLog = true; - if (useOSLog) - { + + // TODO: setting useOSLog to true, breaks all output from kramc + // but it shows up in debugger. So stop using it. + static bool useOSLog = false; + if (useOSLog) { char tokens[kMaxTokens] = {}; getFormatTokens(tokens, msg, DebuggerOSLog); formatMessage(buffer, msg, tokens); - + // os_log reports this as the callsite, and doesn't jump to another file // or if the dso is even passed from this file, the file/line aren't correct. // So os_log_impl is grabbing return address whithin the function that can't be set. // So have to inject the NSLog, os_log, syslog calls directly into code, but that // not feasible. This will at least color the mesages. - + auto osLogLevel = OS_LOG_TYPE_INFO; switch (msg.logLevel) { case LogLevelDebug: @@ -945,7 +958,7 @@ static int32_t logMessageImpl(const LogMessage& msg) case LogLevelInfo: osLogLevel = OS_LOG_TYPE_INFO; break; - + case LogLevelWarning: osLogLevel = OS_LOG_TYPE_ERROR; // no warning level break; @@ -953,10 +966,10 @@ static int32_t logMessageImpl(const LogMessage& msg) osLogLevel = OS_LOG_TYPE_FAULT; break; } - + // TODO: have kramc and kramv using this logger, can we get at subsystem? 
const char* subsystem = "com.hialec.kram"; - + os_log_with_type(os_log_create(subsystem, msg.group), osLogLevel, "%{public}s", buffer.c_str()); } else @@ -965,7 +978,7 @@ static int32_t logMessageImpl(const LogMessage& msg) char tokens[kMaxTokens] = {}; getFormatTokens(tokens, msg, Debugger); formatMessage(buffer, msg, tokens); - + FILE* fp = stdout; fwrite(buffer.c_str(), 1, buffer.size(), fp); // if heavy logging, then could delay fflush @@ -973,19 +986,17 @@ static int32_t logMessageImpl(const LogMessage& msg) } #endif - return status; // reserved for later + return status; // reserved for later } - - int32_t logMessage(const char* group, int32_t logLevel, - const char* file, int32_t line, const char* func, - const char* fmt, ...) + const char* file, int32_t line, const char* func, + const char* fmt, ...) { void* dso = nullptr; void* logAddress = nullptr; - -#if KRAM_IOS || KRAM_MAC + +#if KRAM_APPLE dso = &__dso_handle; // may need to come from call site for the mach_header of .o logAddress = __builtin_return_address(0); // or __builtin_frame_address(0)) #elif KRAM_WIN @@ -994,21 +1005,21 @@ int32_t logMessage(const char* group, int32_t logLevel, // from DbgHelp.dll logAddress = _ReturnAddress(); // or _AddressOfReturnAddress() #endif - + LogMessage logMessage = { group, logLevel, - file, line, func, + file, line, func, nullptr, 0.0, // threadname, timestamp - + // must set -no_pie to use __builtin_return_address to turn off ASLR dso, logAddress, nullptr, false, // msg, msgHasNewline }; - + if (isMessageFiltered(logMessage)) { return 0; } - + // convert var ags to a msg const char* msg = nullptr; @@ -1029,43 +1040,44 @@ int32_t logMessage(const char* group, int32_t logLevel, int res = vsprintf(str, fmt, args); va_end(args); if (res < 0) return 0; - + msg = str.c_str(); } - + logMessage.msg = msg; - + char threadName[kMaxThreadName] = {}; setMessageFields(logMessage, threadName); return logMessageImpl(logMessage); } - // This is the api reference for fmt. // Might be able to use std::format in C++20 instead, but nice // to have full source to impl to fix things in fmt. // https://fmt.dev/latest/api.html#_CPPv4IDpEN3fmt14formatted_sizeE6size_t13format_stringIDp1TEDpRR1T -// TODO: can this use NAMESPACE_STL::string_view instead ? +#if 1 // !KRAM_VISION // exceptions causing this not to link + +// TODO: can this use STL_NAMESPACE::string_view instead ? int32_t logMessage(const char* group, int32_t logLevel, - const char* file, int32_t line, const char* func, - fmt::string_view format, fmt::format_args args) + const char* file, int32_t line, const char* func, + fmt::string_view format, fmt::format_args args) { // TODO: size_t size = std::formatted_size(format, args); // and then reserve that space in str. Use that for impl of append_format. 
// can then append to existing string (see vsprintf) -#if KRAM_IOS || KRAM_MAC +#if KRAM_APPLE void* dso = &__dso_handle; void* logAddress = __builtin_return_address(0); // or __builtin_frame_address(0)) #else void* dso = nullptr; void* logAddress = nullptr; #endif - + LogMessage logMessage = { group, logLevel, file, line, func, nullptr, 0.0, // threadName, timestamp - + // must set -no_pie to use __builtin_return_address to turn off ASLR dso, logAddress, nullptr, false, // msg, msgHasNewline @@ -1073,15 +1085,17 @@ int32_t logMessage(const char* group, int32_t logLevel, if (isMessageFiltered(logMessage)) { return 0; } - + string str = fmt::vformat(format, args); const char* msg = str.c_str(); - + logMessage.msg = msg; - + char threadName[kMaxThreadName] = {}; setMessageFields(logMessage, threadName); return logMessageImpl(logMessage); } -} // namespace kram +#endif + +} // namespace kram diff --git a/libkram/kram/KramLog.h b/libkram/kram/KramLog.h index 090f0b69..355c12fb 100644 --- a/libkram/kram/KramLog.h +++ b/libkram/kram/KramLog.h @@ -1,10 +1,17 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #pragma once #include + +// This has __printflike on macOS/Linux +#if KRAM_WIN +// TODO: fix for Win +#else +#include +#endif //#include // #include "KramConfig.h" @@ -37,7 +44,7 @@ extern int32_t logMessage(const char* group, int32_t logLevel, #define KVERIFY(x) KASSERT(x) #else #define KASSERT(x) -#define KVERIFY(x) (x) +#define KVERIFY(x) (void)(x) #endif // save code space, since file/func aren't output for debug/info @@ -47,7 +54,7 @@ extern int32_t logMessage(const char* group, int32_t logLevel, #define KLOGE(group, fmt, ...) logMessage(group, kram::LogLevelError, __FILE__, __LINE__, __FUNCTION__, fmt, ##__VA_ARGS__) // TODO: move to Strings.h -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; // when set true, the internal string is cleared void setErrorLogCapture(bool enable); @@ -69,6 +76,10 @@ int32_t append_sprintf(string& str, const char* format, ...) __printflike(2, 3); // returns length of chars appended, -1 if failure int32_t append_vsprintf(string& str, const char* format, va_list args); +// formats and returns string +// ugh - name conflict if format or format_string or format_str +string format(const char* format, ...) __printflike(1, 2); + bool startsWith(const char* str, const string& substring); bool endsWithExtension(const char* str, const string& substring); @@ -106,5 +117,4 @@ inline uint32_t StringToUInt32(const char* num) return (int32_t)StringToUInt64(num); } - -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramMipper.cpp b/libkram/kram/KramMipper.cpp index 926fa9b2..dbe74b02 100644 --- a/libkram/kram/KramMipper.cpp +++ b/libkram/kram/KramMipper.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
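The KramLog.h hunk above introduces a `format()` helper alongside the existing `append_sprintf()`, both checked by `__printflike`. The snippet below is a usage sketch, not code from the patch: the function and its arguments are invented for illustration, while `format()`, `append_sprintf()`, and the `KLOGE` macro are the declarations shown above.

```cpp
#include <cstdint>

#include "KramLog.h"

using namespace kram;

// Sketch: build a message with the new format() helper, extend it in place with
// append_sprintf(), and emit it through the error macro declared above.
void reportMipSizeMismatch(const char* filename, int32_t mipLevel,
                           size_t expectedSize, size_t actualSize)
{
    string msg = format("mip %d of %s has unexpected length", mipLevel, filename);
    append_sprintf(msg, " (expected %zu, got %zu)", expectedSize, actualSize);

    // KLOGE forwards file/line/function into logMessage(), as shown above.
    KLOGE("Kram", "%s\n", msg.c_str());
}
```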
@@ -7,12 +7,12 @@ //#include #include -#include "KTXImage.h" // for mipDown +#include "KTXImage.h" // for mipDown namespace kram { -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; Mipper::Mipper() { initTables(); } @@ -36,7 +36,7 @@ int32_t nextPow2(int32_t num) inline uint8_t floatToUint8(float value) { - return (uint8_t)roundf(value * 255.0f); // or use 255.1f ? + return (uint8_t)roundf(value * 255.0f); // or use 255.1f ? } // same as ColorFromUnormFloat4 @@ -161,8 +161,8 @@ void Mipper::initTables() lin = srgbToLinearFunc(linearToSRGBFunc(lin)); float s = 0.5; - s = srgbToLinearFunc(s); // 0.21404 - s = linearToSRGBFunc(s); // back to 0.5 + s = srgbToLinearFunc(s); // 0.21404 + s = linearToSRGBFunc(s); // back to 0.5 } #endif } @@ -171,8 +171,8 @@ void Mipper::initPixelsHalfIfNeeded(ImageData& srcImage, bool doPremultiply, boo vector& halfImage) const { Color zeroColor = {0, 0, 0, 0}; - float4 zeroColorf = float4m(0.0, 0.0f, 0.0f, 0.f); // need a constant for this - half4 zeroColorh = toHalf4(zeroColorf); + float4 zeroColorf = float4m(0.0f); // need a constant for this + half4 zeroColorh = half4m(zeroColorf); int32_t w = srcImage.width; int32_t h = srcImage.height; @@ -220,7 +220,7 @@ void Mipper::initPixelsHalfIfNeeded(ImageData& srcImage, bool doPremultiply, boo // } // else { - halfImage[y0 + x] = toHalf4(cFloat); + halfImage[y0 + x] = half4m(cFloat); } // only have to rewrite src alpha/color if there is alpha and it's premul @@ -248,7 +248,7 @@ void Mipper::initPixelsHalfIfNeeded(ImageData& srcImage, bool doPremultiply, boo else { float4 cFloat = {alphaToFloat[c0.r], alphaToFloat[c0.g], alphaToFloat[c0.b], alphaToFloat[c0.a]}; - halfImage[y0 + x] = toHalf4(cFloat); + halfImage[y0 + x] = half4m(cFloat); } } } @@ -274,7 +274,7 @@ void Mipper::initPixelsHalfIfNeeded(ImageData& srcImage, bool doPremultiply, boo // } // else { - halfImage[y0 + x] = toHalf4(cFloat); + halfImage[y0 + x] = half4m(cFloat); } // only have to rewrite color if there is alpha @@ -352,43 +352,43 @@ void mipfloodBigMip(const ImageData& smallMip, ImageData& bigMip) // horizontal or vertically, so lower mip mapping not so easy // if we assume pow2, then simpler. Could still have non-square // pow2, which don't want to read off end of buffer. - + uint32_t w = bigMip.width; uint32_t h = bigMip.height; - + uint32_t wDst = smallMip.width; uint32_t hDst = smallMip.height; - + const uint8_t kAlphaThreshold = 0; - + // now run through the pixels with 0 alpha, and flood them with pixel from below for (uint32_t y = 0; y < h; ++y) { - Color* srcRow = &bigMip.pixels[y * w]; - uint32_t yDst = y/2; + Color* srcRow = &bigMip.pixels[y * w]; + uint32_t yDst = y / 2; if (yDst >= hDst) yDst = hDst - 1; - + const Color* dstRow = &smallMip.pixels[yDst * wDst]; - + for (uint32_t x = 0; x < w; ++x) { // skip any pixels above threshold Color& srcPixel = srcRow[x]; if (srcPixel.a > kAlphaThreshold) continue; - + // replace the rest - uint32_t xDst = x/2; + uint32_t xDst = x / 2; if (xDst == wDst) xDst = wDst - 1; - + Color dstPixel = dstRow[xDst]; dstPixel.a = srcPixel.a; - + // an invalid premul color with rgb > a, may want valid non-premul srcPixel = dstPixel; } } } - + // Propogate up from bottom so that every 0 pixel gets a non-zero value. void Mipper::mipflood(vector& mips) const { @@ -398,17 +398,15 @@ void Mipper::mipflood(vector& mips) const // Unclear why they didn't use premul instead, but maybe compression // quality was better. 
So this masks the filtering errors of black halos. // https://www.youtube.com/watch?v=MKX45_riWQA?t=2991 - + int32_t numMipLevels = mips.size(); - + // this overwrites the existing mips - for (int32_t i = numMipLevels-1; i >= 1; --i) - { - mipfloodBigMip(mips[i], mips[i-1]); + for (int32_t i = numMipLevels - 1; i >= 1; --i) { + mipfloodBigMip(mips[i], mips[i - 1]); } } - void Mipper::mipmap(const ImageData& srcImage, ImageData& dstImage) const { dstImage.width = srcImage.width; @@ -441,8 +439,8 @@ void Mipper::mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) cons int32_t dstIndex = 0; - bool isOddX = width & 1; - bool isOddY = height & 1; + bool isOddX = (width > 1) && (width & 1); + bool isOddY = (height > 1) && (height & 1); // advance always by 2, but sample from neighbors int32_t mipWidth = std::max(1, width / 2); @@ -454,7 +452,7 @@ void Mipper::mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) cons // After linear combine, convert back to srgb // mip source is always linear to build all levels. bool isSRGBDst = dstImage.isSRGB; - + for (int32_t y = isOddY ? 1 : 0; y < height; y += 2) { int32_t ym = y - 1; int32_t y0 = y; @@ -466,8 +464,21 @@ void Mipper::mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) cons float y0w = mipHeight * invHeight; float y1w = mipY * invHeight; - if (!isOddY) { - ym = y; // weight is 0 + if (height == 3) { + ymw = 1.0f/3.0f; + y0w = 1.0f/3.0f; + y1w = 1.0f/3.0f; + } + else if (height == 1) { + ym = y; // weight is 0 + y1 = y; + + ymw = 0.0f; + y0w = 1.0f; + y1w = 0.0f; + } + else if (!isOddY) { + ym = y; // weight is 0 ymw = 0.0f; y0w = 0.5f; @@ -495,8 +506,21 @@ void Mipper::mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) cons float x0w = mipWidth * invWidth; float x1w = mipX * invWidth; - if (!isOddX) { - xm = x; // weight is 0 + if (width == 3) { + xmw = 1.0f/3.0f; + x0w = 1.0f/3.0f; + x1w = 1.0f/3.0f; + } + else if (width == 1) { + xm = x; // weight is 0 + x1 = x; + + xmw = 0.0f; + x0w = 1.0f; + x1w = 0.0f; + } + else if (!isOddX) { + xm = x; // weight is 0 xmw = 0.0f; x0w = 0.5f; @@ -515,17 +539,17 @@ void Mipper::mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) cons float4 c[9]; if (srcHalf) { - c[0] = toFloat4(srcHalf[ym + xm]); - c[1] = toFloat4(srcHalf[ym + x0]); - c[2] = toFloat4(srcHalf[ym + x1]); + c[0] = float4m(srcHalf[ym + xm]); + c[1] = float4m(srcHalf[ym + x0]); + c[2] = float4m(srcHalf[ym + x1]); - c[3] = toFloat4(srcHalf[y0 + xm]); - c[4] = toFloat4(srcHalf[y0 + x0]); - c[5] = toFloat4(srcHalf[y0 + x1]); + c[3] = float4m(srcHalf[y0 + xm]); + c[4] = float4m(srcHalf[y0 + x0]); + c[5] = float4m(srcHalf[y0 + x1]); - c[6] = toFloat4(srcHalf[y1 + xm]); - c[7] = toFloat4(srcHalf[y1 + x0]); - c[8] = toFloat4(srcHalf[y1 + x1]); + c[6] = float4m(srcHalf[y1 + xm]); + c[7] = float4m(srcHalf[y1 + x0]); + c[8] = float4m(srcHalf[y1 + x1]); } else if (srcFloat) { c[0] = srcFloat[ym + xm]; @@ -575,7 +599,7 @@ void Mipper::mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) cons if (srcHalf) { // overwrite float4 image - cDstHalf[dstIndex] = toHalf4(cFloat); + cDstHalf[dstIndex] = half4m(cFloat); // assume hdr pulls from half/float data if (!srcImage.isHDR) { @@ -649,11 +673,11 @@ void Mipper::mipmapLevel(const ImageData& srcImage, ImageData& dstImage) const const half4* srcHalf = srcImage.pixelsHalf; // Note the ptrs above may point to same memory - + // After linear combine, convert back to srgb // mip source is always linear to build all levels. 
bool isSRGBDst = dstImage.isSRGB; - + int32_t dstIndex = 0; for (int32_t y = 0; y < height; y += 2) { @@ -668,17 +692,17 @@ void Mipper::mipmapLevel(const ImageData& srcImage, ImageData& dstImage) const if (srcHalf) { float4 c0, c1, c2, c3; - c0 = toFloat4(srcHalf[y0 + x0]); - c1 = toFloat4(srcHalf[y0 + x1]); - c2 = toFloat4(srcHalf[y1 + x0]); - c3 = toFloat4(srcHalf[y1 + x1]); + c0 = float4m(srcHalf[y0 + x0]); + c1 = float4m(srcHalf[y0 + x1]); + c2 = float4m(srcHalf[y1 + x0]); + c3 = float4m(srcHalf[y1 + x1]); // mip filter is simple box filter // assumes alpha premultiplied already float4 cFloat = (c0 + c1 + c2 + c3) * 0.25; // overwrite half4 image - cDstHalf[dstIndex] = toHalf4(cFloat); + cDstHalf[dstIndex] = half4m(cFloat); // assume hdr pulls from half/float data if (!srcImage.isHDR) { @@ -746,4 +770,4 @@ void Mipper::mipmapLevel(const ImageData& srcImage, ImageData& dstImage) const } } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramMipper.h b/libkram/kram/KramMipper.h index 7bc6ff0d..e06d227b 100644 --- a/libkram/kram/KramMipper.h +++ b/libkram/kram/KramMipper.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -10,8 +10,8 @@ //#include "KramConfig.h" namespace kram { -using namespace NAMESPACE_STL; -using namespace simd; +using namespace STL_NAMESPACE; +using namespace SIMD_NAMESPACE; // return whether num is pow2 bool isPow2(int32_t num); @@ -35,29 +35,28 @@ inline Color toPremul(Color c) return c; } - -inline float4 ColorToUnormFloat4(const Color &value) +inline float4 ColorToUnormFloat4(const Color& value) { // simd lib can't ctor these even in C++, so will make abstracting harder float4 c = float4m((float)value.r, (float)value.g, (float)value.b, (float)value.a); return c / 255.0f; } -inline float4 ColorToSnormFloat4(const Color &value) +inline float4 ColorToSnormFloat4(const Color& value) { float4 c = float4m((float)value.r, (float)value.g, (float)value.b, (float)value.a); return (c - float4(128.0f)) / 255.0f; } -inline Color ColorFromUnormFloat4(const float4 &value) +inline Color ColorFromUnormFloat4(const float4& value) { float4 c = round(saturate(value) * 255.0f); - Color color = { (uint8_t)c.x, (uint8_t)c.y, (uint8_t)c.z, (uint8_t)c.w }; + Color color = {(uint8_t)c.x, (uint8_t)c.y, (uint8_t)c.z, (uint8_t)c.w}; return color; } // for signed bc4/5, remap the endpoints after unorm fit -void remapToSignedBCEndpoint88(uint16_t &endpoint); +void remapToSignedBCEndpoint88(uint16_t& endpoint); // for decoding bc4/5 snorm, convert block to unsigned endpoints before decode void remapFromSignedBCEndpoint88(uint16_t& endpoint); @@ -71,16 +70,16 @@ class ImageData { public: // data can be mipped as 8u, 16f, or 32f. Prefer smallest size. // half is used when srgb/premultiply is used. 32f is really only for r/rg/rgba32f mips. - Color *pixels = nullptr; - half4 *pixelsHalf = nullptr; // optional - float4 *pixelsFloat = nullptr; // optional + Color* pixels = nullptr; + half4* pixelsHalf = nullptr; // optional + float4* pixelsFloat = nullptr; // optional int32_t width = 0; int32_t height = 0; int32_t depth = 0; bool isSRGB = false; - bool isHDR = false; // only updates pixelsFloat + bool isHDR = false; // only updates pixelsFloat }; class Mipper { @@ -102,7 +101,7 @@ class Mipper { // wherever the alpha is 0. 
This is a form of cheap // dilation, but will result in invalid premul colors r > a. void mipflood(vector& srcImage) const; - + // these use table lookups, so need to be class members float toLinear(uint8_t srgb) const { return srgbToLinear[srgb]; } float toAlphaFloat(uint8_t alpha) const { return alphaToFloat[alpha]; } @@ -111,7 +110,6 @@ class Mipper { uint8_t toPremul(uint8_t channelIntensity, uint8_t alpha) const { return ((uint32_t)channelIntensity * (uint32_t)alpha) / 255; } - private: void initTables(); @@ -120,4 +118,4 @@ class Mipper { void mipmapLevelOdd(const ImageData& srcImage, ImageData& dstImage) const; }; -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramMmapHelper.cpp b/libkram/kram/KramMmapHelper.cpp index fa3b4a4b..e53d0b8d 100644 --- a/libkram/kram/KramMmapHelper.cpp +++ b/libkram/kram/KramMmapHelper.cpp @@ -1,23 +1,24 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. #include "KramMmapHelper.h" -// here's how to mmmap data, but NSData may have another way +// here's how to mmmap data, but NSData has another way +// +dataWithContentsOfURL:options:error: and NSDataReadingMappedIfSafe or NSDataReadingMappedAlways." + #include #include -#if KRAM_MAC || KRAM_IOS || KRAM_LINUX +#if KRAM_APPLE || KRAM_LINUX #include #include #elif KRAM_WIN -// portable mmap implementation, but only using on Win -// TODO: this indicates that it leaks a CreateFileMapping handle, since it wanted to keep same mmap/munmap api #include "win_mmap.h" #endif MmapHelper::MmapHelper() {} + MmapHelper::MmapHelper(MmapHelper &&rhs) { addr = rhs.addr; @@ -30,7 +31,7 @@ MmapHelper::MmapHelper(MmapHelper &&rhs) MmapHelper::~MmapHelper() { close(); } -bool MmapHelper::open(const char *filename) +bool MmapHelper::open(const char* filename) { if (addr) { return false; @@ -49,12 +50,14 @@ bool MmapHelper::open(const char *filename) } length = sb.st_size; + // Only offset needs padded to pagesize, but here offset is always 0 + // Stop padding out to page size, or do but then don't add to length, or will walk too far in memory // all remaining page data will be zero, but still want length to reflect actual length of file // need Windows equilvent of getpagesize() call before putting this back. This was to use // with MTLBuffer no copy which has a strict page alignment requirement on start and size. // - //#if KRAM_MAC || KRAM_LINUX || KRAM_IOS + //#if KRAM_APPLE || KRAM_LINUX // // pad it out to the page size (this can be 4k or 16k) // // need this alignment, or it can't be converted to a MTLBuffer // size_t pageSize = FileHelper::pagesize(); @@ -68,7 +71,7 @@ bool MmapHelper::open(const char *filename) // this needs to be MAP_SHARED or Metal can't reference with NoCopy addr = (const uint8_t *)mmap(nullptr, length, PROT_READ, MAP_SHARED, fd, 0); - fclose(fp); // mmap keeps pages alive until munmap + fclose(fp); // mmap keeps pages alive until munmap if (addr == MAP_FAILED) { return false; diff --git a/libkram/kram/KramMmapHelper.h b/libkram/kram/KramMmapHelper.h index 125a4a6f..7f986018 100644 --- a/libkram/kram/KramMmapHelper.h +++ b/libkram/kram/KramMmapHelper.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
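The KramMmapHelper.cpp hunks above keep the whole-file read-only mapping (MAP_SHARED so Metal can reference it without a copy, with the FILE* closed immediately since mmap keeps the pages alive). The sketch below shows how a caller would use it, combining open() from the .cpp above with the data()/dataLength() accessors in the header hunk that follows; the printing call site is invented for illustration.

```cpp
#include <cstdio>

#include "KramMmapHelper.h"

// Sketch: map a file and inspect the bytes without copying them.
bool printMappedLength(const char* path)
{
    MmapHelper map;
    if (!map.open(path))
        return false; // file missing or mmap failed

    printf("%s: %zu bytes mapped\n", path, map.dataLength());

    // map.data() stays valid until close() or the destructor runs; the pages
    // remain alive even though open() already closed the FILE*.
    const uint8_t* bytes = map.data();
    (void)bytes; // hand off to a KTX/KTX2 parser here

    return true;
}
```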
@@ -19,10 +19,10 @@ class MmapHelper { bool open(const char *filename); void close(); - const uint8_t *data() { return addr; } + const uint8_t* data() { return addr; } size_t dataLength() { return length; } private: - const uint8_t *addr = nullptr; + const uint8_t* addr = nullptr; size_t length = 0; }; diff --git a/libkram/kram/KramSDFMipper.cpp b/libkram/kram/KramSDFMipper.cpp index 632be5f1..4b2fa977 100644 --- a/libkram/kram/KramSDFMipper.cpp +++ b/libkram/kram/KramSDFMipper.cpp @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -6,13 +6,13 @@ //#include -#include "KTXImage.h" // for mipDown +#include "KTXImage.h" // for mipDown #include "KramMipper.h" #include "hedistance.h" namespace kram { using namespace heman; -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; void SDFMipper::init(ImageData& srcImage, uint8_t sdfThreshold, bool isVerbose_) { @@ -22,15 +22,15 @@ void SDFMipper::init(ImageData& srcImage, uint8_t sdfThreshold, bool isVerbose_) maxD = 0.0; isVerbose = isVerbose_; threshold = sdfThreshold; - + int32_t w = srcImage.width; int32_t h = srcImage.height; srcBitmap.resize(w * h); // store bitmap in 8-bit grayscale - const Color* pixels = srcImage.pixels; // 4 bytes - uint8_t* dstImageData = (uint8_t*)srcBitmap.data(); // 1 byte + const Color* pixels = srcImage.pixels; // 4 bytes + uint8_t* dstImageData = (uint8_t*)srcBitmap.data(); // 1 byte for (int32_t y = 0; y < h; y++) { int32_t y0 = y * w; @@ -65,7 +65,7 @@ void SDFMipper::mipmap(ImageData& dstImage, int32_t mipLevel) dstImage.width = w; dstImage.height = h; - Color* pixels = dstImage.pixels; // 4 bytes + Color* pixels = dstImage.pixels; // 4 bytes // stuff back into the rgb channel of the dst texture to feed to encoder // have to do in reverse, since we're expanding 1 channel to 4 @@ -74,7 +74,7 @@ void SDFMipper::mipmap(ImageData& dstImage, int32_t mipLevel) heman_distance_create_sdf((const heman::my_image*)&srcBitmapImage, (heman::my_image*)&dst, maxD, isVerbose); - const uint8_t* srcImageData = (const uint8_t*)pixels; // 1 byte + const uint8_t* srcImageData = (const uint8_t*)pixels; // 1 byte for (int32_t y = h - 1; y >= 0; y--) { int32_t y0 = y * w; @@ -93,4 +93,4 @@ void SDFMipper::mipmap(ImageData& dstImage, int32_t mipLevel) } } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramSDFMipper.h b/libkram/kram/KramSDFMipper.h index c9f0e187..f724ef73 100644 --- a/libkram/kram/KramSDFMipper.h +++ b/libkram/kram/KramSDFMipper.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
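SDFMipper::init() above reduces the 4-byte Color source to a 1-byte grayscale bitmap, which the heman distance transform then turns into signed-distance mips. The loop body that does the reduction is elided from this diff, so the sketch below is only illustrative of that kind of threshold step; the channel choice, comparison direction, and function name are assumptions, not the patch's actual code.

```cpp
#include <cstdint>
#include <vector>

// Illustrative reduction of one 8-bit channel to a 0/255 coverage bitmap,
// the shape of input heman_distance_create_sdf() expects.
void buildCoverageBitmap(const uint8_t* srcChannel, int32_t w, int32_t h,
                         uint8_t threshold, std::vector<uint8_t>& dstBitmap)
{
    dstBitmap.resize(size_t(w) * size_t(h));
    for (int32_t i = 0, count = w * h; i < count; ++i) {
        // treat pixels at/above the threshold as inside the shape
        dstBitmap[i] = (srcChannel[i] >= threshold) ? 255 : 0;
    }
}
```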
@@ -9,7 +9,7 @@ //#include "KramConfig.h" namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; class ImageData; @@ -35,4 +35,4 @@ class SDFMipper { vector srcBitmap; }; -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramThreadPool.cpp b/libkram/kram/KramThreadPool.cpp new file mode 100644 index 00000000..c71f10d1 --- /dev/null +++ b/libkram/kram/KramThreadPool.cpp @@ -0,0 +1,401 @@ +#include "KramThreadPool.h" + +// #include +//#include + +#if KRAM_WIN +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif +#include +//#include +#endif + +#if KRAM_LINUX +#include +#endif + +// Remove this, and move out all the threading prims +#include "TaskSystem.h" + +// Android is missing defines +#if KRAM_ANDROID +#ifndef SYS_futex +# define SYS_futex __NR_futex +#endif +#ifndef FUTEX_WAIT_BITSET +# define FUTEX_WAIT_BITSET 9 +#endif +#ifndef FUTEX_WAKE_BITSET +# define FUTEX_WAKE_BITSET 10 +#endif +#ifndef FUTEX_PRIVATE_FLAG +# define FUTEX_PRIVATE_FLAG 128 +#endif +#endif + +// TODO: don't understand how jobs get distributed to the various queues +// Especially if all jobs are coming in from the main/scheduler thread. +// +// TODO: add ability to grow/shrink workers +// +// TODO: sucky part of using Worker* is that these are spread across memory +// but can grow/shrink count. +// +// Inspired by notes from Andreas Fredriksson on building a better thread +// pool. Otherwise, every pool just uses a single cv and mutex. That cv +// then has to do a lot of wakey wakey when the thread/core counts are high. +// +// Talks about Unity thread pool +// https://unity.com/blog/engine-platform/improving-job-system-performance-2022-2-part-1 +// +// https://unity.com/blog/engine-platform/improving-job-system-performance-2022-2-part-2 + +namespace kram { +using namespace STL_NAMESPACE; + + +// futex is 0 when waiting, and 1+ when active. +// futex wait and timout support with newer macOS API, but requires iOS17.4/macOS14.4. +// #include "os/os_sync_wait_on_address.h" +// int os_sync_wait_on_address(void *addr, uint64_t value, size_t size, os_sync_wait_on_address_flags_t flags); + + +#if KRAM_MAC + +// C++20 calls below. +// iOS 14/macOS 11 + +void futex::wait(uint32_t expectedValue) { + auto monitor = __libcpp_atomic_monitor(&_value); + // Check again if we should still go to sleep. + if (_value.load(memory_order_relaxed) != expectedValue) { + return; + } + // Wait, but only if there's been no new notifications + // since we acquired the monitor. + __libcpp_atomic_wait(&_value, monitor); +} + +void futex::notify_one() { + __cxx_atomic_notify_one(&_value); +} + +void futex::notify_all() { + __cxx_atomic_notify_all(&_value); +} + + +#elif KRAM_WIN + +// Win8+ + +void futex::wait(uint32_t expectedValue) { + // this waits until value shifts + WaitOnAddress(&_value, &expectedValue, sizeof(uint32_t), INFINITE); +} + +void futex::notify_one() { + WakeByAddressSingle(&_value); +} + +void futex::notify_all() { + WakeByAddressAll(&_value); +} + + +#elif KRAM_LINUX || KRAM_ANDROID + +// Linux 2.6.7 +// Only has uint32_t support + +void futex::wait(uint32_t expectedValue) { + syscall(SYS_futex, &_value, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, + NULL, NULL, expectedValue); +} + +void futex::notify_one() { + syscall(SYS_futex, &_value, FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, + NULL, NULL, 1); +} + +void futex::notify_all() { + syscall(SYS_futex, &_value, FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, + NULL, NULL, INT32_MAX); // TODO: UINT32_MAX? 
+} + +#endif + +// Each thread has it's own queue + +// main thread is responsible for making sure one thread is awake +// when it schedules a job. +// but main no longer has to drive waking, since each worker can + +Scheduler::Scheduler(uint32_t numWorkers) { + // TODO: numWorkers must be even number? If buddy are paired + // if each worker uses next as buddy, then can be odd. + + _schedulerThread = getCurrentThread(); + + // TODO: should move the scheduler settings out + ThreadInfo infoMain = {"Sheduler", ThreadPriority::Interactive, 0}; + setThreadInfo(infoMain); + + string name; + + for (uint32_t threadIndex = 0; threadIndex < numWorkers; ++threadIndex) { + // These have to be ptr, due to uncopyable futex/mutex + Worker* worker = new Worker(); + sprintf(name, "Task%d", threadIndex); + worker->Init( name, threadIndex, this); + + _workers.push_back(worker); + } + + + // Note that running work on core0 when core0 may starve it + // from assigning work to threads. + + // start up the threads + + for (uint32_t threadIndex = 0; threadIndex < numWorkers; ++threadIndex) { + // Generate a name, also corresponds to core for affinity + // May want to include priority too. + Worker& worker = *_workers[threadIndex]; + + _threads.emplace_back([threadIndex, &worker] { + + // This is setting affinity to threadIndex + ThreadInfo infoTask = {worker._name.c_str(), ThreadPriority::High, (int)threadIndex}; + setThreadInfo(infoTask); + + worker.run(); + }); + } +} + + +// TODO: this is one way to pass pririty with priority +// template +// void scheduleJob(int priority, F f) { +// sheduleJob(Job2(priority, f)); +// } + +void Scheduler::scheduleJob(Job2& job) { + auto currentThread = getCurrentThread(); + + // job subtasks always first pushed to their own queue. + // if this is null, then it's either the scheduler thread or a random thread + // trying to submit a job (which goes to scheduler). + Worker* worker = findWorker(currentThread); + + // Already on same thread. That thread is awake. + // But another thread could be stealing a job, + // So for now project queue with mutex. + + if (currentThread == _schedulerThread || !worker) { + // Need to pick a best queue to put work on? + // otherwise everything gets stuck on scheduler queue + // and then is stolen off it. + + // Scheduler thread needs to ensure a worker is awake + // since it doesn't run it's own work? + + // Atomic count per Worker helps here. Can read outside + // of lock, and can then spread work more evenly. + + uint32_t minQueue = 0; + uint32_t minQueueCount = _workers[0]->queueSize(); + + for (uint32_t i = 1; i < _workers.size(); ++i) { + uint32_t queueCount = _workers[i]->queueSize(); + if (queueCount < minQueueCount) { + minQueueCount = queueCount; + minQueue = i; + } + } + + worker = _workers[minQueue]; + + { + lock_guard lock(worker->_mutex); + worker->_queue.push(std::move(job)); + worker->incQueueSize(); + _stats.jobsTotal++; + } + + // here the scheduler or random thread needs to wake a worker + worker->_futex.notify_one(); + } + else { + lock_guard lock(worker->_mutex); + worker->_queue.push(std::move(job)); + worker->incQueueSize(); + _stats.jobsTotal++; + + // the job is already awake and scheduling to its own queue + // so don't need to notify. 
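// To recap the wake handshake used by both branches above: a push is always
// queue.push() + incQueueSize() (which bumps the worker's _futex._value), and
// _futex.notify_one() is only needed when the target worker may be asleep;
// Worker::run() further down parks with _futex.wait(0), so a non-zero counter
// or a pending notify lets it through. Compressed view (illustrative only):
//   worker->_queue.push(std::move(job));
//   worker->incQueueSize();      // _futex._value != 0 means "work available"
//   worker->_futex.notify_one(); // skipped when a worker queues to itself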
+ } +} + +void Scheduler::stop() +{ + // has to be called on scheduler thread + // just don't call from a worker + //KASSERT(getCurrentThread() == _schedulerThread); + + if (_isStop) + return; + + _isStop = true; + + for (uint32_t i = 0; i < _workers.size(); ++i) { + // wake it + _workers[i]->_futex.notify_one(); + + // wait on thread to end + _threads[i].join(); + + // since had to use ptrs, delete them + delete _workers[i]; + _workers[i] = nullptr; + } +} + + +bool Worker::stealFromOtherQueues(Job2& job) +{ + // Is this safe to test? + if (_scheduler->stats().jobsRemaining() == 0) + return false; + + bool found = false; + + auto& workers = _scheduler->workers(); + + // This will visit buddy and then the rest + for (uint32_t i = 0; i < workers.size()-1; ++i) { + Worker* worker = workers[(_workerId+1+i) % workers.size()]; + + // This should never visit caller Worker. + KASSERT(worker != this); + + // loop of expensive queue mutex locks below searching for jobs + // use atomic queueSize per worker. A little racy. +// if (worker->queueSize() == 0) { +// continue; +// } + + lock_guard lock(worker->_mutex); + if (!worker->_queue.empty()) { + job = std::move(worker->_queue.top()); + worker->_queue.pop(); + worker->decQueueSize(); + + SchedulerStats& stats = _scheduler->stats(); + stats.jobsExecuting++; + + // stop search, since returning a job + found = true; + break; + } + + } + + return found; +} + +void Worker::wakeWorkers() +{ + // Is this safe to test? + if (_scheduler->stats().jobsRemaining() == 0) + return; + + // This takes responsibility off the main thread + // to keep waking threads to run tasks. + auto& workers = _scheduler->workers(); + + Worker* buddy = workers[(_workerId+1) % workers.size()]; + + if (!buddy->_isExecuting) { + buddy->_futex.notify_one(); + return; + } + + // TODO: should we only wake as many workers as jobs + // what if they are already awake and working? + // uint32_t numJobs = _scheduler->stats().jobsRemaining(); + + // Wrap around visit from just past buddy + for (uint32_t i = 0; i < workers.size()-2; ++i) { + Worker* worker = workers[(_workerId+2+i) % workers.size()]; + if (!worker->_isExecuting) { + worker->_futex.notify_one(); + break; + } + } +} + +bool Worker::shouldSleep() +{ + // TODO: needs to be more complex + // for parallel task exectution. + + return true; +} + +void Worker::run() +{ + SchedulerStats& stats = _scheduler->stats(); + + while(!_scheduler->isStop()) { + // Take a job from our worker thread’s local queue + Job2 job; + bool found = false; + { + lock_guard lock(_mutex); + if (!_queue.empty()) { + job = std::move(_queue.top()); + _queue.pop(); + decQueueSize(); + stats.jobsExecuting++; + found = true; + } + } + + // If our queue is empty try to steal work from someone + // else's queue to help them out. + if(!found) { + found = stealFromOtherQueues(job); + } + + if(found) { + // If we found work, there may be more conditionally + // wake up other workers as necessary + wakeWorkers(); + + // Any job spawned by job goes to same queue. + // But may get stolen by another thread. + // Try not to have tasks wait on sub-tasks + // or their thread is locked down. + _isExecuting = true; + job.execute(); + _isExecuting = false; + + // these can change a little out of order + stats.jobsExecuting--; + stats.jobsTotal--; + } + + // Conditionally go to sleep (perhaps we were told there is a + // parallel job we can help with) + else if(shouldSleep()) { + // Put the thread to sleep until more jobs are scheduled. 
+ // Wakes when value is non-zero and notify called. + _futex.wait(0); + } + } +} + +} // namespace kram diff --git a/libkram/kram/KramThreadPool.h b/libkram/kram/KramThreadPool.h new file mode 100644 index 00000000..0f527455 --- /dev/null +++ b/libkram/kram/KramThreadPool.h @@ -0,0 +1,119 @@ +#include + +namespace kram { +using namespace STL_NAMESPACE; + +// this must not rollover +using AtomicValue = atomic; + +// fast locking +class futex { +public: // for now leave this public + AtomicValue _value; + futex() = default; + +public: + // wait. wake when atomic does not match expectedValue and notify called + void wait(uint32_t expectedValue = 0); + + // wake first thread waiting + void notify_one(); + + // wake all threads wiating + void notify_all(); +}; + +// No affinity needed. OS can shedule threads from p to e core. +// What about skipping HT though. +class Scheduler; + +// This wraps priority and function together. +// A priority queue can then return higher priority jobs. +class Job2 { +public: + int priority = 0; // smaller type? + function job; + + Job2() {} + Job2(int p, function f) : priority(p), job(f) {} + + bool operator<(const Job2& other) const { + return priority > other.priority; // Higher priority comes first + } + + void execute() { job(); } +}; + +class Worker { +public: + string _name; + priority_queue _queue; + mutex _mutex; // for queue + futex _futex; // to wait/notify threads, holds count of jobs in queue + Scheduler* _scheduler = nullptr; + uint32_t _workerId = 0; + bool _isExecuting = false; + + void Init(const string& name, uint32_t workerId, Scheduler* scheduler) { + _name = name; + _workerId = workerId; + _scheduler = scheduler; + } + + // could be const, but it's atomic so volatile + uint32_t queueSize() { return _futex._value; } + void incQueueSize() { _futex._value++; } + void decQueueSize() { _futex._value--; } + + void run(); + +private: + bool stealFromOtherQueues(Job2& job); + void wakeWorkers(); + bool shouldSleep(); +}; + +class SchedulerStats { +public: + AtomicValue jobsTotal; + AtomicValue jobsExecuting; + uint32_t jobsRemaining() const { return jobsTotal - jobsExecuting; } +}; + +class Scheduler { +public: + Scheduler(uint32_t numWorkers); + ~Scheduler() { + if (!_isStop) { + stop(); + } + } + + void scheduleJob(Job2& job); + + bool isStop() const { return _isStop; } + void stop(); + + // Not really public API + vector& workers() { return _workers; } + + SchedulerStats& stats() { return _stats; } + +private: + Worker* findWorker(thread::native_handle_type currentThread) { + for (uint32_t i = 0; i < (uint32_t)_workers.size(); ++i) { + if (_threads[i].native_handle() == currentThread) { + return _workers[i]; + } + } + return nullptr; + } + + bool _isStop = false; + vector _workers; + vector _threads; + SchedulerStats _stats; + thread::native_handle_type _schedulerThread = 0; +}; + +} // namespace kram diff --git a/libkram/kram/KramTimer.cpp b/libkram/kram/KramTimer.cpp index b00b3a7a..01b9d566 100644 --- a/libkram/kram/KramTimer.cpp +++ b/libkram/kram/KramTimer.cpp @@ -1,20 +1,29 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
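A hedged usage sketch of the new Scheduler/Job2 pool declared in KramThreadPool.h above; the worker count, loop, and lambda body are placeholders, not taken from kram.

#include "KramThreadPool.h"

void runJobsExample()
{
    kram::Scheduler scheduler(4); // spins up workers named Task0..Task3

    for (int i = 0; i < 16; ++i) {
        kram::Job2 job(/*priority*/ 0, [i] {
            // per-item work goes here
        });
        scheduler.scheduleJob(job); // from this (scheduler) thread it lands on the least-loaded queue
    }

    // stop() wakes and joins the workers; it does not drain pending jobs first.
    scheduler.stop();
}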
#include "KramTimer.h" -#if 1 +#include "TaskSystem.h" #if KRAM_WIN +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif #include -#elif KRAM_MAC || KRAM_IOS +#elif KRAM_APPLE #include +#elif KRAM_ANDROID +#include +#elif KRAM_LINUX +#include // needs librt.a #endif +#define nl '\n' + namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; #if KRAM_WIN @@ -22,41 +31,96 @@ static double queryPeriod() { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); - + // convert from nanos to seconds return 1.0 / double(frequency.QuadPart); }; static uint64_t queryCounter() { + // This doesn't pause when app is paused. + // seems like it wouldn't pause when system is paused either. + // Needed for multi-core, multi-frequency systems. This is + // a fixed rate timer, so frequency can be cached. LARGE_INTEGER counter; QueryPerformanceCounter(&counter); return counter.QuadPart; }; -#elif KRAM_IOS || KRAM_MAC +#elif KRAM_APPLE + static double queryPeriod() { + double period = 1.0; + + /* only needed for the mach calls mach_timebase_info_data_t timebase; mach_timebase_info(&timebase); - + // https://eclecticlight.co/2020/11/27/inside-m1-macs-time-and-logs/ - // On macOS Intel, nanosecondsPerTick are 1ns (1/1) - // On macOS M1, nanosecondsPerTick are 41.67ns (num/denom = 125/3) - double period = (double)timebase.numer / timebase.denom; - period *= 1e-9; // convert to seconds + // On macOS Intel, nanosecondsPerTick are 1ns (1/1) = 1Ghz. + // On macOS M1, nanosecondsPerTick are 41.67ns (num/denom = 125/3) = 24Mhz + // On M2, A16/A17 Pro, and armv8.6-A should be (1/1) = 1Ghz. + // So when 1/1, can avoid mul div below, seconds requires mul by 1e-9. + period = (double)timebase.numer / timebase.denom; + */ + + period *= 1e-9; // convert nanos to seconds + + return period; +} + +static uint64_t queryCounter() +{ + uint64_t time = 0; + time = clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + // Mach absolute time will, in general, continue to count if your process is suspended in the background. + // However, if will stop counting if the CPU goes to sleep. + + // Apple docs recommends these non-posix clock calls. + // They maybe salt these to avoid fingerprinting, but don't need permissions. + // Also they don't need period conversion to nanos. + + // With continuous time, can store one epoch time to convert to real timings. + // But not halting in debugger will skew timings. + // May want timeouts to use the absolute timer. + + // Really each core has different frequencies with P/E, so want a timer + // that is consistent. Also the frequency can ramp up/down. Timers + // like rdtsc drift when a process goes from one core to another. + + // increment when system sleeps + // time = mach_continuous_time(); + + // no increment when system sleeps + //time = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); + //time = mach_absolute_time(); + + // Have gotten burned by these timers, unclear of precision. + // Tracy says these timers are bad, but uses them. 
+ // C++11 has std::chrono::high_resolution_clock::now() in + + return time; +} + +#elif KRAM_LINUX + +static double queryPeriod() +{ + double period = 1e-9; return period; } static uint64_t queryCounter() { - // increment when app sleeps - // return mach_continuous_time(); + timespec ts; + /*int result = */ clock_gettime(CLOCK_MONOTONIC, &ts); + + uint64_t time = (uint64_t)ts.tv_nsec + ((uint64_t)ts.tv_sec * 1000000000ULL); - // no increment when app sleeps - return mach_absolute_time(); + return time; } #endif @@ -70,54 +134,319 @@ double currentTimestamp() return (double)delta * gQueryPeriod; } -} // namespace kram +//------------------- -#else +// TODO: also look into the Perfetto binary format and library/api. +// This needs some daemon to flush data to. Unclear if can route +// existing api and timings over to calls? +// https://perfetto.dev/docs/instrumentation/tracing-sdk -/* -// see sources here -// https://codebrowser.dev/llvm/libcxx/src/chrono.cpp.html -// but steady on macOS uses clock_gettime(CLOCK_MONOTONIC_RAW, &tp) -// which should be mach_continuous_time() -// -// also see sources here for timers -// https://opensource.apple.com/source/Libc/Libc-1158.1.2/gen/clock_gettime.c.auto.html -// mach_continuous_time() vs. mach_absolute_time() -// https://developer.apple.com/library/archive/qa/qa1398/_index.html - -#if USE_EASTL -#include "EASTL/chrono.h" -#else -#include +// TODO: escape strings, but it's just more work +Perf* Perf::_instance = new Perf(); + +thread_local uint32_t gPerfStackDepth = 0; + +PerfScope::PerfScope(const char* name_) + : name(name_), time(currentTimestamp()) +{ + gPerfStackDepth++; + +#if KRAM_ANDROID + // TODO: also ATrace_isEnabled() + ATrace_beginSection(name, value); #endif - -namespace kram { +} -using namespace NAMESPACE_STL; +void PerfScope::close() +{ + if (time != 0.0) { + --gPerfStackDepth; -#if USE_EASTL -using namespace eastl::chrono; +#if KRAM_ANDROID + ATrace_endSection(); +#endif + Perf::instance()->addTimer(name, time, currentTimestamp() - time); + time = 0.0; + } +} + +void addPerfCounter(const char* name, int64_t value) +{ +#if KRAM_ANDROID + // only int64_t support + ATrace_setCounter(name, value); +#endif + + Perf::instance()->addCounter(name, currentTimestamp(), value); +} + +//--------------- + +Perf::Perf() +{ + // TODO: should set alongside exe by default +#if KRAM_WIN + setPerfDirectory("C:/traces/"); #else -using namespace std::chrono; + // sandboxed apps won't reach this, but unsandboxed exe can + setPerfDirectory("/Users/Alec/traces/"); #endif +} -// high-res (defaults to steady or system in libcxx) -//using myclock = high_resolution_clock; -//using myclock = system_clock; -using myclock = steady_clock; +void Perf::setPerfDirectory(const char* directoryName) +{ + _perfDirectory = directoryName; +} -static const myclock::time_point gStartTime = myclock::now(); +static bool useTempFile = false; -double currentTimestamp() +bool Perf::start(const char* name, bool isCompressed, uint32_t maxStackDepth) { - auto t = myclock::now(); - duration timeSpan = t - gStartTime; - double count = (double)timeSpan.count() * 1e-3; - return count; + mylock lock(_mutex); + + if (isRunning()) { + KLOGW("Perf", "start already called"); + return true; + } + + const char* ext = isCompressed ? 
".perftrace.gz" : ".perftrace"; + sprintf(_filename, "%s%s%s", _perfDirectory.c_str(), name, ext); + + _maxStackDepth = maxStackDepth; + + // write json as binary, so win doesn't replace \n with \r\n + if (useTempFile) { + if (!_fileHelper.openTemporaryFile("perf-", ext, "w+b")) { + KLOGW("Perf", "Could not open perf temp file"); + return false; + } + } + else { + if (!_fileHelper.open(_filename.c_str(), "w+b")) { + KLOGW("Perf", "Could not open perf file %s", _filename.c_str()); + return false; + } + } + + if (!_stream.open(&_fileHelper, !isCompressed)) { + _fileHelper.close(); + return false; + } + + // Perf is considered running after this, since _startTime is non-zero + + // TODO: store _startTime in json starting params + _startTime = currentTimestamp(); + + _threadIdToTidMap.clear(); + _threadNames.clear(); + + string buf; + + // displayTimeUnit must be ns (nanos) or ms (micros), default is ms + // "displayTimeUnit": "ns" + // want ms since it's less data if nanos truncated + sprintf(buf, R"({"traceEvents":[%c)", nl); + write(buf); + + // can store file info here, only using one pid + uint32_t processId = 0; + const char* processName = "kram"; // TODO: add platform + config + app? + + sprintf(buf, R"({"name":"process_name","ph":"M","pid":%u,"args":{"name":"%s"}},%c)", + processId, processName, nl); + write(buf); + + return true; } -} // namespace kram -*/ +void Perf::stop() +{ + mylock lock(_mutex); + + if (!isRunning()) { + KLOGW("Perf", "stop called, but never started"); + return; + } + + // write end of array and object, and force flush + bool forceFlush = true; + string buf; + sprintf(buf, R"(]}%c)", nl); + write(buf, forceFlush); + + _stream.close(); + + if (useTempFile) { + bool success = _fileHelper.copyTemporaryFileTo(_filename.c_str()); + if (!success) { + KLOGW("Perf", "Couldn't move temp file"); + } + } + + _fileHelper.close(); + + _startTime = 0.0; +} +void Perf::openPerftrace() +{ + // system call isn't available on iOS + // also macOS sandbox prevents open call (could write and then open script). +#if KRAM_MAC + mylock lock(_mutex); + + // DONE: now open the file in kram-profile by opening it + // okay to use system, but it uses a global mutex on macOS + // Unclear if macOS can send compressed perftrace.gz file without failing + // but uncompressed perftrace file might be openable. + // Also sandbox and hardened runtime may interfere. 
+ + string buf; + sprintf(buf, "open %s", _filename.c_str()); + system(buf.c_str()); #endif +} + +void Perf::write(const string& str, bool forceFlush) +{ + mylock lock(_mutex); + + _buffer += str; + + if (forceFlush || _buffer.size() >= _stream.compressLimit()) { + _stream.compress(Slice((uint8_t*)_buffer.data(), _buffer.size()), forceFlush); + _buffer.clear(); + } +} + +uint32_t Perf::addThreadIfNeeded() +{ + auto threadId = getCurrentThread(); + + // don't need this, it's already locked by caller + //mylock lock(_mutex); + + auto it = _threadIdToTidMap.find(threadId); + if (it != _threadIdToTidMap.end()) { + return it->second; + } + + // add the new name and tid + char threadName[kMaxThreadName]; + getThreadName(threadId, threadName); + // don't really need to store name if not sorting, just need tid counter + uint32_t tid = _threadNames.size(); + _threadNames.push_back(threadName); + + _threadIdToTidMap.insert(make_pair(threadId, tid)); + + // this assumes the map is wiped each time + string buf; + sprintf(buf, R"({"name":"thread_name","ph":"M","tid":%u,"args":{"name":"%s"}},%c)", + tid, threadName, nl); + write(buf); + + return tid; +} + +void Perf::addTimer(const char* name, double time, double elapsed) +{ + if (!isRunning()) { + return; + } + + // About Perfetto ts sorting. This is now fixed to sort duration. + // https://github.com/google/perfetto/issues/878 + + if (_maxStackDepth && gPerfStackDepth >= _maxStackDepth) + return; + + // zero out the time, so times are smaller to store + time -= _startTime; + + // problem with duration is that existing events can overlap the start time + bool isClamped = time < 0.0; + if (isClamped) { + elapsed += time; + time = 0.0; + } + if (elapsed <= 0.0) + return; + + // Catapult timings are suppoed to be in micros. + // Convert seconds to micros (as integer), lose nanos. Note that + // Perfetto will convert all values to nanos anyways and lacks a ms format. + // Raw means nanos, and Seconds is too small of a fraction. + // Also printf does IEEE round to nearest even. + uint32_t timeDigits = 0; // or 3 for nanos + time *= 1e6; + elapsed *= 1e6; + + // TODO: worth aliasing the strings, just replacing one string with another + // but less chars for id. + + // now lock across isRunning, addThread, and write call + mylock lock(_mutex); + if (!isRunning()) { + return; + } + // This requires a lock, so buffering the events would help + // what about sorting the names instead of first-come, first-serve? + uint32_t tid = addThreadIfNeeded(); + + // write out the event in micros, default is displayed in ms + string buf; + sprintf(buf, R"({"name":"%s","ph":"X","tid":%d,"ts":%.*f,"dur":%.*f},%c)", + name, tid, timeDigits, time, timeDigits, elapsed, nl); + write(buf); +} + +// Can also use begin/end but these aren't a atomic +// R"({"name":"%s","ph":"B","tid":%d,"ts":%.0f},%c)", +// R"({"ph":"E","tid":%d,"ts":%.0f},%c)", + +void Perf::addCounter(const char* name, double time, int64_t amount) +{ + if (!isRunning()) { + return; + } + + // also reject nested counters off perf stack depth + if (_maxStackDepth && gPerfStackDepth >= _maxStackDepth) + return; + + // zero out the time, so times are smaller to store + time -= _startTime; + + // problem with duration is that events can occur outside the start time + if (time < 0.0) { + return; + } + + // Catapult timings are supposed to be in micros. + // Convert seconds to micros (as integer), lose nanos. Note that + // Perfetto will convert all values to nanos anyways. 
+ // Raw means nanos, and Seconds is too small of a fraction. + // Also printf does IEEE round to nearest even. + // https://github.com/google/perfetto/issues/879 + + time *= 1e6; + uint32_t timeDigits = 0; // or 3 for nanos + + // TODO: worth aliasing the strings?, just replacing one string with another + // but less chars for id. + + // Note: can also have multiple named values passed in args + // Note: unclear if Perfetto can handle negative values + + // write out the event in micros, default is displayed in ms + // lld not portable to Win + string buf; + sprintf(buf, R"({"name":"%s","ph":"C","ts":%.*f,"args":{"v":%lld}},%c)", + name, timeDigits, time, amount, nl); + write(buf); +} + +} // namespace kram diff --git a/libkram/kram/KramTimer.h b/libkram/kram/KramTimer.h index 326aac76..c2e3bf44 100644 --- a/libkram/kram/KramTimer.h +++ b/libkram/kram/KramTimer.h @@ -1,4 +1,4 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License +// kram - Copyright 2020-2025 by Alec Miller. - MIT License // The license and copyright notice shall be included // in all copies or substantial portions of the Software. @@ -6,6 +6,10 @@ #include +// These are only here for Perf class +#include "KramFileHelper.h" +#include "KramZipStream.h" + //#include "KramConfig.h" namespace kram { @@ -41,12 +45,12 @@ class Timer { } return time; } - + double timeElapsedMillis() const { return timeElapsed() * 1e3; } - + bool isStopped() const { return _timeElapsed < 0.0; } private: @@ -81,4 +85,76 @@ class TimerScope { Timer* _timer = nullptr; }; -} // namespace kram +// This implements PERF macros, sending timing data to kram-profile, perfetto, and/or Tracy. +class Perf { +public: + Perf(); + + void setPerfDirectory(const char* directoryName); + + bool isRunning() const { return _startTime != 0.0; } + + bool start(const char* filename, bool isCompressed = true, uint32_t maxStackDepth = 0); + void stop(); + + void addTimer(const char* name, double time, double elapsed); + void addCounter(const char* name, double time, int64_t value); + + // This may fail on sandboxed app + void openPerftrace(); + + // singleton getter, but really want to split Perf from macros. 
+ static Perf* instance() { return _instance; } + + // on it's own track/tid, add a frame vsync marker + // TODO: void addFrameMarker(double time); + +private: + void write(const string& str, bool forceFlush = false); + uint32_t addThreadIfNeeded(); + + ZipStream _stream; + FileHelper _fileHelper; + double _startTime = 0.0; + string _filename; + string _perfDirectory; + + using mymutex = recursive_mutex; + using mylock = unique_lock; + + mymutex _mutex; + unordered_map _threadIdToTidMap; + vector _threadNames; + string _buffer; + uint32_t _maxStackDepth = 0; // 0 means no limit + + static Perf* _instance; +}; + +class PerfScope { +public: + // This means that the timers are running even when not profiling + PerfScope(const char* name_); + ~PerfScope() { close(); } + + void close(); + +private: + const char* name; + double time; +}; + +// This is here to split off Perf +void addPerfCounter(const char* name, int64_t value); + +#define KPERF_SCOPENAME2(a, b) scope##b +#define KPERF_SCOPENAME(b) KPERF_SCOPENAME2(scope, b) + +#define KPERFT(x) PerfScope KPERF_SCOPENAME(__COUNTER__)(x) + +#define KPERFT_START(num, x) PerfScope KPERF_SCOPENAME(num)(x) +#define KPERFT_STOP(num) KPERF_SCOPENAME(num).close() + +#define KPERFC(x, value) addPerfCounter(x, value) + +} // namespace kram diff --git a/libkram/kram/KramZipHelper.cpp b/libkram/kram/KramZipHelper.cpp index d91b6a3f..d614f1cc 100644 --- a/libkram/kram/KramZipHelper.cpp +++ b/libkram/kram/KramZipHelper.cpp @@ -6,11 +6,12 @@ #include "miniz.h" -// test for perf of this compared to one in miniz also see +// test for perf of this compared to one in miniz also see // comments about faster algs. // libcompress can only encode lvl 5, but here it's only decompress. +// This is failing on various ktx2 files in the mac archive #ifndef USE_LIBCOMPRESSION -#define USE_LIBCOMPRESSION (KRAM_MAC || KRAM_IOS) +#define USE_LIBCOMPRESSION 0 // KRAM_APPLE #endif #if USE_LIBCOMPRESSION @@ -18,7 +19,7 @@ #endif namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; ZipHelper::ZipHelper() { @@ -113,13 +114,13 @@ void ZipHelper::initZipEntryTables() ZipEntry& zipEntry = _zipEntrys[index]; zipEntry.fileIndex = stat.m_file_index; - zipEntry.filename = filename; // can alias + zipEntry.filename = filename; // can alias zipEntry.uncompressedSize = stat.m_uncomp_size; zipEntry.compressedSize = stat.m_comp_size; - zipEntry.modificationDate = (int32_t)stat.m_time; // really a time_t + zipEntry.modificationDate = (int32_t)stat.m_time; // really a time_t #undef crc32 zipEntry.crc32 = stat.m_crc32; - + // TODO: stat.m_time, state.m_crc32 index++; @@ -221,29 +222,29 @@ bool ZipHelper::extract(const ZipEntry& entry, void* buffer, uint64_t bufferSize // https://dougallj.wordpress.com/2022/08/20/faster-zlib-deflate-decompression-on-the-apple-m1-and-x86/ // https://developer.apple.com/documentation/compression/1481000-compression_decode_buffer?language=objc - + // This call is internal, so caller has already tested failure cases. 
- + #if USE_LIBCOMPRESSION const uint8_t* data = mz_zip_reader_get_raw_data(zip.get(), entry.fileIndex); if (!data) { return false; } - // need to extra data and header - + // need to extract data and header + char scratchBuffer[compression_decode_scratch_buffer_size(COMPRESSION_ZLIB)]; + uint64_t bytesDecoded = compression_decode_buffer( (uint8_t*)buffer, entry.uncompressedSize, (const uint8_t*)data, entry.compressedSize, - NULL, // scratch-buffer that could speed up to pass + scratchBuffer, COMPRESSION_ZLIB); - + bool success = false; - if (bytesDecoded == entry.uncompressedSize) - { + if (bytesDecoded == entry.uncompressedSize) { success = true; } #else - + // this pulls pages from mmap, no allocations mz_bool success = mz_zip_reader_extract_to_mem( zip.get(), entry.fileIndex, buffer, bufferSize, 0); @@ -273,7 +274,7 @@ bool ZipHelper::extractRaw(const char* filename, const uint8_t** bufferData, uin } *bufferData = data; - + // This isn't correct, need to return comp_size. // Caller may need the uncompressed size though to decompress fully into. //bufferDataSize = stat.m_uncomp_size; @@ -282,4 +283,4 @@ bool ZipHelper::extractRaw(const char* filename, const uint8_t** bufferData, uin return true; } -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramZipHelper.h b/libkram/kram/KramZipHelper.h index 4e07af35..6f861b82 100644 --- a/libkram/kram/KramZipHelper.h +++ b/libkram/kram/KramZipHelper.h @@ -12,10 +12,10 @@ struct mz_zip_archive; namespace kram { //struct MmapHelper; -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; struct ZipEntry { - const char* filename; // max 512, aliased + const char* filename; // max 512, aliased int32_t fileIndex; // attributes @@ -66,11 +66,11 @@ struct ZipHelper { int32_t zipEntryIndex(const char* name) const; private: - std::unique_ptr zip; + unique_ptr zip; vector _zipEntrys; - const uint8_t* zipData; // aliased + const uint8_t* zipData; // aliased vector allFilenames; }; -} // namespace kram +} // namespace kram diff --git a/libkram/kram/KramZipStream.cpp b/libkram/kram/KramZipStream.cpp new file mode 100644 index 00000000..240695ad --- /dev/null +++ b/libkram/kram/KramZipStream.cpp @@ -0,0 +1,186 @@ +#include "KramZipStream.h" + +#include "KramFileHelper.h" +#include "miniz.h" + +namespace kram { +using namespace STL_NAMESPACE; + +ZipStream::ZipStream() +{ + _stream = make_unique(); +} + +// must be buried due to unique_ptr +ZipStream::~ZipStream() +{ + close(); +} + +bool ZipStream::open(FileHelper* fileHelper, bool isUncompressed) +{ + _fileHelper = fileHelper; + if (!_fileHelper->isOpen()) { + return false; + } + + _isUncompressed = isUncompressed; + if (_isUncompressed) { + return true; + } + + memset(_stream.get(), 0, sizeof(mz_stream)); + + // https://www.zlib.net/zlib_how.html + // https://www.ietf.org/rfc/rfc1952.txt + + // can also install custom allocators (allocates 256KB buffer otherwise) + // _stream->zalloc = NULL; + // _stream->zfree = NULL; + // _stream->opaque = NULL; + // + //Just making this double the default mz_stream buffer. + //Should be able to get about 2x compression (there an estimator in miniz). + //TODO: what if input is bigger than output buffer? + //The larger this number, the bigger the stall to compress. 
+ _compressLimit = 2 * 256 * 1024; + + // TODO: control level + // https://stackoverflow.com/questions/32225133/how-to-use-miniz-to-create-a-compressed-file-that-can-be-decompressd-by-gzip + // turning off zlib footer here with WINDOW_BITS + KVERIFY(mz_deflateInit2(_stream.get(), MZ_DEFAULT_LEVEL, MZ_DEFLATED, -MZ_DEFAULT_WINDOW_BITS, 9, MZ_DEFAULT_STRATEGY) == MZ_OK); + + // These are all optional fields + enum GzipFlag : uint8_t { + kGzipFlagText = 1 << 0, // text 1, or ascii/uknown 0 + kGzipFlagCRC = 1 << 1, // crc16 for header + kGzipFlagExtra = 1 << 2, + kGzipFlagName = 1 << 3, // null terminated filename + kGzipFlagComment = 1 << 4, // null terminated comment + }; + + enum GzipPlatform : uint8_t { + kGzipPlatformFAT = 0, + kGzipPlatformUnix = 3, + kGzipPlatformMac = 7, + kGzipPlatformNT = 11, + kGzipPlatformDefault = 255, + }; + + // for deflate, but seem of little utility + enum GzipCompression : uint8_t { + kGzipCompressionUnknown = 0, + kGzipCompressionSmallest = 2, + kGzipCompressionFastest = 4, + }; + + // gzip 10B header + const uint8_t header[10] = { + 0x1f, 0x8b, + 0x08, // (compression method - deflate) + 0x00, // flags + // The time is in Unix format, i.e., seconds since 00:00:00 GMT, Jan. 1, 1970. + //0x00, 0x00, 0x00, 0x00, // TODO: timestamp mtime - start of compression or of src file + 0xAD, 0x38, 0x4D, 0x5E, // stolen from another file + + kGzipCompressionUnknown, // compression id + kGzipPlatformUnix // os platform id + }; + + // Not writing any of the flagged fields. + + // clear the data + _sourceCRC32 = MZ_CRC32_INIT; // is 0 + _sourceSize = 0; + + bool success = _fileHelper->write((const uint8_t*)&header, sizeof(header)); + if (!success) { + KLOGE("ZipStream", "Could not write gzip header to %s", _fileHelper->filename().c_str()); + } + + return success; + + // zlib is slightly different than gzip format (11B overhead) + // Could transfer zip crc32 and content over into a gzip file, + // but typical use case is that starting with uncompressed data. +} + +void ZipStream::close() +{ + // this means it was already closed + if (!_fileHelper) { + return; + } + + if (_isUncompressed) { + return; + } + + // do this to end the stream and cleanup + KVERIFY(mz_deflateEnd(_stream.get()) == MZ_OK); + + // can also reset and then reuse the stream, instead of end? + //mz_deflateReset(_stream.get()); + + // data is already all written, so just need the footer + + const uint32_t footer[2] = { + _sourceCRC32, + (uint32_t)(_sourceSize & 0xFFFFFFFF)}; + + // gzip 8B trailer + // 4b crc checksum of original data (can use mz_crc32()) + // 4b length of data (mod 0xFFFFFFFF), if bigger than 4gb then can only validate bottom 4B of length. + bool success = _fileHelper->write((const uint8_t*)&footer, sizeof(footer)); + if (!success) { + KLOGE("ZipStream", "Could not write gzip footer to %s", _fileHelper->filename().c_str()); + } + + _fileHelper = nullptr; +} + +Slice ZipStream::compressSlice(const Slice& in, bool finish) +{ + // If in.size is huge, then don't resize like this. + // But stream is assumed to take in smaller buffers + // and know compressed stream is smaller than input size + _compressed.resize(in.size()); + + _stream->avail_in = in.size(); + _stream->next_in = in.data(); + + // Have to set these up, since array may have grown + _stream->avail_out = _compressed.size(); + _stream->next_out = _compressed.data(); + + // Hope don't need to do this in a loop + int status = mz_deflate(_stream.get(), finish ? 
MZ_FINISH : MZ_SYNC_FLUSH); + if (finish) + KASSERT(status == MZ_STREAM_END); + else + KASSERT(status == MZ_OK); + (void)status; + + // TODO: would be nice to skip crc32 work + _sourceSize += in.size(); + _sourceCRC32 = mz_crc32(_sourceCRC32, in.data(), in.size()); + + // return the compressed output + int numBytesCompressed = _compressed.size() - _stream->avail_out; + return Slice(_compressed.data(), numBytesCompressed); +} + +void ZipStream::compress(const Slice& uncompressedData, bool finish) +{ + if (_isUncompressed) { + _fileHelper->write(uncompressedData.data(), uncompressedData.size()); + return; + } + + Slice compressedSlice = compressSlice(uncompressedData, finish); + + // This writes out to a fileHelper + _fileHelper->write(compressedSlice.data(), compressedSlice.size()); +} + +} // namespace kram diff --git a/libkram/kram/KramZipStream.h b/libkram/kram/KramZipStream.h new file mode 100644 index 00000000..bf920074 --- /dev/null +++ b/libkram/kram/KramZipStream.h @@ -0,0 +1,66 @@ +#pragma once + +#include + +#include "KramConfig.h" + +struct mz_stream; + +namespace kram { +using namespace STL_NAMESPACE; + +class FileHelper; + +// This can be passed a count +template +using Span = span; +using Slice = Span; + +// Compressed stream interface. +// Might have gzip, zlib, zip file support +class ICompressedStream { +public: + virtual ~ICompressedStream() {} + + // compress and store the data + virtual void compress(const Slice& uncompressedData, bool finish) = 0; + + // when reached then call compress + virtual uint32_t compressLimit() const = 0; +}; + +// Compress content into a gzip (.gz) file using deflate. +// The bytes are then written out to a provided FileHelper. +class ZipStream : public ICompressedStream { +public: + ZipStream(); + virtual ~ZipStream(); + + // writes opening header and closing footer + // Can disable compression for testing the src content. 
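// A hedged usage sketch of this class (file name and buffers are placeholders);
// it is the same pattern Perf uses in KramTimer.cpp:
//   FileHelper file;
//   file.open("capture.perftrace.gz", "w+b");
//   ZipStream stream;
//   stream.open(&file);                            // writes the 10-byte gzip header
//   stream.compress(Slice(data, dataSize), false); // deflate + write, more to come
//   stream.compress(Slice(tail, tailSize), true);  // finish=true ends the deflate stream
//   stream.close();                                // writes the CRC32 + size footer
//   file.close();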
+ bool open(FileHelper* fileHelper, bool isUncompressed = false); + void close(); + + // compress and write data to helper + virtual void compress(const Slice& uncompressedData, bool finish) override; + + // test this for when to call compress + virtual uint32_t compressLimit() const override + { + return _compressLimit; + } + +private: + Slice compressSlice(const Slice& in, bool finish); + + vector _compressed; + unique_ptr _stream; + FileHelper* _fileHelper = nullptr; + + uint32_t _sourceCRC32 = 0; + size_t _sourceSize = 0; + uint32_t _compressLimit = 0; + bool _isUncompressed = false; +}; + +} // namespace kram diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index ef2628f2..05c07f9a 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -1,61 +1,64 @@ #include "TaskSystem.h" -#if KRAM_MAC - // affinity - #include - #include - - #include - #include - #include -#elif KRAM_IOS - #include - #include +// Stop using this, so can have unified kram lib +//#if KRAM_MAC +// affinity +//#include +//#include +//#endif + +#if KRAM_APPLE +#include +#include +#include #elif KRAM_WIN - #include - #include +// annoying that windows.h has to be ordered first +// clang-format off +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif +#include +// clang-format off +#include #elif KRAM_ANDROID - #include +#include #else - #include +#include #endif // TODO: look at replacing this with Job Queue from Filament namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; -enum class CoreType : uint8_t -{ +enum class CoreType : uint8_t { Little, // Medium, Big, }; -struct CoreNum -{ +struct CoreNum { uint8_t index; -//#if KRAM_WIN -// uint8_t group; // for Win only -//#endif + //#if KRAM_WIN + // uint8_t group; // for Win only + //#endif CoreType type; }; -struct CoreInfo -{ +struct CoreInfo { // hyperthreading can result in logical = 2x physical cores (1.5x on Alderlake) uint32_t logicalCoreCount; uint32_t physicalCoreCount; - + // ARM is has big-little and big-medium-little, no HT, 2/4, 4/4, 6/2, 8/2. // Intel x64 AlderLake has big-little. 24 threads (8x2HT/8) uint32_t bigCoreCount; uint32_t littleCoreCount; - + // x64 under Rosetta2 on M1 Arm chip, no AVX only SSE 4.2 uint32_t isTranslated; uint32_t isHyperthreaded; - + vector remapTable; }; @@ -63,14 +66,13 @@ struct CoreInfo // Helper function to count set bits in the processor mask. static DWORD CountSetBits(ULONG_PTR bitMask) { - DWORD LSHIFT = sizeof(ULONG_PTR)*8 - 1; + DWORD LSHIFT = sizeof(ULONG_PTR) * 8 - 1; DWORD bitSetCount = 0; ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT; DWORD i; - - for (i = 0; i <= LSHIFT; ++i) - { - bitSetCount += ((bitMask & bitTest)?1:0); + + for (i = 0; i <= LSHIFT; ++i) { + bitSetCount += ((bitMask & bitTest) ? 
1 : 0); bitTest /= 2; } @@ -87,14 +89,14 @@ static const CoreInfo& GetCoreInfo() // this includes hyperthreads coreInfo.logicalCoreCount = std::thread::hardware_concurrency(); coreInfo.physicalCoreCount = coreInfo.logicalCoreCount; - - #if KRAM_IOS || KRAM_MAC + +#if KRAM_APPLE // get big/little core counts // use sysctl -a from command line to see all size_t size = sizeof(coreInfo.bigCoreCount); - + uint32_t perfLevelCount = 0; - + // only big-little core counts on macOS12/iOS15 sysctlbyname("hw.nperflevels", &perfLevelCount, &size, nullptr, 0); if (perfLevelCount > 0) { @@ -106,10 +108,10 @@ static const CoreInfo& GetCoreInfo() // can't identify little cores sysctlbyname("hw.perflevel0.physicalcpu", &coreInfo.bigCoreCount, &size, nullptr, 0); } - + // may not work on A10 2/2 exclusive coreInfo.physicalCoreCount = std::min(coreInfo.bigCoreCount + coreInfo.littleCoreCount, coreInfo.physicalCoreCount); - + // no affinity, so core order here doesn't really matter. for (uint32_t i = 0; i < coreInfo.bigCoreCount; ++i) { coreInfo.remapTable.push_back({(uint8_t)i, CoreType::Big}); @@ -117,43 +119,44 @@ static const CoreInfo& GetCoreInfo() for (uint32_t i = 0; i < coreInfo.littleCoreCount; ++i) { coreInfo.remapTable.push_back({(uint8_t)(i + coreInfo.bigCoreCount), CoreType::Little}); } - + coreInfo.isHyperthreaded = coreInfo.logicalCoreCount != coreInfo.physicalCoreCount; - - #if KRAM_MAC + +#if KRAM_MAC // Call the sysctl and if successful return the result sysctlbyname("sysctl.proc_translated", &coreInfo.isTranslated, &size, NULL, 0); - #endif - - #elif KRAM_WIN - +#endif + +#elif KRAM_WIN + // have to walk array of data, and assemble this info, ugh // https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformation - + // https://docs.microsoft.com/en-us/windows/win32/procthread/multiple-processors - + // Really need to use _EX version to get at numa groups // but it doesn't have same bitmask for logical cores. // Revisit when really building app on Win, but it just // broke the build too many times. - + DWORD logicalCoreCount = 0; DWORD physicalCoreCount = 0; bool isHyperthreaded = false; - + using ProcInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION; using ProcInfoPtr = PSYSTEM_LOGICAL_PROCESSOR_INFORMATION; - + DWORD returnLength = 0; - DWORD rc = GetLogicalProcessorInformation((ProcInfoPtr)nullptr, &returnLength); - + DWORD rc = GetLogicalProcessorInformation((ProcInfoPtr) nullptr, &returnLength); + vector buffer; buffer.resize(returnLength); rc = GetLogicalProcessorInformation((ProcInfoPtr)buffer.data(), &returnLength); - + (void)rc; // unused + ProcInfoPtr ptr = nullptr; DWORD byteOffset = 0; - + // walk the array ptr = (ProcInfoPtr)buffer.data(); byteOffset = 0; @@ -166,15 +169,17 @@ static const CoreInfo& GetCoreInfo() } break; } + default: + break; } - + if (isHyperthreaded) break; - + byteOffset += sizeof(ProcInfo); ptr++; } - + ptr = (ProcInfoPtr)buffer.data(); byteOffset = 0; uint32_t coreNumber = 0; @@ -182,7 +187,7 @@ static const CoreInfo& GetCoreInfo() switch (ptr->Relationship) { case RelationProcessorCore: { physicalCoreCount++; - + // A hyperthreaded core supplies more than one logical processor. // Can identify AlderLake big vs. little off this uint32_t logicalCores = CountSetBits(ptr->ProcessorMask); @@ -194,24 +199,27 @@ static const CoreInfo& GetCoreInfo() coreInfo.littleCoreCount++; coreInfo.remapTable.push_back({(uint8_t)coreNumber, CoreType::Little}); } - + // Is this the correct index for physical cores? 
// Always go through remap table coreNumber += logicalCores; - + logicalCoreCount += logicalCores; break; } + default: + break; } byteOffset += sizeof(ProcInfo); ptr++; } - - + + (void)logicalCoreCount; // unused + coreInfo.isHyperthreaded = isHyperthreaded; coreInfo.physicalCoreCount = physicalCoreCount; - - #elif KRAM_ANDROID + +#elif KRAM_ANDROID // TODO: have to walk array of proc/cpuinfo, and assemble this info, ugh // then build a core remap table since big core are typically last, little early @@ -219,18 +227,18 @@ static const CoreInfo& GetCoreInfo() // JDK and NDK version of library with workarounds // https://github.com/google/cpu_features - + // hack - assume all big cores, typical 1/3/4 or 2/2/4 coreInfo.bigCoreCount = coreInfo.physicalCoreCount; - - for (int32_t i = coreInfo.bigCoreCount-1; i >= 0; --i) { + + for (int32_t i = coreInfo.bigCoreCount - 1; i >= 0; --i) { coreInfo.remapTable.push_back({(uint8_t)i, CoreType::Big}); } - - #endif - + +#endif + // sort faster cores first in the remap table - std::sort(coreInfo.remapTable.begin(), coreInfo.remapTable.end(), [](const CoreNum& lhs, const CoreNum& rhs){ + std::sort(coreInfo.remapTable.begin(), coreInfo.remapTable.end(), [](const CoreNum& lhs, const CoreNum& rhs) { #if KRAM_ANDROID // sort largest index if (lhs.type == rhs.type) @@ -242,10 +250,8 @@ static const CoreInfo& GetCoreInfo() return lhs.index < rhs.index; return lhs.type > rhs.type; #endif - - }); - + return coreInfo; } @@ -266,7 +272,7 @@ std::thread::native_handle_type getCurrentThread() // Of course, Windows has to make portability difficult. // And Mac non-standardly, doesn't even pass thread to call. // This requires it to be set from thread itself. -constexpr const uint32_t kMaxThreadName = 32; +// Also linux (and Android?) limited to 15chars. #if KRAM_WIN @@ -274,11 +280,13 @@ constexpr const uint32_t kMaxThreadName = 32; // Can just set in manifest file. // SetConsoleOutputCP(CP_UTF8); -void setThreadName(std::thread::native_handle_type handle, const char* threadName) +void setCurrentThreadName(const char* threadName) { + std::thread::native_handle_type handle = getCurrentThread(); + // TODO: use std::wstring_convert(); // std::codecvt_utf8_utf16 - + // ugh, win still using char16_t. 
TODO: this isn't utf8 to utf16 conversion uint32_t len = strlen(threadName); std::wstring str; @@ -287,26 +295,16 @@ void setThreadName(std::thread::native_handle_type handle, const char* threadNam if (threadName[i] <= 127) str.push_back((char)threadName[i]); } - - ::SetThreadDescription(handle, str.c_str()); -} -void setCurrentThreadName(const char* threadName) -{ - setThreadName(getCurrentThread(), threadName); -} - -void setThreadName(std::thread& thread, const char* threadName) -{ - setThreadName(thread.native_handle(), threadName); + ::SetThreadDescription(handle, str.c_str()); } -void getCurrentThreadName(char name[kMaxThreadName]) +void getThreadName(std::thread::native_handle_type threadHandle, char name[kMaxThreadName]) { name[0] = 0; - + wchar_t* threadNameW = nullptr; - HRESULT hr = ::GetThreadDescription(getCurrentThread(), &threadNameW); + HRESULT hr = ::GetThreadDescription(threadHandle, &threadNameW); if (SUCCEEDED(hr)) { // convert name back uint32_t len = wcslen(threadNameW); @@ -315,134 +313,148 @@ void getCurrentThreadName(char name[kMaxThreadName]) for (uint32_t i = 0; i < len; ++i) { name[i] = (char)threadNameW[i]; } - name[kMaxThreadName-1] = 0; - + name[kMaxThreadName - 1] = 0; + LocalFree(threadNameW); } } -#elif KRAM_MAC || KRAM_IOS +#elif KRAM_APPLE -void setThreadName(std::thread::native_handle_type macroUnusedArg(handle), const char* threadName) +void setCurrentThreadName(const char* threadName) { - // This can only set on self + // 64-char limit + // can only set thread from thread on macOS, sucks int val = pthread_setname_np(threadName); if (val != 0) KLOGW("Thread", "Could not set thread name"); } -void setCurrentThreadName(const char* threadName) +void getThreadName(std::thread::native_handle_type threadHandle, char name[kMaxThreadName]) { - setThreadName(getCurrentThread(), threadName); + pthread_getname_np(threadHandle, name, kMaxThreadName); } -// This doesn't exist on macOS. What a pain. Doesn't line up with getter calls. -// Means can't set threadName externally without telling thread to wake and set itself. -//void setThreadName(std::thread& thread, const char* threadName) -//{ -// auto handle = thread.native_handle(); -// setThreadName(handle, threadName); -//} +#elif KRAM_LINUX || KRAM_ANDROID -void getCurrentThreadName(char name[kMaxThreadName]) +void setCurrentThreadName(const char* threadName) { - pthread_getname_np(getCurrentThread(), name, kMaxThreadName); -} -#else + // 15 char name limit on Linux/Android, how modern! + // Call fails if longer, so have to truncate + char threadName16[16]; + strlcpy(threadName16, threadName, 16); + + int val = pthread_setname_np(getCurrentThread(), threadName16); -// 15 char name limit on Linux/Android, how modern! 
-void setThreadName(std::thread::native_handle_type handle, const char* threadName) -{ - int val = pthread_setname_np(handle, threadName); if (val != 0) KLOGW("Thread", "Could not set thread name"); } -void setCurrentThreadName(const char* threadName) +void getThreadName(std::thread::native_handle_type threadHandle, char name[kMaxThreadName]) { - setThreadName(getCurrentThread(), threadName); + pthread_getname_np(threadHandle, name, kMaxThreadName); } -void setThreadName(std::thread& thread, const char* threadName) +#endif + +void getThreadName(std::thread& thread, char name[kMaxThreadName]) { - setThreadName(thread.native_handle(), threadName); + getThreadName(thread.native_handle(), name); } void getCurrentThreadName(char name[kMaxThreadName]) { - pthread_getname_np(getCurrentThread(), name, kMaxThreadName); + getThreadName(getCurrentThread(), name); } -#endif - //------------------ -#if KRAM_MAC || KRAM_IOS +#if KRAM_APPLE -static void setThreadPriority(std::thread::native_handle_type handle, ThreadPriority priority) +static void setThreadPriority(std::thread::native_handle_type macroUnusedArg(handle), ThreadPriority priority) { - if (priority == ThreadPriority::Default) { - - /* samples of qos - qos_class_t qos = QOS_CLASS_UNSPECIFIED; - switch(level) { - case ThreadQos::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; - case ThreadQos::High: qos = QOS_CLASS_USER_INITIATED; break; - case ThreadQos::Default: qos = QOS_CLASS_DEFAULT; break; - case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; - case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; - } - */ - - // qos is transferred to GCD jobs, and can experience thread depriority - // can system can try to adjust priority inversion. - - // note here the priorityOffset = 0, but is negative offsets - // there is a narrow range of offsets - - // note this is a start/end overide call, but can set override on existing thread - // TODO: this returns a newly allocated object which isn't released here - // need to release with pthread_override_qos_class_end_np(override); - - qos_class_t qos = QOS_CLASS_DEFAULT; - auto val = pthread_override_qos_class_start_np(handle, qos, 0); - if (val != nullptr) - KLOGW("Thread", "Failed to set qos %d", (int)qos); + // Note: this only works on current thread. + // Xcode displays qos in the thread view, but doesn't display priority. + // Don't mix qos and priority. qos locks out setting prioririty, scheduler. + + qos_class_t qos = QOS_CLASS_UNSPECIFIED; + switch(priority) { + case ThreadPriority::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; + case ThreadPriority::High: qos = QOS_CLASS_USER_INITIATED; break; + case ThreadPriority::Default: qos = QOS_CLASS_DEFAULT; break; + + // TODO: more qOS to choose from + //case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; + //case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; } - else { - int prioritySys = 0; - switch(priority) { - case ThreadPriority::Default: prioritySys = 30; break; // skipped above - case ThreadPriority::High: prioritySys = 41; break; - case ThreadPriority::Interactive: prioritySys = 45; break; - } - - struct sched_param param = { prioritySys }; - - // policy choices - // SCHED_RR, SCHED_FIFO, SCHED_OTHER - int policy = SCHED_RR; - - // this sets policy to round-robin and priority - int val = pthread_setschedparam(handle, policy, ¶m); - if (val != 0) - KLOGW("Thread", "Failed to set policy %d priority %d", policy, prioritySys); + + // Note here the priorityOffset = 0, but is negative offsets. 
+ // There is a narrow range of offsets per qos. + // QOS_CLASS_USER_INTERACTIVE: 38-47 -9 + // QOS_CLASS_USER_INITIATED: 32-37 -5 + // QOS_CLASS_DEFAULT: 21-31 -10 + // QOS_CLASS_UTILITY: 5-20 -15 + // QOS_CLASS_BACKGROUND: 0-4 -4 + + int relativePriority = 0; + auto val = pthread_set_qos_class_self_np(qos, relativePriority); + if (val != 0) + KLOGW("Thread", "Failed to set qos %d", (int)qos); +} + +#elif KRAM_LINUX +static void setThreadPriority(std::thread::native_handle_type handle, ThreadPriority priority) +{ + // policy choices + // SCHED_RR, SCHED_FIFO, SCHED_OTHER + int policy = SCHED_RR; + + int prioritySys = 0; + switch (priority) { + case ThreadPriority::Default: + prioritySys = 0; + policy = SCHED_OTHER; + break; // skipped above + case ThreadPriority::High: + prioritySys = -4; + policy = SCHED_RR; + break; + case ThreadPriority::Interactive: + prioritySys = -8; + policy = SCHED_RR; + break; } + + struct sched_param param = {prioritySys}; + + // this sets policy to round-robin and priority + int val = pthread_setschedparam(handle, policy, ¶m); + if (val != 0) + KLOGW("Thread", "Failed to set policy %d priority %d", policy, prioritySys); } + #elif KRAM_ANDROID -static void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) +static void setThreadPriority(std::thread::native_handle_type macroUnusedArg(handle), ThreadPriority priority) { + // This only works on current thread. + // This doesn't change policy. // Android on -20 to 20, where lower is higher priority int prioritySys = 0; - switch(priority) { - case ThreadPriority::Default: prioritySys = 0; break; // NORMAL - case ThreadPriority::High: prioritySys = -4; break; // ABOVE NORMAL - case ThreadPriority::Interactive: prioritySys = -8; break; // HIGHEST + switch (priority) { + case ThreadPriority::Default: + prioritySys = 0; + break; // NORMAL + case ThreadPriority::High: + prioritySys = -4; + break; // ABOVE NORMAL + case ThreadPriority::Interactive: + prioritySys = -8; + break; // HIGHEST } - + int val = setpriority(PRIO_PROCESS, 0, prioritySys); if (val != 0) KLOGW("Thread", "Failed to set priority %d", prioritySys); @@ -455,12 +467,18 @@ static void setThreadPriority(std::thread::native_handle_type handle, ThreadPrio // This doesn't change policy. 
// Win has 0 to 15 normal, then 16-31 real time priority int prioritySys = 0; - switch(priority) { - case ThreadPriority::Default: prioritySys = 0; break; // NORMAL - case ThreadPriority::High: prioritySys = 1; break; // ABOVE NORMAL - case ThreadPriority::Interactive: prioritySys = 2; break; // HIGHEST + switch (priority) { + case ThreadPriority::Default: + prioritySys = 0; + break; // NORMAL + case ThreadPriority::High: + prioritySys = 1; + break; // ABOVE NORMAL + case ThreadPriority::Interactive: + prioritySys = 2; + break; // HIGHEST } - + BOOL success = SetThreadPriority(handle, prioritySys); if (!success) KLOGW("Thread", "Failed to set priority %d", prioritySys); @@ -481,42 +499,42 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ // const auto& coreInfo = GetCoreInfo(); - + uint32_t maxIndex = coreInfo.remapTable.size() - 1; if (threadIndex > maxIndex) threadIndex = maxIndex; - + threadIndex = coreInfo.remapTable[threadIndex].index; - + // for now only allow single core mask uint64_t affinityMask = ((uint64_t)1) << threadIndex; - + // These are used in most of the paths macroUnusedVar(handle); macroUnusedVar(affinityMask); - + bool success = false; - -#if KRAM_MAC || KRAM_IOS + +#if KRAM_APPLE // no support, don't use thread_policy_set it's not on M1 and just a hint success = true; - + #elif KRAM_ANDROID cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(threadIndex, &cpuset); - + // convert pthread to pid pid_t pid; pthread_getunique_np(handle, &pid); success = sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset) == 0; - + #elif KRAM_WIN // each processor group only has 64 bits DWORD_PTR mask = SetThreadAffinityMask(handle, *(const DWORD_PTR*)&affinityMask); success = mask != 0; - -#else + +#elif KRAM_LINUX // most systems are pthread-based, this is represented with array of bits cpu_set_t cpuset; CPU_ZERO(&cpuset); @@ -534,7 +552,6 @@ void task_system::set_current_affinity(uint32_t threadIndex) setThreadAffinity(getCurrentThread(), threadIndex); } - #endif void task_system::run(int32_t threadIndex) @@ -548,7 +565,7 @@ void task_system::run(int32_t threadIndex) // Note that if threadIndex queue is empty and stays empty // then pop() below will stop using that thread. But async_ is round-robining // all work across the available queues. - int32_t multiple = 4; // 32; + int32_t multiple = 4; // 32; int32_t numTries = 0; for (int32_t n = 0, nEnd = _count * multiple; n < nEnd; ++n) { numTries++; @@ -585,47 +602,46 @@ void task_system::run(int32_t threadIndex) } // This only works for current thread, but simplifies setting several thread params. 
-void setThreadInfo(ThreadInfo& info) {
+void setThreadInfo(ThreadInfo& info)
+{
     setCurrentThreadName(info.name);
     setThreadPriority(getCurrentThread(), info.priority);
-
-    #if SUPPORT_AFFINITY
+
+#if SUPPORT_AFFINITY
     setThreadAffinity(getCurrentThread(), info.affinity);
-    #endif
+#endif
 }

-task_system::task_system(int32_t count) :
-    _count(std::min(count, (int32_t)GetCoreInfo().physicalCoreCount)),
-    _q{(size_t)_count},
-    _index(0)
+task_system::task_system(int32_t count) : _count(std::min(count, (int32_t)GetCoreInfo().physicalCoreCount)),
+                                           _q{(size_t)_count},
+                                           _index(0)
 {
     // see WWDC 2021 presentation here
     // Tune CPU job scheduling for Apple silicon games
     // https://developer.apple.com/videos/play/tech-talks/110147/
-    ThreadInfo infoMain = { "Main", ThreadPriority::Interactive, 0 };
+    ThreadInfo infoMain = {"Scheduler", ThreadPriority::Interactive, 0};
     setThreadInfo(infoMain);
-
+
     // Note that running work on core0 when core0 may starve it
     // from assigning work to threads.
-
+
     // start up the threads
     string name;
     for (int32_t threadIndex = 0; threadIndex != _count; ++threadIndex) {
-
         // Generate a name, also corresponds to core for affinity
         // May want to include priority too.
         sprintf(name, "Task%d", threadIndex);
         _threadNames.push_back(name);
-
+
         _threads.emplace_back([&, threadIndex, name] {
-            ThreadInfo infoTask = { name.c_str(), ThreadPriority::High, threadIndex };
+            ThreadInfo infoTask = {name.c_str(), ThreadPriority::High, threadIndex};
             setThreadInfo(infoTask);
             run(threadIndex);
         });
     }
-
+
     // dump out thread data
     log_threads();
 }
@@ -633,75 +649,87 @@ task_system::task_system(int32_t count) :
 static ThreadPriority getThreadPriority(std::thread::native_handle_type handle)
 {
     ThreadPriority priority = ThreadPriority::Default;
+
+#if KRAM_APPLE
+    qos_class_t qos = QOS_CLASS_UNSPECIFIED;
+    int relativePriority = 0;
+
+    pthread_get_qos_class_np(handle, &qos, &relativePriority);
-#if KRAM_MAC || KRAM_IOS || KRAM_ANDROID
-    // Note: this doesn't handle qOS, and returns default priority
-    // on those threads.
+    switch(qos) {
+        case QOS_CLASS_USER_INTERACTIVE: priority = ThreadPriority::Interactive; break;
+        case QOS_CLASS_USER_INITIATED: priority = ThreadPriority::High; break;
+        case QOS_CLASS_DEFAULT:
+        case QOS_CLASS_UTILITY:
+        case QOS_CLASS_BACKGROUND:
+        default:
+            priority = ThreadPriority::Default;
+            break;
+    }
+#elif KRAM_ANDROID || KRAM_LINUX
+    int policy = 0;
     struct sched_param priorityVal;
     int val = pthread_getschedparam(handle, &policy, &priorityVal);
     if (val != 0)
         KLOGW("Thread", "failed to retrieve thread data");
     int prioritySys = priorityVal.sched_priority;
-
+
     // remap back to enum
-    switch(prioritySys) {
-        case 41: priority = ThreadPriority::High; break;
-        case 45: priority = ThreadPriority::Interactive; break;
-        default: priority = ThreadPriority::Default; break;
+    switch (prioritySys) {
+        case -8:
+            priority = ThreadPriority::Interactive;
+            break;
+        case -4:
+            priority = ThreadPriority::High;
+            break;
+        default:
+            priority = ThreadPriority::Default;
+            break;
     }
-/* Using code above since it may work with other threads
-#elif KRAM_ANDROID
-    // Note: only for current thread
-
-    // only have getpriority call on current thread
-    // pthread_getschedparam never returns valid data
-    int priority = getpriority(PRIO_PROCESS, 0);
-    switch(prioritySys) {
-        case 41: priority = ThreadPriority::High; break;
-        case 45: priority = ThreadPriority::Interactive; break;
-        default: priority = ThreadPriority::Default; break;
-    }
-*/
 #elif KRAM_WIN
     // all threads same policy on Win?
// https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Windows%20never%20adjusts%20the%20priority,the%20process%20that%20created%20it. - + // scheduling based on process priority class, thread priority is +/- offset // DWORD priorityClass = GetPriorityClass(GetCurrentProcess()); - + // The handle must have the THREAD_QUERY_INFORMATION or THREAD_QUERY_LIMITED_INFORMATION access right. int prioritySys = GetThreadPriority(handle); if (prioritySys == THREAD_PRIORITY_ERROR_RETURN) prioritySys = 0; - - switch(prioritySys) { - case 1: priority = ThreadPriority::High; break; - case 2: priority = ThreadPriority::Interactive; break; - default: priority = ThreadPriority::Default; break; + + switch (prioritySys) { + case 1: + priority = ThreadPriority::High; + break; + case 2: + priority = ThreadPriority::Interactive; + break; + default: + priority = ThreadPriority::Default; + break; } #endif - + return priority; } - void task_system::log_threads() { ThreadInfo info = {}; - info.name = "Main"; + info.name = "Scheduler"; #if SUPPORT_AFFINITY info.affinity = 0; #endif - + info.priority = getThreadPriority(getCurrentThread()); KLOGI("Thread", "Thread:%s (pri:%d aff:%d)", info.name, info.priority, info.affinity); - - for (uint32_t i = 0; i < _threads.size(); ++i) - { + + for (uint32_t i = 0; i < _threads.size(); ++i) { info.name = _threadNames[i].c_str(); #if SUPPORT_AFFINITY // TODO: if more tasks/threads than cores, then this isn't accurate @@ -725,8 +753,7 @@ task_system::~task_system() e.join(); } -} - +} //namespace kram /**************************************************************************************************/ diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index a4118afd..6aa6b3a7 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -1,7 +1,7 @@ /* Copyright 2015 Adobe Systems Incorporated Distributed under the MIT License (see license at http://stlab.adobe.com/licenses.html) - + This file is intended as example code and is not production quality. */ @@ -15,18 +15,16 @@ // TODO: get these three out of header, they pull in basic_string via system_errror header // but this file isn't included in many places. 
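Before the header changes, a short usage sketch (not from the patch) of how a caller drives the task_system described above:

#include "TaskSystem.h"

using namespace kram;

void submitWork()
{
    // The ctor clamps the worker count to the physical core count and names
    // the workers "Task0", "Task1", ... at High priority.
    task_system tasks(4);

    for (int32_t i = 0; i < 16; ++i) {
        // async_ round-robins work across the per-thread queues; idle workers
        // block on their queue's condition variable instead of spinning.
        tasks.async_([i] {
            // work for item i runs on one of the Task threads
        });
    }

    // ~task_system() marks the queues done and joins the workers.
}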
-#include #include +#include #include //#include - - /**************************************************************************************************/ namespace kram { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; /**************************************************************************************************/ @@ -34,6 +32,7 @@ using mymutex = std::recursive_mutex; using mylock = std::unique_lock; using mycondition = std::condition_variable_any; +// TOOO: can also have a priority_queue #define mydeque deque #define myfunction function @@ -59,7 +58,7 @@ class notification_queue { { mylock lock{_mutex}; while (_q.empty() && !_done) { - _ready.wait(lock); // this is what blocks a given thread to avoid spin loop + _ready.wait(lock); // this is what blocks a given thread to avoid spin loop } // handle done state @@ -92,7 +91,8 @@ class notification_queue { { { mylock lock{_mutex}; - // TODO: fix this construct, it's saying no matching sctor for mydeque>>::value_type + // TODO: fix this construct, eastl is saying no matching sctor for + // mydeque>>::value_type #if USE_EASTL KLOGE("TaskSystem", "Fix eastl deque or function"); //_q.emplace_back(forward(f)); @@ -107,7 +107,7 @@ class notification_queue { // has queue been marked done or not bool is_done() const { - mylock lock{const_cast(_mutex)}; // ugh + mylock lock{const_cast(_mutex)}; // ugh bool done_ = _done; return done_; } @@ -132,12 +132,10 @@ class notification_queue { // Note: if running multiple processes on the same cpu, then affinity // isn't ideal. It will force work onto the same cores. Especially if // limiting cores to say 4/16, then can run 4 processes faster w/o affinity. -#define SUPPORT_AFFINITY (KRAM_ANDROID || KRAM_WIN) - +#define SUPPORT_AFFINITY (KRAM_ANDROID || KRAM_WIN || KRAM_LINUX) // only for ioS/macOS -enum class ThreadPriority -{ +enum class ThreadPriority { //Low = 1, //Medium = 2, Default = 3, @@ -151,19 +149,32 @@ struct ThreadInfo { int affinity = 0; // single core for now }; +std::thread::native_handle_type getCurrentThread(); + // This only works for current thread, but simplifies setting several thread params. void setThreadInfo(ThreadInfo& info); - +// This is limited to 16 on linux +// #define TASK_COMM_LEN 16 +constexpr const uint32_t kMaxThreadName = 32; + +void setCurrentThreadName(const char* threadName); + +void getThreadName(std::thread::native_handle_type threadHandle, char name[kMaxThreadName]); + +void getThreadName(std::thread& thread, char name[kMaxThreadName]); + +void getCurrentThreadName(char name[kMaxThreadName]); + class task_system { NOT_COPYABLE(task_system); const int32_t _count; vector _threads; - + // want to store with thread itself, but no field. Also have affinity, priority data. 
vector _threadNames; - + // currently one queue to each thread, but can steal from other queues vector _q; std::atomic _index; @@ -173,17 +184,17 @@ class task_system { #if SUPPORT_AFFINITY static void set_current_affinity(uint32_t threadIndex); #endif - + static void set_current_priority(ThreadPriority priority); - + void log_threads(); - + public: task_system(int32_t count = 1); ~task_system(); int32_t num_threads() const { return _count; } - + template void async_(F&& f) { @@ -205,5 +216,4 @@ class task_system { } }; - -} // namespace kram +} // namespace kram diff --git a/libkram/kram/_clang-format b/libkram/kram/_clang-format deleted file mode 100644 index bb5f2bf3..00000000 --- a/libkram/kram/_clang-format +++ /dev/null @@ -1,156 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -4 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: WithoutElse -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Stroustrup -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 0 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^' - Priority: 2 - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 4 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 4 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 
-PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google -ReflowComments: true -SortIncludes: true -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 4 -UseTab: Never -... - diff --git a/libkram/kram/float4a.cpp b/libkram/kram/float4a.cpp deleted file mode 100644 index e89dba71..00000000 --- a/libkram/kram/float4a.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License -// The license and copyright notice shall be included -// in all copies or substantial portions of the Software. - -#include "float4a.h" diff --git a/libkram/kram/float4a.h b/libkram/kram/float4a.h deleted file mode 100644 index 85453a23..00000000 --- a/libkram/kram/float4a.h +++ /dev/null @@ -1,451 +0,0 @@ -// kram - Copyright 2020-2023 by Alec Miller. - MIT License -// The license and copyright notice shall be included -// in all copies or substantial portions of the Software. - -#pragma once - -//#include "KramConfig.h" - -// This is only meant to emulate float4 when lib not available -// (f.e. win or linux w/o clang) but may move off simd lib to this. So -// many open source projets skip SIMD, or only do SSE. This is -// how to support ARM and Neon from one codebase. 
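The pattern the deleted header describes: write the kernel once against SSE intrinsics, then let sse2neon translate each _mm call to NEON on ARM. A minimal sketch of that shape (illustrative, includes are assumptions, not taken from the file):

#if USE_NEON
#include "sse2neon.h" // maps the _mm_* calls below onto NEON
#else
#include <immintrin.h> // native SSE/AVX path
#endif

// One code path for both architectures.
static __m128 addScaled(__m128 a, __m128 b, float s)
{
    return _mm_add_ps(a, _mm_mul_ps(b, _mm_set1_ps(s)));
}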
This uses -// SSE2Neon.h to translate _mm calls to Neon calls -#if !USE_SIMDLIB - -// this is also defined in KramConfig.h, but this keeps file independent -#if USE_NEON -#include "sse2neon.h" -#else -//#include // SSE4.1, and includes all before it -#include // AVX -#endif - -// see here for intrinsics and which instruction set they come from -// https://docs.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-160 - -namespace simd { - -#if USE_NEON -#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps())) - -// rqrt (high precision) -inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a) -{ - float32x4_t est = vrsqrteq_f32(a); - - est = _mm_fixzero_ps(est, a); - - // newton raphson - float32x4_t stepA = vrsqrtsq_f32(a, vmulq_f32(est, est)); // xn+1 = xn(3-dxn*dxn)/2 - - return _mm_mul_ps(est, stepA); -} - -// sqrt -inline float32x4_t _mm_sqrthp_ps(const float32x4_t& a) -{ - // sqrt(a) = a * rsqrt(a) - return _mm_mul_ps(_mm_rsqrthp_ps(a), a); -} - -// recip -inline float32x4_t _mm_rcphp_ps(const float32x4_t& a) -{ - float32x4_t est = vrecpeq_f32(a); - - est = _mm_fixzero_ps(est, a); - - float32x4_t stepA = vrecpsq_f32(est, a); // xn+1 = xn(2-dxn) - - return _mm_mul_ps(est, stepA); -} - -#else - -// this is an easier to read type, since __m128 can hold different data -#define float32x4_t __m128 - -#define _mm_fixzero_ps(a, b) _mm_and_ps(a, _mm_cmpneq_ps(b, _mm_setzero_ps())) -#define _mm_sqrthp_ss(a) _mm_sqrt_ss(a) -#define _mm_sqrthp_ps(a) _mm_sqrt_ps(a) - -inline float32x4_t _mm_rsqrthp_ps(const float32x4_t& a) -{ - static const float32x4_t kHalf = _mm_set1_ps(0.5f); - static const float32x4_t kThree = _mm_set1_ps(3.0f); - - //for the reciprocal square root, it looks like this: (doesn't handle 0 -> Inf -> Nan), mix in min/max - // doubles precision - //x = rsqrt_approx(c); - //x *= 0.5*(3 - x*x*c); // refinement - - float32x4_t low; - low = _mm_rsqrt_ps(a); // low precision - - // zero out any elements that started out zero (0 -> 0) - low = _mm_fixzero_ps(low, a); - - // this is already rolled into Neon wrapper - // TODO: use constants - low = _mm_mul_ps(low, _mm_mul_ps(kHalf, - _mm_sub_ps(kThree, _mm_mul_ps(a, _mm_mul_ps(low, low))))); - - return low; -} - -inline float32x4_t _mm_rcphp_ps(const float32x4_t& a) -{ - static const float32x4_t kTwo = _mm_set1_ps(2.0f); - - // http://www.virtualdub.org/blog/pivot/entry.php?id=229#body (doesn't handle 0 -> Inf, min in min/max - // 20-bit precision - //x = reciprocal_approx(c); - //x' = x * (2 - x * c); - - float32x4_t low = _mm_rcp_ps(a); - - // zero out any elements that started out 0 (0 -> 0) - low = _mm_fixzero_ps(low, a); - - // this is already rolled into Neon wrapper - low = _mm_mul_ps(low, _mm_sub_ps(kTwo, _mm_mul_ps(low, a))); - - return low; -} - -#define _mm_rsqrthp_ss(a) _mm_setx_ps(a, _mm_rsqrthp_ps(a)) -#define _mm_rcphp_ss(a) _mm_setx_ps(a, _mm_rcphp_ps(a)) - -#endif - -//--------------------------------------------------------------------------------------- - -using tSwizzle = uint32_t; - -// swizzle order has to be fliped to use shuffle -#define macroSwizzle(x, y, z, w) _MM_SHUFFLE(w, z, y, x) - -// replicate a lane into a new vector -#define _mm_splatx_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(0, 0, 0, 0)) -#define _mm_splaty_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(1, 1, 1, 1)) -#define _mm_splatz_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(2, 2, 2, 2)) -#define _mm_splatw_ps(v) _mm_shuffle_ps(v, v, macroSwizzle(3, 3, 3, 3)) - -// dot product app with horizontal adds, without using 
_mm_hadd_ps() -inline float32x4_t _mm_hadd4_ps(const float32x4_t& r) -{ -#if 0 // SSE1 -// // use for hpadd -// static const tSwizzle kSwizzleYYZW = macroSwizzle(1, 1, 2, 3); -// //static const tSwizzle kSwizzleZYZW = macroSwizzle(2,1,2,3); -// static const tSwizzle kSwizzleWZZW = macroSwizzle(3, 2, 2, 3); -// -// float32x4_t t = _mm_add_ps(r, _mm_shuffle_ps(r, r, kSwizzleWZZW)); // xy + wz -// t = _mm_add_ss(t, _mm_shuffle_ps(t, t, kSwizzleYYZW)); // x + y -// return t; -#else // SSE3 - float32x4_t t = _mm_hadd_ps(r, r); // xy + wz - t = _mm_hadd_ps(t, t); // x + y - return t; -#endif -} - -static const uint32_t kSignBitsF32x4i = {0x80000000}; -static const float32x4_t kSignBitsF32x4 = _mm_set1_ps(*(const float*)&kSignBitsF32x4i); -static const float32x4_t kOnesF32x4 = _mm_set1_ps(1.0f); - -// higher level comparisons, returns 0 or 1 -#define _mm_pcmpeq_ps(a, b) _mm_and_ps(_mm_cmpeq_ps(a, b), kOnesF32x4) -#define _mm_pcmpneq_ps(a, b) _mm_and_ps(_mm_cmpneq_ps(a, b), kOnesF32x4) -#define _mm_pcmpgt_ps(a, b) _mm_and_ps(_mm_cmpgt_ps(a, b), kOnesF32x4) -#define _mm_pcmpge_ps(a, b) _mm_and_ps(_mm_cmpge_ps(a, b), kOnesF32x4) -#define _mm_pcmplt_ps(a, b) _mm_pcmpge_ps(b, a) -#define _mm_pcmple_ps(a, b) _mm_pcmpgt_ps(b, a) - -//--------------------------------------------------------------------------------------- - -// Note float3 should be it's own type, but it should be float4 in size. -// float2 is harder since on Neon, it supports a float2 data structure. -// Needs SSE4.1, but that's most of the processors these days. -class float4 { -public: - using tType = float32x4_t; - float4() {} - - // TODO: problem is that Apple's simd::float4(val) is val,000, simd::float4(val, 0, 0, 0) is 0 (last element?) - // have to go through float4m(val, val, val, val) to get 4 values - // This behavior doesn't match HLSL/GLSL and is an artifact of the comma operator messing things up. 
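The refinement used by _mm_rsqrthp_ps above is a single Newton-Raphson step; a scalar illustration (not part of the file) of how quickly it tightens a rough estimate:

// For a = 4 and a rough estimate x0 = 0.49:
//   x1 = x0 * 0.5 * (3 - a*x0*x0) = 0.49 * 0.5 * (3 - 0.9604) ≈ 0.49970
// against the exact 1/sqrt(4) = 0.5 - the error roughly squares each step.
static float rsqrtRefine(float a, float x0)
{
    return x0 * 0.5f * (3.0f - a * x0 * x0);
}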
- explicit float4(float val) { reg = _mm_set1_ps(val); } // xyzw = val - explicit float4(tType val) { reg = val; } - float4(float xx, float yy, float zz, float ww) { reg = _mm_setr_ps(xx, yy, zz, ww); } - float4(const float4& val) { reg = val.reg; } - - union { - tType reg; - - // avoid using these, since they pull data out of simd registers - float v[4]; - struct { - float x, y, z, w; - }; - struct { - float r, g, b, a; - }; - }; - - // use of these pull data out of simd registers - float& operator[](int32_t index) - { - return v[index]; // or _mm_extract_ps(reg, index), but imm needs to be hardcoded there - } - const float& operator[](int32_t index) const - { - return v[index]; - } - - // use these to stay in register - inline float4 xvec() { return float4(_mm_splatx_ps(reg)); } - inline float4 yvec() { return float4(_mm_splaty_ps(reg)); } - inline float4 zvec() { return float4(_mm_splatz_ps(reg)); } - inline float4 wvec() { return float4(_mm_splatw_ps(reg)); } - - inline float4& operator*=(float s) - { - return *this *= float4(s); - } - inline float4& operator/=(float s) - { - return *this /= float4(s); - } - inline float4& operator-=(float s) - { - return *this -= float4(s); - } - inline float4& operator+=(float s) - { - return *this += float4(s); - } - - // sse ops start here - inline float4& operator/=(const float4& b) - { - reg = _mm_div_ps(reg, b.reg); - return *this; - } - inline float4& operator*=(const float4& b) - { - reg = _mm_mul_ps(reg, b.reg); - return *this; - } - inline float4& operator-=(const float4& b) - { - reg = _mm_sub_ps(reg, b.reg); - return *this; - } - inline float4& operator+=(const float4& b) - { - reg = _mm_add_ps(reg, b.reg); - return *this; - } - - inline float4 operator-() const - { - return float4(_mm_xor_ps(kSignBitsF32x4, reg)); // -a - } - - inline bool equal(const float4& vv) const - { - int32_t maskBits = _mm_movemask_ps(_mm_cmpeq_ps(reg, vv.reg)); - return maskBits == 15; - } - - inline bool not_equal(const float4& vv) const { return !equal(vv); } - - // returns 1's and 0's in a float4 - inline float4 operator==(const float4& vv) const { return float4(_mm_pcmpeq_ps(reg, vv.reg)); } - inline float4 operator!=(const float4& vv) const { return float4(_mm_pcmpneq_ps(reg, vv.reg)); } - inline float4 operator>(const float4& vv) const { return float4(_mm_pcmpgt_ps(reg, vv.reg)); } - inline float4 operator>=(const float4& vv) const { return float4(_mm_pcmpge_ps(reg, vv.reg)); } - inline float4 operator<(const float4& vv) const { return float4(_mm_pcmplt_ps(reg, vv.reg)); } - inline float4 operator<=(const float4& vv) const { return float4(_mm_pcmple_ps(reg, vv.reg)); } -}; - -inline float4 operator*(const float4& lhs, const float4& rhs) -{ - float4 aa(lhs); - return aa *= rhs; -} -inline float4 operator/(const float4& lhs, const float4& rhs) -{ - float4 aa(lhs); - return aa /= rhs; -} -inline float4 operator+(const float4& lhs, const float4& rhs) -{ - float4 aa(lhs); - return aa += rhs; -} -inline float4 operator-(const float4& lhs, const float4& rhs) -{ - float4 aa(lhs); - return aa -= rhs; -} - -// scalar ops for right side -inline float4 operator*(const float4& vv, float s) -{ - float4 aa(vv); - return aa *= float4(s); -} -inline float4 operator/(const float4& vv, float s) -{ - float4 aa(vv); - return aa /= float4(s); -} -inline float4 operator+(const float4& vv, float s) -{ - float4 aa(vv); - return aa += float4(s); -} -inline float4 operator-(const float4& vv, float s) -{ - float4 aa(vv); - return aa -= float4(s); -} - -inline float4 operator*(float a, 
const float4& b) -{ - float4 aa(a); - return aa *= b; -} -inline float4 operator/(float a, const float4& b) -{ - float4 aa(a); - return aa /= b; -} -inline float4 operator+(float a, const float4& b) -{ - float4 aa(a); - return aa += b; -} -inline float4 operator-(float a, const float4& b) -{ - float4 aa(a); - return aa -= b; -} - -inline float4 min(const float4& lhs, const float4& rhs) -{ - return float4(_mm_min_ps(lhs.reg, rhs.reg)); -} -inline float4 max(const float4& lhs, const float4& rhs) -{ - return float4(_mm_max_ps(lhs.reg, rhs.reg)); -} - -// do 4 of these at once -inline float4 recip(const float4& vv) -{ - return float4(_mm_rcphp_ps(vv.reg)); -} -inline float4 rsqrt(const float4& vv) -{ - return float4(_mm_rsqrthp_ps(vv.reg)); -} -inline float4 sqrt(const float4& vv) -{ - return float4(_mm_sqrthp_ps(vv.reg)); -} - -inline float dot(const float4& lhs, const float4& rhs) -{ - return float4(_mm_hadd4_ps(_mm_mul_ps(lhs.reg, rhs.reg)))[0]; -} -inline float length_squared(const float4& vv) -{ - return dot(vv, vv); -} -inline float length(const float4& vv) -{ - return sqrtf(length_squared(vv)); -} - -// sse4.1 ops -inline float4 round(const float4& vv) -{ - return float4(_mm_round_ps(vv.reg, 0x8)); // round to nearest | exc -} -inline float4 ceil(const float4& vv) -{ - return float4(_mm_ceil_ps(vv.reg)); -} -inline float4 floor(const float4& vv) -{ - return float4(_mm_floor_ps(vv.reg)); // SSE4.1 -} - -// see if any results are 1 -inline bool any(const float4& vv) -{ - return float4(_mm_hadd4_ps(vv.reg))[0] > 0.0f; -} - -inline float4 select(const float4& lhs, const float4& rhs, const float4& mask) -{ - return float4(_mm_or_ps(_mm_andnot_ps(mask.reg, lhs.reg), _mm_and_ps(mask.reg, rhs.reg))); // 0 picks a, 1 picks b -} - -inline float4 normalize(const float4& vv) -{ - return float4(vv) /= length(vv); -} - -inline float4 float4m(float x) -{ - return float4(x); -} - -inline float4 float4m(float x, float y, float z, float w) -{ - return float4(x, y, z, w); -} - -// need a float3 for this -//inline float4 float4m(const float3& v float w) -//{ -// return float4(v, w); -//} - -inline float4 saturate(const float4& v) -{ - return min(max(v, float4m(0.0f)), float4m(1.0f)); -} - - - -// don't have float2/float3 type yet -//// use instead of simd_make_float -//inline float2 float2m(float x) -//{ -// return float2(x); -//} - - -//inline float3 float3m(float x) -//{ -// return float3(x); -//} -//inline float3 float3m(float x, float y, float z) -//{ -// return float3(x, y, z); -//} -//inline float3 saturate(const float3& v) -//{ -// return min(max(v, float3m(0.0f)), float3m(1.0f)); -//} - - -} // namespace simd - -#endif diff --git a/libkram/kram/sse2neon.h b/libkram/kram/sse2neon.h deleted file mode 100644 index 9ce4712a..00000000 --- a/libkram/kram/sse2neon.h +++ /dev/null @@ -1,5958 +0,0 @@ -#ifndef SSE2NEON_H -#define SSE2NEON_H - -// This header file provides a simple API translation layer -// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions -// -// This header file does not yet translate all of the SSE intrinsics. -// -// Contributors to this work are: -// John W. Ratcliff -// Brandon Rowlett -// Ken Fast -// Eric van Beurden -// Alexander Potylitsin -// Hasindu Gamaarachchi -// Jim Huang -// Mark Cheng -// Malcolm James MacLeod -// Devin Hussey (easyaspi314) -// Sebastian Pop -// Developer Ecosystem Engineering -// Danila Kutenin -// François Turban (JishinMaster) -// Pei-Hsuan Hung -// Yang-Hao Yuan - -/* - * sse2neon is freely redistributable under the MIT License. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* Tunable configurations */ - -/* Enable precise implementation of _mm_min_ps and _mm_max_ps - * This would slow down the computation a bit, but gives consistent result with - * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) - */ -#ifndef SSE2NEON_PRECISE_MINMAX -#define SSE2NEON_PRECISE_MINMAX (0) -#endif - -#if defined(__GNUC__) || defined(__clang__) -#pragma push_macro("FORCE_INLINE") -#pragma push_macro("ALIGN_STRUCT") -#define FORCE_INLINE static inline __attribute__((always_inline)) -#define ALIGN_STRUCT(x) __attribute__((aligned(x))) -#else -#error "Macro name collisions may happen with unsupported compiler." -#ifdef FORCE_INLINE -#undef FORCE_INLINE -#endif -#define FORCE_INLINE static inline -#ifndef ALIGN_STRUCT -#define ALIGN_STRUCT(x) __declspec(align(x)) -#endif -#endif - -#include -#include - -/* Architecture-specific build options */ -/* FIXME: #pragma GCC push_options is only available on GCC */ -#if defined(__GNUC__) -#if defined(__arm__) && __ARM_ARCH == 7 -/* According to ARM C Language Extensions Architecture specification, - * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) - * architecture supported. - */ -#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) -#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." -#endif -//#pragma GCC push_options -//#pragma GCC target("fpu=neon") -#elif defined(__aarch64__) -//#pragma GCC push_options -//#pragma GCC target("+simd") -#else -#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." -#endif -#endif - -#include - -/* Rounding functions require either Aarch64 instructions or libm failback */ -#if !defined(__aarch64__) -#include -#endif - -/* "__has_builtin" can be used to query support for built-in functions - * provided by gcc/clang and other compilers that support it. - */ -#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ -/* Compatibility with gcc <= 9 */ -#if __GNUC__ <= 9 -#define __has_builtin(x) HAS##x -#define HAS__builtin_popcount 1 -#define HAS__builtin_popcountll 1 -#else -#define __has_builtin(x) 0 -#endif -#endif - -/** - * MACRO for shuffle parameter for _mm_shuffle_ps(). - * Argument fp3 is a digit[0123] that represents the fp from argument "b" - * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same - * for fp2 in result. 
fp1 is a digit[0123] that represents the fp from - * argument "a" of mm_shuffle_ps that will be places in fp1 of result. - * fp0 is the same for fp0 of result. - */ -#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ - (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) - -/* Rounding mode macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 -#define _MM_FROUND_NO_EXC 0x08 - -/* indicate immediate constant argument in a given range */ -#define __constrange(a, b) const - -/* A few intrinsics accept traditional data types like ints or floats, but - * most operate on data types that are specific to SSE. - * If a vector type ends in d, it contains doubles, and if it does not have - * a suffix, it contains floats. An integer vector type can contain any type - * of integer, from chars to shorts to unsigned long longs. - */ -typedef int64x1_t __m64; -typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ -// On ARM 32-bit architecture, the float64x2_t is not supported. -// The data type __m128d should be represented in a different way for related -// intrinsic conversion. -#if defined(__aarch64__) -typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ -#else -typedef float32x4_t __m128d; -#endif -typedef int64x2_t __m128i; /* 128-bit vector containing integers */ - -/* type-safe casting between types */ - -#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) -#define vreinterpretq_m128_f32(x) (x) -#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) - -#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) -#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) -#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) -#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) - -#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) -#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) -#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) -#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) - -#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) -#define vreinterpretq_f32_m128(x) (x) -#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) - -#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) -#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) -#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) -#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) - -#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) -#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) -#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) -#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) - -#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) -#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) -#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) -#define vreinterpretq_m128i_s64(x) (x) - -#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) -#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) -#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) -#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) - -#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) -#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) -#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) -#define vreinterpretq_s64_m128i(x) (x) - -#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) -#define vreinterpretq_u16_m128i(x) 
vreinterpretq_u16_s64(x) -#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) -#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) - -#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) -#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) -#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) -#define vreinterpret_m64_s64(x) (x) - -#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) -#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) -#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) -#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) - -#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) -#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) -#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) - -#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) -#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) -#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) -#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) - -#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) -#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) -#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) -#define vreinterpret_s64_m64(x) (x) - -#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) - -#if defined(__aarch64__) -#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) -#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) - -#define vreinterpretq_m128d_f64(x) (x) - -#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) - -#define vreinterpretq_f64_m128d(x) (x) -#else -#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) -#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) - -#define vreinterpretq_m128d_f32(x) (x) - -#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) - -#define vreinterpretq_f32_m128d(x) (x) -#endif - -// A struct is defined in this header file called 'SIMDVec' which can be used -// by applications which attempt to access the contents of an _m128 struct -// directly. It is important to note that accessing the __m128 struct directly -// is bad coding practice by Microsoft: @see: -// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx -// -// However, some legacy source code may try to access the contents of an __m128 -// struct directly so the developer can use the SIMDVec as an alias for it. Any -// casting must be done manually by the developer, as you cannot cast or -// otherwise alias the base NEON data type for intrinsic operations. -// -// union intended to allow direct access to an __m128 variable using the names -// that the MSVC compiler provides. This union should really only be used when -// trying to access the members of the vector as integer values. GCC/clang -// allow native access to the float members through a simple array access -// operator (in C since 4.6, in C++ since 4.8). -// -// Ideally direct accesses to SIMD vectors should not be used since it can cause -// a performance hit. If it really is needed however, the original __m128 -// variable can be aliased with a pointer to this union and used to access -// individual components. The use of this union should be hidden behind a macro -// that is used throughout the codebase to access the members instead of always -// declaring this type of variable. -typedef union ALIGN_STRUCT(16) SIMDVec { - float m128_f32[4]; // as floats - DON'T USE. Added for convenience. - int8_t m128_i8[16]; // as signed 8-bit integers. - int16_t m128_i16[8]; // as signed 16-bit integers. - int32_t m128_i32[4]; // as signed 32-bit integers. 
- int64_t m128_i64[2]; // as signed 64-bit integers. - uint8_t m128_u8[16]; // as unsigned 8-bit integers. - uint16_t m128_u16[8]; // as unsigned 16-bit integers. - uint32_t m128_u32[4]; // as unsigned 32-bit integers. - uint64_t m128_u64[2]; // as unsigned 64-bit integers. -} SIMDVec; - -// casting using SIMDVec -#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) -#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) -#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) - -/* Backwards compatibility for compilers with lack of specific type support */ - -// Older gcc does not define vld1q_u8_x4 type -#if defined(__GNUC__) && !defined(__clang__) -#if __GNUC__ <= 9 -FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p) -{ - uint8x16x4_t ret; - ret.val[0] = vld1q_u8(p + 0); - ret.val[1] = vld1q_u8(p + 16); - ret.val[2] = vld1q_u8(p + 32); - ret.val[3] = vld1q_u8(p + 48); - return ret; -} -#endif -#endif - -/* Function Naming Conventions - * The naming convention of SSE intrinsics is straightforward. A generic SSE - * intrinsic function is given as follows: - * _mm__ - * - * The parts of this format are given as follows: - * 1. describes the operation performed by the intrinsic - * 2. identifies the data type of the function's primary arguments - * - * This last part, , is a little complicated. It identifies the - * content of the input values, and can be set to any of the following values: - * + ps - vectors contain floats (ps stands for packed single-precision) - * + pd - vectors cantain doubles (pd stands for packed double-precision) - * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit - * signed integers - * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit - * unsigned integers - * + si128 - unspecified 128-bit vector or 256-bit vector - * + m128/m128i/m128d - identifies input vector types when they are different - * than the type of the returned vector - * - * For example, _mm_setzero_ps. The _mm implies that the function returns - * a 128-bit vector. The _ps at the end implies that the argument vectors - * contain floats. - * - * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) - * // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits - * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); - * // Set packed 8-bit integers - * // 128 bits, 16 chars, per 8 bits - * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, - * 4, 5, 12, 13, 6, 7, 14, 15); - * // Shuffle packed 8-bit integers - * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb - * - * Data (Number, Binary, Byte Index): - +------+------+-------------+------+------+-------------+ - | 1 | 2 | 3 | 4 | Number - +------+------+------+------+------+------+------+------+ - | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary - +------+------+------+------+------+------+------+------+ - | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index - +------+------+------+------+------+------+------+------+ - - +------+------+------+------+------+------+------+------+ - | 5 | 6 | 7 | 8 | Number - +------+------+------+------+------+------+------+------+ - | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary - +------+------+------+------+------+------+------+------+ - | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index - +------+------+------+------+------+------+------+------+ - * Index (Byte Index): - +------+------+------+------+------+------+------+------+ - | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | - +------+------+------+------+------+------+------+------+ - - +------+------+------+------+------+------+------+------+ - | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | - +------+------+------+------+------+------+------+------+ - * Result: - +------+------+------+------+------+------+------+------+ - | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index - +------+------+------+------+------+------+------+------+ - | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary - +------+------+------+------+------+------+------+------+ - | 256 | 2 | 5 | 6 | Number - +------+------+------+------+------+------+------+------+ - - +------+------+------+------+------+------+------+------+ - | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index - +------+------+------+------+------+------+------+------+ - | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary - +------+------+------+------+------+------+------+------+ - | 3 | 7 | 4 | 8 | Number - +------+------+------+------+------+------+-------------+ - */ - -/* Set/get methods */ - -/* Constants for use with _mm_prefetch. */ -enum _mm_hint { - _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ - _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ - _MM_HINT_T1 = 2, /* load data to L2 cache only */ - _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ - _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ - _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ - _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ - _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ -}; - -// Loads one cache line of data from address p to a location closer to the -// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx -FORCE_INLINE void _mm_prefetch(const void *p, int i) -{ - (void) i; - __builtin_prefetch(p); -} - -// Copy the lower single-precision (32-bit) floating-point element of a to dst. 
-// -// dst[31:0] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 -FORCE_INLINE float _mm_cvtss_f32(__m128 a) -{ - return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); -} - -// Sets the 128-bit value to zero -// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx -FORCE_INLINE __m128i _mm_setzero_si128(void) -{ - return vreinterpretq_m128i_s32(vdupq_n_s32(0)); -} - -// Clears the four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setzero_ps(void) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(0)); -} - -// Sets the four single-precision, floating-point values to w. -// -// r0 := r1 := r2 := r3 := w -// -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set1_ps(float _w) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); -} - -// Sets the four single-precision, floating-point values to w. -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps1(float _w) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); -} - -// Sets the four single-precision, floating-point values to the four inputs. -// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) -{ - float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -} - -// Copy single-precision (32-bit) floating-point element a to the lower element -// of dst, and zero the upper 3 elements. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss -FORCE_INLINE __m128 _mm_set_ss(float a) -{ - float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -} - -// Sets the four single-precision, floating-point values to the four inputs in -// reverse order. -// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) -{ - float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -} - -// Sets the 8 signed 16-bit integer values in reverse order. -// -// Return Value -// r0 := w0 -// r1 := w1 -// ... -// r7 := w7 -FORCE_INLINE __m128i _mm_setr_epi16(short w0, - short w1, - short w2, - short w3, - short w4, - short w5, - short w6, - short w7) -{ - int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; - return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); -} - -// Sets the 4 signed 32-bit integer values in reverse order -// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx -FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) -{ - int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; - return vreinterpretq_m128i_s32(vld1q_s32(data)); -} - -// Set packed 64-bit integers in dst with the supplied values in reverse order. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 -FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) -{ - return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); -} - -// Sets the 16 signed 8-bit integer values to b. -// -// r0 := b -// r1 := b -// ... 
-// r15 := b -// -// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi8(signed char w) -{ - return vreinterpretq_m128i_s8(vdupq_n_s8(w)); -} - -// Sets the 8 signed 16-bit integer values to w. -// -// r0 := w -// r1 := w -// ... -// r7 := w -// -// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set1_epi16(short w) -{ - return vreinterpretq_m128i_s16(vdupq_n_s16(w)); -} - -// Sets the 16 signed 8-bit integer values. -// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set_epi8(signed char b15, - signed char b14, - signed char b13, - signed char b12, - signed char b11, - signed char b10, - signed char b9, - signed char b8, - signed char b7, - signed char b6, - signed char b5, - signed char b4, - signed char b3, - signed char b2, - signed char b1, - signed char b0) -{ - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); -} - -// Sets the 8 signed 16-bit integer values. -// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set_epi16(short i7, - short i6, - short i5, - short i4, - short i3, - short i2, - short i1, - short i0) -{ - int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - return vreinterpretq_m128i_s16(vld1q_s16(data)); -} - -// Sets the 16 signed 8-bit integer values in reverse order. -// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx -FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, - signed char b1, - signed char b2, - signed char b3, - signed char b4, - signed char b5, - signed char b6, - signed char b7, - signed char b8, - signed char b9, - signed char b10, - signed char b11, - signed char b12, - signed char b13, - signed char b14, - signed char b15) -{ - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); -} - -// Sets the 4 signed 32-bit integer values to i. -// -// r0 := i -// r1 := i -// r2 := i -// r3 := I -// -// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi32(int _i) -{ - return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); -} - -// Sets the 2 signed 64-bit integer values to i. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) -FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) -{ - return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); -} - -// Sets the 2 signed 64-bit integer values to i. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x -FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) -{ - return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); -} - -// Sets the 4 signed 32-bit integer values. 
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) -{ - int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; - return vreinterpretq_m128i_s32(vld1q_s32(data)); -} - -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx -FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) -{ - int64_t ALIGN_STRUCT(16) data[2] = {i2, i1}; - return vreinterpretq_m128i_s64(vld1q_s64(data)); -} - -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx -FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) -{ - return _mm_set_epi64x((int64_t) i1, (int64_t) i2); -} - -// Set packed double-precision (64-bit) floating-point elements in dst with the -// supplied values. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd -FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) -{ - double ALIGN_STRUCT(16) data[2] = {e0, e1}; -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); -#else - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); -#endif -} - -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx -FORCE_INLINE void _mm_store_ps(float *p, __m128 a) -{ - vst1q_f32(p, vreinterpretq_f32_m128(a)); -} - -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) -{ - vst1q_f32(p, vreinterpretq_f32_m128(a)); -} - -// Stores four 32-bit integer values as (as a __m128i value) at the address p. -// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) -{ - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); -} - -// Stores four 32-bit integer values as (as a __m128i value) at the address p. -// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) -{ - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); -} - -// Stores the lower single - precision, floating - point value. -// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx -FORCE_INLINE void _mm_store_ss(float *p, __m128 a) -{ - vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); -} - -// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point -// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary -// or a general-protection exception may be generated. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd -FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) -{ -#if defined(__aarch64__) - vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); -#else - vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); -#endif -} - -// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point -// elements) from a into memory. mem_addr does not need to be aligned on any -// particular boundary. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd -FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) -{ - _mm_store_pd(mem_addr, a); -} - -// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. -// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx -FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) -{ - uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); - uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); - *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); -} - -// Stores the lower two single-precision floating point values of a to the -// address p. -// -// *p0 := a0 -// *p1 := a1 -// -// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx -FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) -{ - *p = vreinterpret_m64_f32(vget_low_f32(a)); -} - -// Stores the upper two single-precision, floating-point values of a to the -// address p. -// -// *p0 := a2 -// *p1 := a3 -// -// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx -FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) -{ - *p = vreinterpret_m64_f32(vget_high_f32(a)); -} - -// Loads a single single-precision, floating-point value, copying it into all -// four words -// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load1_ps(const float *p) -{ - return vreinterpretq_m128_f32(vld1q_dup_f32(p)); -} - -// Load a single-precision (32-bit) floating-point element from memory into all -// elements of dst. -// -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[63:32] := MEM[mem_addr+31:mem_addr] -// dst[95:64] := MEM[mem_addr+31:mem_addr] -// dst[127:96] := MEM[mem_addr+31:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 -#define _mm_load_ps1 _mm_load1_ps - -// Sets the lower two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the upper two values are passed -// through from a. -// -// Return Value -// r0 := *p0 -// r1 := *p1 -// r2 := a2 -// r3 := a3 -// -// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx -FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); -} - -// Load 4 single-precision (32-bit) floating-point elements from memory into dst -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. -// -// dst[31:0] := MEM[mem_addr+127:mem_addr+96] -// dst[63:32] := MEM[mem_addr+95:mem_addr+64] -// dst[95:64] := MEM[mem_addr+63:mem_addr+32] -// dst[127:96] := MEM[mem_addr+31:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps -FORCE_INLINE __m128 _mm_loadr_ps(const float *p) -{ - float32x4_t v = vrev64q_f32(vld1q_f32(p)); - return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); -} - -// Sets the upper two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the lower two values are passed -// through from a. -// -// r0 := a0 -// r1 := a1 -// r2 := *p0 -// r3 := *p1 -// -// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx -FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); -} - -// Loads four single-precision, floating-point values. 
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load_ps(const float *p) -{ - return vreinterpretq_m128_f32(vld1q_f32(p)); -} - -// Loads four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_loadu_ps(const float *p) -{ - // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are - // equivalent for neon - return vreinterpretq_m128_f32(vld1q_f32(p)); -} - -// Load unaligned 16-bit integer from memory into the first element of dst. -// -// dst[15:0] := MEM[mem_addr+15:mem_addr] -// dst[MAX:16] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 -FORCE_INLINE __m128i _mm_loadu_si16(const void *p) -{ - return vreinterpretq_m128i_s16( - vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); -} - -// Load unaligned 64-bit integer from memory into the first element of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[MAX:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 -FORCE_INLINE __m128i _mm_loadu_si64(const void *p) -{ - return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); -} - -// Load a double-precision (64-bit) floating-point element from memory into the -// lower of dst, and zero the upper element. mem_addr does not need to be -// aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd -FORCE_INLINE __m128d _mm_load_sd(const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); -#else - const float *fp = (const float *) p; - float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; - return vreinterpretq_m128d_f32(vld1q_f32(data)); -#endif -} - -// Loads two double-precision from 16-byte aligned memory, floating-point -// values. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd -FORCE_INLINE __m128d _mm_load_pd(const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64(p)); -#else - const float *fp = (const float *) p; - float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; - return vreinterpretq_m128d_f32(vld1q_f32(data)); -#endif -} - -// Loads two double-precision from unaligned memory, floating-point values. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd -FORCE_INLINE __m128d _mm_loadu_pd(const double *p) -{ - return _mm_load_pd(p); -} - -// Loads an single - precision, floating - point value into the low word and -// clears the upper three words. -// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_load_ss(const float *p) -{ - return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); -} - -FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) -{ - /* Load the lower 64 bits of the value pointed to by p into the - * lower 64 bits of the result, zeroing the upper 64 bits of the result. - */ - return vreinterpretq_m128i_s32( - vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); -} - -// Load a double-precision (64-bit) floating-point element from memory into the -// lower element of dst, and copy the upper element from a to dst. 
mem_addr does -// not need to be aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := a[127:64] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd -FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); -#else - return vreinterpretq_m128d_f32( - vcombine_f32(vld1_f32((const float *) p), - vget_high_f32(vreinterpretq_f32_m128d(a)))); -#endif -} - -// Load 2 double-precision (64-bit) floating-point elements from memory into dst -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. -// -// dst[63:0] := MEM[mem_addr+127:mem_addr+64] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd -FORCE_INLINE __m128d _mm_loadr_pd(const double *p) -{ -#if defined(__aarch64__) - float64x2_t v = vld1q_f64(p); - return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); -#else - int64x2_t v = vld1q_s64((const int64_t *) p); - return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); -#endif -} - -// Sets the low word to the single-precision, floating-point value of b -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) -FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), - vreinterpretq_f32_m128(a), 0)); -} - -// Copy the lower 64-bit integer in a to the lower element of dst, and zero the -// upper element. -// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 -FORCE_INLINE __m128i _mm_move_epi64(__m128i a) -{ - return vreinterpretq_m128i_s64( - vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); -} - -// Return vector of type __m128 with undefined elements. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps -//FORCE_INLINE __m128 _mm_undefined_ps(void) -//{ -// __m128 a; -// return a; -//} - -/* Logic/Binary operations */ - -// Computes the bitwise AND-NOT of the four single-precision, floating-point -// values of a and b. -// -// r0 := ~a0 & b0 -// r1 := ~a1 & b1 -// r2 := ~a2 & b2 -// r3 := ~a3 & b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx -FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - vbicq_s32(vreinterpretq_s32_m128(b), - vreinterpretq_s32_m128(a))); // *NOTE* argument swap -} - -// Compute the bitwise NOT of packed double-precision (64-bit) floating-point -// elements in a and then AND with b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd -FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) -{ - // *NOTE* argument swap - return vreinterpretq_m128d_s64( - vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); -} - -// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the -// 128-bit value in a. 
-// -// r := (~a) & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vbicq_s32(vreinterpretq_s32_m128i(b), - vreinterpretq_s32_m128i(a))); // *NOTE* argument swap -} - -// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in -// b. -// -// r := a & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Computes the bitwise AND of the four single-precision, floating-point values -// of a and b. -// -// r0 := a0 & b0 -// r1 := a1 & b1 -// r2 := a2 & b2 -// r3 := a3 & b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); -} - -// Compute the bitwise AND of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] AND b[i+63:i] -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd -FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) -{ - return vreinterpretq_m128d_s64( - vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); -} - -// Computes the bitwise OR of the four single-precision, floating-point values -// of a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx -FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); -} - -// Computes bitwise EXOR (exclusive-or) of the four single-precision, -// floating-point values of a and b. -// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); -} - -// Compute the bitwise XOR of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd -FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) -{ - return vreinterpretq_m128d_s64( - veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); -} - -// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. -// -// r := a | b -// -// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx -FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in -// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx -FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Duplicate odd-indexed single-precision (32-bit) floating-point elements -// from a, and store the results in dst. 
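// [editor note] Minimal scalar sketch (not part of the file being removed): the
// "*NOTE* argument swap" comments above exist because SSE's andnot computes
// (~a) & b, while NEON's BIC (vbicq_*) computes first & ~(second), so the
// operands must be reversed. The andnot_ref_u32x4 helper below is hypothetical,
// shown only to illustrate the intended result.
#include <stdint.h>

static inline void andnot_ref_u32x4(const uint32_t a[4], const uint32_t b[4],
                                    uint32_t dst[4])
{
    for (int i = 0; i < 4; ++i)
        dst[i] = ~a[i] & b[i]; // same value as vbicq_u32(b, a) == b & ~a
}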
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps -FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) -{ -#if __has_builtin(__builtin_shufflevector) - return vreinterpretq_m128_f32(__builtin_shufflevector( - vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); -#else - float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); - float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); - float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -#endif -} - -// Duplicate even-indexed single-precision (32-bit) floating-point elements -// from a, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps -FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) -{ -#if __has_builtin(__builtin_shufflevector) - return vreinterpretq_m128_f32(__builtin_shufflevector( - vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); -#else - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); - float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -#endif -} - -// Moves the upper two values of B into the lower two values of A. -// -// r3 := a3 -// r2 := a2 -// r1 := b3 -// r0 := b2 -FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); - return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); -} - -// Moves the lower two values of B into the upper two values of A. -// -// r3 := b1 -// r2 := b0 -// r1 := a1 -// r0 := a0 -FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); -} - -// Compute the absolute value of packed signed 32-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 -FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) -{ - return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); -} - -// Compute the absolute value of packed signed 16-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 -FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) -{ - return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); -} - -// Compute the absolute value of packed signed 8-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 -FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) -{ - return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); -} - -// Compute the absolute value of packed signed 32-bit integers in a, and store -// the unsigned results in dst. 
-// -// FOR j := 0 to 1 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 -FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) -{ - return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); -} - -// Compute the absolute value of packed signed 16-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 -FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) -{ - return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); -} - -// Compute the absolute value of packed signed 8-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 -FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) -{ - return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); -} - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. -FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); -} - -// takes the lower two 32-bit values from a and swaps them and places in high -// end of result takes the higher two 32 bit values from b and swaps them and -// places in low end of result. -FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) -{ - float32x2_t a21 = vget_high_f32( - vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); - float32x2_t b03 = vget_low_f32( - vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); - return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) -{ - float32x2_t a03 = vget_low_f32( - vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); - float32x2_t b21 = vget_high_f32( - vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); - return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); -} - -// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the -// high -FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) -{ 
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) -{ - float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) -{ - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) -{ - float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t b22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) -{ - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) -{ - float32x2_t a33 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); - float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); - return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); -} - -// NEON does not support a general purpose permute intrinsic -// Selects four specific single-precision, floating-point values from a and b, -// based on the mask i. 
-// -// C equivalent: -// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, -// __constrange(0, 255) int imm) { -// __m128 ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; -// return ret; -// } -// -// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx -#define _mm_shuffle_ps_default(a, b, imm) \ - __extension__({ \ - float32x4_t ret; \ - ret = vmovq_n_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128_f32(ret); \ - }) - -// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) -// int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_ps(a, b, imm) \ - __extension__({ \ - float32x4_t _input1 = vreinterpretq_f32_m128(a); \ - float32x4_t _input2 = vreinterpretq_f32_m128(b); \ - float32x4_t _shuf = __builtin_shufflevector( \ - _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ - (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ - vreinterpretq_m128_f32(_shuf); \ - }) -#else // generic -#define _mm_shuffle_ps(a, b, imm) \ - __extension__({ \ - __m128 ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_ps_1032((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_ps_2301((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_ps_0321((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_ps_2103((a), (b)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_movelh_ps((a), (b)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_ps_1001((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_ps_0101((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 1, 0): \ - ret = _mm_shuffle_ps_3210((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 0, 1, 1): \ - ret = _mm_shuffle_ps_0011((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 0, 2, 2): \ - ret = _mm_shuffle_ps_0022((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 2, 0, 0): \ - ret = _mm_shuffle_ps_2200((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 0, 2): \ - ret = _mm_shuffle_ps_3202((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 3, 2): \ - ret = _mm_movehl_ps((b), (a)); \ - break; \ - case _MM_SHUFFLE(1, 1, 3, 3): \ - ret = _mm_shuffle_ps_1133((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 1, 0): \ - ret = _mm_shuffle_ps_2010((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 0, 1): \ - ret = _mm_shuffle_ps_2001((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 3, 2): \ - ret = _mm_shuffle_ps_2032((a), (b)); \ - break; \ - default: \ - ret = _mm_shuffle_ps_default((a), (b), (imm)); \ - break; \ - } \ - ret; \ - }) -#endif - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of a and places it into the high end of the result. 
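// [editor note] Minimal scalar sketch (not part of the file being removed): the
// selector imm used by the shuffle dispatch above packs four 2-bit lane
// indices, lowest bits first, which is what
// _MM_SHUFFLE(z, y, x, w) == (z << 6) | (y << 4) | (x << 2) | w produces.
// shuffle_ps_ref is a hypothetical reference helper.
static inline void shuffle_ps_ref(const float a[4], const float b[4],
                                  unsigned imm, float dst[4])
{
    dst[0] = a[imm & 0x3];        // low half selected from a
    dst[1] = a[(imm >> 2) & 0x3];
    dst[2] = b[(imm >> 4) & 0x3]; // high half selected from b
    dst[3] = b[(imm >> 6) & 0x3];
}
// e.g. imm = _MM_SHUFFLE(1, 0, 3, 2) yields {a2, a3, b0, b1}, the case the
// _mm_shuffle_ps_1032 helper handles.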
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) -{ - int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); -} - -// takes the lower two 32-bit values from a and swaps them and places in low end -// of result takes the higher two 32 bit values from a and swaps them and places -// in high end of result. -FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) -{ - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); -} - -// rotates the least significant 32 bits into the most signficant 32 bits, and -// shifts the rest down -FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) -{ - return vreinterpretq_m128i_s32( - vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); -} - -// rotates the most significant 32 bits into the least signficant 32 bits, and -// shifts the rest up -FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) -{ - return vreinterpretq_m128i_s32( - vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); -} - -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of a and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) -{ - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); -} - -// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the -// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) -{ - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); -} - -// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the -// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and -// places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) -{ - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) -{ - int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); - int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); - return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) -{ - int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) -{ - int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); - return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); -} - -// Shuffle packed 8-bit integers in a according to shuffle control mask in the -// corresponding 8-bit element of b, and store the results in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 -FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) -{ - int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a - uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b - uint8x16_t idx_masked = - vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits -#if defined(__aarch64__) - return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); -#elif defined(__GNUC__) - int8x16_t ret; - // %e and %f represent the even and odd D registers - // respectively. - __asm__ __volatile__( - "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" - "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" - : [ret] "=&w"(ret) - : [tbl] "w"(tbl), [idx] "w"(idx_masked)); - return vreinterpretq_m128i_s8(ret); -#else - // use this line if testing on aarch64 - int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; - return vreinterpretq_m128i_s8( - vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), - vtbl2_s8(a_split, vget_high_u8(idx_masked)))); -#endif -} - -// C equivalent: -// __m128i _mm_shuffle_epi32_default(__m128i a, -// __constrange(0, 255) int imm) { -// __m128i ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; -// return ret; -// } -#define _mm_shuffle_epi32_default(a, imm) \ - __extension__({ \ - int32x4_t ret; \ - ret = vmovq_n_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128i_s32(ret); \ - }) - -// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) -// int imm) -#if defined(__aarch64__) -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ - }) -#else -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ - }) -#endif - -// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
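// [editor note] Minimal scalar sketch (not part of the file being removed):
// reference for the _mm_shuffle_epi8 behaviour emulated above with
// vqtbl1q_s8/vtbl2_s8. A control byte with bit 7 set zeroes the lane;
// otherwise its low 4 bits index into a. Masking the control bytes with 0x8F
// keeps exactly those bits. shuffle_epi8_ref is a hypothetical helper.
#include <stdint.h>

static inline void shuffle_epi8_ref(const uint8_t a[16], const uint8_t b[16],
                                    uint8_t dst[16])
{
    for (int i = 0; i < 16; ++i)
        dst[i] = (b[i] & 0x80) ? 0 : a[b[i] & 0x0F];
}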
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx -// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - int32x4_t _input = vreinterpretq_s32_m128i(a); \ - int32x4_t _shuf = __builtin_shufflevector( \ - _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ - ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ - vreinterpretq_m128i_s32(_shuf); \ - }) -#else // generic -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_epi_1032((a)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_epi_2301((a)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_epi_0321((a)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_epi_2103((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_shuffle_epi_1010((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_epi_1001((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_epi_0101((a)); \ - break; \ - case _MM_SHUFFLE(2, 2, 1, 1): \ - ret = _mm_shuffle_epi_2211((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 2, 2): \ - ret = _mm_shuffle_epi_0122((a)); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 2): \ - ret = _mm_shuffle_epi_3332((a)); \ - break; \ - case _MM_SHUFFLE(0, 0, 0, 0): \ - ret = _mm_shuffle_epi32_splat((a), 0); \ - break; \ - case _MM_SHUFFLE(1, 1, 1, 1): \ - ret = _mm_shuffle_epi32_splat((a), 1); \ - break; \ - case _MM_SHUFFLE(2, 2, 2, 2): \ - ret = _mm_shuffle_epi32_splat((a), 2); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 3): \ - ret = _mm_shuffle_epi32_splat((a), 3); \ - break; \ - default: \ - ret = _mm_shuffle_epi32_default((a), (imm)); \ - break; \ - } \ - ret; \ - }) -#endif - -// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified -// by imm. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) -// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) -#define _mm_shufflelo_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ - int16x4_t lowBits = vget_low_s16(ret); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ - 1); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ - 2); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ - 3); \ - vreinterpretq_m128i_s16(ret); \ - }) - -// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shufflelo_epi16(a, imm) \ - __extension__({ \ - int16x8_t _input = vreinterpretq_s16_m128i(a); \ - int16x8_t _shuf = __builtin_shufflevector( \ - _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ - (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ - vreinterpretq_m128i_s16(_shuf); \ - }) -#else // generic -#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) -#endif - -// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified -// by imm. 
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx -// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) -#define _mm_shufflehi_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ - int16x4_t highBits = vget_high_s16(ret); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ - 5); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ - 6); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ - 7); \ - vreinterpretq_m128i_s16(ret); \ - }) - -// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shufflehi_epi16(a, imm) \ - __extension__({ \ - int16x8_t _input = vreinterpretq_s16_m128i(a); \ - int16x8_t _shuf = __builtin_shufflevector( \ - _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ - (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ - (((imm) >> 6) & 0x3) + 4); \ - vreinterpretq_m128i_s16(_shuf); \ - }) -#else // generic -#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) -#endif - -// Blend packed 16-bit integers from a and b using control mask imm8, and store -// the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[j] -// dst[i+15:i] := b[i+15:i] -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, -// __constrange(0,255) int imm) -#define _mm_blend_epi16(a, b, imm) \ - __extension__({ \ - const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ - uint16x8_t _mask_vec = vld1q_u16(_mask); \ - uint16x8_t _a = vreinterpretq_u16_m128i(a); \ - uint16x8_t _b = vreinterpretq_u16_m128i(b); \ - vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ - }) - -// Blend packed 8-bit integers from a and b using mask, and store the results in -// dst. -// -// FOR j := 0 to 15 -// i := j*8 -// IF mask[i+7] -// dst[i+7:i] := b[i+7:i] -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) -{ - // Use a signed shift right to create a mask with the sign bit - uint8x16_t mask = - vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); - uint8x16_t a = vreinterpretq_u8_m128i(_a); - uint8x16_t b = vreinterpretq_u8_m128i(_b); - return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); -} - -/* Shifts */ - - -// Shift packed 16-bit integers in a right by imm while shifting in sign -// bits, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 -FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) -{ - const int count = (imm & ~15) ? 15 : imm; - return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); -} - -// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// ... 
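// [editor note] Minimal scalar sketch (not part of the file being removed):
// reference for the blendv selection above. Only the sign bit of each mask
// byte matters, which is why the NEON version materialises it with
// vshrq_n_s8(mask, 7) before vbslq_u8. blendv_epi8_ref is a hypothetical
// helper.
#include <stdint.h>

static inline void blendv_epi8_ref(const uint8_t a[16], const uint8_t b[16],
                                   const uint8_t mask[16], uint8_t dst[16])
{
    for (int i = 0; i < 16; ++i)
        dst[i] = (mask[i] & 0x80) ? b[i] : a[i];
}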
-// r7 := a7 << count -// -// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx -#define _mm_slli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) <= 0) { \ - ret = a; \ - } else if ((imm) > 15) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s16( \ - vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ - } \ - ret; \ - }) - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while -// shifting in zeros. : -// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx -// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) -FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) -{ - if (imm <= 0) /* TODO: add constant range macro: [0, 255] */ - return a; - if (imm > 31) /* TODO: add unlikely macro */ - return _mm_setzero_si128(); - return vreinterpretq_m128i_s32( - vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); -} - -// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and -// store the results in dst. -FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) -{ - if (imm <= 0) /* TODO: add constant range macro: [0, 255] */ - return a; - if (imm > 63) /* TODO: add unlikely macro */ - return _mm_setzero_si128(); - return vreinterpretq_m128i_s64( - vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); -} - -// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[7:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 -#define _mm_srli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 16) { \ - ret = vreinterpretq_m128i_u16( \ - vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) - -// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 -// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srli_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 32) { \ - ret = vreinterpretq_m128i_u32( \ - vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) - -// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. 
-// -// FOR j := 0 to 1 -// i := j*64 -// IF imm8[7:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 -#define _mm_srli_epi64(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 64) { \ - ret = vreinterpretq_m128i_u64( \ - vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) - -// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, -// and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) -// ELSE -// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 -// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srai_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 32) { \ - ret = vreinterpretq_m128i_s32( \ - vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ - } else { \ - ret = vreinterpretq_m128i_s32( \ - vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ - } \ - ret; \ - }) - -// Shifts the 128 - bit value in a right by imm bytes while shifting in -// zeros.imm must be an immediate. -// -// r := srl(a, imm*8) -// -// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx -// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm) -#define _mm_srli_si128(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) <= 0) { \ - ret = a; \ - } else if ((imm) > 15) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s8( \ - vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \ - } \ - ret; \ - }) - -// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm -// must be an immediate. -// -// r := a << (imm * 8) -// -// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx -// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) -#define _mm_slli_si128(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) <= 0) { \ - ret = a; \ - } else if ((imm) > 15) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s8(vextq_s8( \ - vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ - } \ - ret; \ - }) - -// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// ... -// r7 := a7 << count -// -// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 15) - return _mm_setzero_si128(); - - int16x8_t vc = vdupq_n_s16((int16_t) c); - return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); -} - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while -// shifting in zeros. 
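// [editor note] Minimal scalar sketch (not part of the file being removed):
// reference for the whole-register byte shifts above, assuming 0 <= imm <= 16.
// _mm_srli_si128 moves bytes towards index 0 and zero-fills the top;
// _mm_slli_si128 is the mirror image, which is why both are built from
// vextq_s8 against a zero vector. The *_ref helpers are hypothetical.
#include <stdint.h>
#include <string.h>

static inline void srli_si128_ref(const uint8_t a[16], int imm, uint8_t dst[16])
{
    memset(dst, 0, 16);
    if (imm < 16)
        memcpy(dst, a + imm, (size_t)(16 - imm)); // dst[i] = a[i + imm]
}

static inline void slli_si128_ref(const uint8_t a[16], int imm, uint8_t dst[16])
{
    memset(dst, 0, 16);
    if (imm < 16)
        memcpy(dst + imm, a, (size_t)(16 - imm)); // dst[i + imm] = a[i]
}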
-// -// r0 := a0 << count -// r1 := a1 << count -// r2 := a2 << count -// r3 := a3 << count -// -// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 31) - return _mm_setzero_si128(); - - int32x4_t vc = vdupq_n_s32((int32_t) c); - return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); -} - -// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// -// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 63) - return _mm_setzero_si128(); - - int64x2_t vc = vdupq_n_s64((int64_t) c); - return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); -} - -// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits -// while shifting in zeros. -// -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// ... -// r7 := srl(a7, count) -// -// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 15) - return _mm_setzero_si128(); - - int16x8_t vc = vdupq_n_s16(-(int16_t) c); - return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); -} - -// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits -// while shifting in zeros. -// -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// r2 := srl(a2, count) -// r3 := srl(a3, count) -// -// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 31) - return _mm_setzero_si128(); - - int32x4_t vc = vdupq_n_s32(-(int32_t) c); - return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); -} - -// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits -// while shifting in zeros. -// -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// -// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 63) - return _mm_setzero_si128(); - - int64x2_t vc = vdupq_n_s64(-(int64_t) c); - return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); -} - -// NEON does not provide a version of this function. -// Creates a 16-bit mask from the most significant bits of the 16 signed or -// unsigned 8-bit integers in a and zero extends the upper bits. -// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_epi8(__m128i a) -{ -#if defined(__aarch64__) - uint8x16_t input = vreinterpretq_u8_m128i(a); - const int8_t ALIGN_STRUCT(16) - xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0}; - const uint8x16_t mask_and = vdupq_n_u8(0x80); - const int8x16_t mask_shift = vld1q_s8(xr); - const uint8x16_t mask_result = - vshlq_u8(vandq_u8(input, mask_and), mask_shift); - uint8x8_t lo = vget_low_u8(mask_result); - uint8x8_t hi = vget_high_u8(mask_result); - - return vaddv_u8(lo) + (vaddv_u8(hi) << 8); -#else - // Use increasingly wide shifts+adds to collect the sign bits - // together. 
- // Since the widening shifts would be rather confusing to follow in little - // endian, everything will be illustrated in big endian order instead. This - // has a different result - the bits would actually be reversed on a big - // endian machine. - - // Starting input (only half the elements are shown): - // 89 ff 1d c0 00 10 99 33 - uint8x16_t input = vreinterpretq_u8_m128i(a); - - // Shift out everything but the sign bits with an unsigned shift right. - // - // Bytes of the vector:: - // 89 ff 1d c0 00 10 99 33 - // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) - // | | | | | | | | - // 01 01 00 01 00 00 01 00 - // - // Bits of first important lane(s): - // 10001001 (89) - // \______ - // | - // 00000001 (01) - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - - // Merge the even lanes together with a 16-bit unsigned shift right + add. - // 'xx' represents garbage data which will be ignored in the final result. - // In the important bytes, the add functions like a binary OR. - // - // 01 01 00 01 00 00 01 00 - // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) - // \| \| \| \| - // xx 03 xx 01 xx 00 xx 02 - // - // 00000001 00000001 (01 01) - // \_______ | - // \| - // xxxxxxxx xxxxxx11 (xx 03) - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - - // Repeat with a wider 32-bit shift + add. - // xx 03 xx 01 xx 00 xx 02 - // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> - // 14)) - // \| \| - // xx xx xx 0d xx xx xx 02 - // - // 00000011 00000001 (03 01) - // \\_____ || - // '----.\|| - // xxxxxxxx xxxx1101 (xx 0d) - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - - // Last, an even wider 64-bit shift + add to get our result in the low 8 bit - // lanes. xx xx xx 0d xx xx xx 02 - // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> - // 28)) - // \| - // xx xx xx xx xx xx xx d2 - // - // 00001101 00000010 (0d 02) - // \ \___ | | - // '---. \| | - // xxxxxxxx 11010010 (xx d2) - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - - // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. - // xx xx xx xx xx xx xx d2 - // || return paired64[0] - // d2 - // Note: Little endian would return the correct value 4b (01001011) instead. - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); -#endif -} - -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 -FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) -{ - return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); -} - -// Copy the 64-bit integer a to the lower element of dst, and zero the upper -// element. -// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 -FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) -{ - return vreinterpretq_m128i_s64( - vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); -} - -// NEON does not provide this method -// Creates a 4-bit mask from the most significant bits of the four -// single-precision, floating-point values. 
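// [editor note] Minimal scalar sketch (not part of the file being removed):
// reference for _mm_movemask_epi8, whose armv7 emulation is walked through
// above -- bit i of the result is the top bit of byte i, and the widening
// shift/add sequence merely folds those 16 sign bits together in-register.
// movemask_epi8_ref is a hypothetical helper.
#include <stdint.h>

static inline int movemask_epi8_ref(const uint8_t a[16])
{
    int mask = 0;
    for (int i = 0; i < 16; ++i)
        mask |= (a[i] >> 7) << i;
    return mask;
}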
-// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_ps(__m128 a) -{ - uint32x4_t input = vreinterpretq_u32_m128(a); -#if defined(__aarch64__) - static const int32x4_t shift = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(input, 31); - return vaddvq_u32(vshlq_u32(tmp, shift)); -#else - // Uses the exact same method as _mm_movemask_epi8, see that for details. - // Shift out everything but the sign bits with a 32-bit unsigned shift - // right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = - vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. - return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); -#endif -} - -// Compute the bitwise NOT of a and then AND with a 128-bit vector containing -// all 1's, and return 1 if the result is zero, otherwise return 0. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones -FORCE_INLINE int _mm_test_all_ones(__m128i a) -{ - return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == - ~(uint64_t) 0; -} - -// Compute the bitwise AND of 128 bits (representing integer data) in a and -// mask, and return 1 if the result is zero, otherwise return 0. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros -FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) -{ - int64x2_t a_and_mask = - vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); - return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 - : 1; -} - -/* Math operations */ - -// Subtracts the four single-precision, floating-point values of a and b. -// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Subtract the lower single-precision (32-bit) floating-point element in b from -// the lower single-precision (32-bit) floating-point element in a, store the -// result in the lower element of dst, and copy the upper 3 packed elements from -// a to the upper elements of dst. -// -// dst[31:0] := a[31:0] - b[31:0] -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss -FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_sub_ps(a, b)); -} - -// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, -// and store the results in dst. -// r0 := a0 - b0 -// r1 := a1 - b1 -FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s64( - vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); -} - -// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or -// unsigned 32-bit integers of a. 
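// [editor note] Minimal scalar sketch (not part of the file being removed):
// reference for _mm_movemask_ps above -- a 4-bit mask assembled from the sign
// bits of the four float lanes. movemask_ps_ref is a hypothetical helper.
#include <stdint.h>
#include <string.h>

static inline int movemask_ps_ref(const float a[4])
{
    int mask = 0;
    for (int i = 0; i < 4; ++i) {
        uint32_t bits;
        memcpy(&bits, &a[i], sizeof bits); // inspect the raw sign bit
        mask |= (int)(bits >> 31) << i;
    }
    return mask;
}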
-// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx -FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. -// -// dst[63:0] := a[63:0] - b[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 -FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) -{ - return vreinterpret_m64_s64( - vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); -} - -// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit -// integers of a and saturates.. -// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx -FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); -} - -// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit -// integers of a and saturates. -// -// r0 := UnsignedSaturate(a0 - b0) -// r1 := UnsignedSaturate(a1 - b1) -// ... -// r15 := UnsignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers -// of a and saturates. -// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r15 := SignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers -// of a and saturates. -// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r7 := SignedSaturate(a7 - b7) -// -// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); -} - -// Negate packed 8-bit integers in a when the corresponding signed -// 8-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. -// -// for i in 0..15 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) -{ - int8x16_t a = vreinterpretq_s8_m128i(_a); - int8x16_t b = vreinterpretq_s8_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 
0xFF : 0 - uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); - - // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) - int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); -#else - int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); -#endif - - // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') - // based on ltMask - int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); - // res = masked & (~zeroMask) - int8x16_t res = vbicq_s8(masked, zeroMask); - - return vreinterpretq_m128i_s8(res); -} - -// Negate packed 16-bit integers in a when the corresponding signed -// 16-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. -// -// for i in 0..7 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) -{ - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFF : 0 - uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); - // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) - int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); -#else - int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); -#endif - - // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative - // 'a') based on ltMask - int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); - // res = masked & (~zeroMask) - int16x8_t res = vbicq_s16(masked, zeroMask); - return vreinterpretq_m128i_s16(res); -} - -// Negate packed 32-bit integers in a when the corresponding signed -// 32-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. -// -// for i in 0..3 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) -{ - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFFFFFF : 0 - uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); - - // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) - int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); -#else - int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); -#endif - - // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative - // 'a') based on ltMask - int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); - // res = masked & (~zeroMask) - int32x4_t res = vbicq_s32(masked, zeroMask); - return vreinterpretq_m128i_s32(res); -} - -// Negate packed 16-bit integers in a when the corresponding signed 16-bit -// integer in b is negative, and store the results in dst. Element in dst are -// zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 3 -// i := j*16 -// IF b[i+15:i] < 0 -// dst[i+15:i] := -(a[i+15:i]) -// ELSE IF b[i+15:i] == 0 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 -FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) -{ - int16x4_t a = vreinterpret_s16_m64(_a); - int16x4_t b = vreinterpret_s16_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 
0xFFFF : 0 - uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); - - // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) - int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); -#else - int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); -#endif - - // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') - // based on ltMask - int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); - // res = masked & (~zeroMask) - int16x4_t res = vbic_s16(masked, zeroMask); - - return vreinterpret_m64_s16(res); -} - -// Negate packed 32-bit integers in a when the corresponding signed 32-bit -// integer in b is negative, and store the results in dst. Element in dst are -// zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 1 -// i := j*32 -// IF b[i+31:i] < 0 -// dst[i+31:i] := -(a[i+31:i]) -// ELSE IF b[i+31:i] == 0 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := a[i+31:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 -FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) -{ - int32x2_t a = vreinterpret_s32_m64(_a); - int32x2_t b = vreinterpret_s32_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFFFFFF : 0 - uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); - - // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) - int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); -#else - int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); -#endif - - // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') - // based on ltMask - int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); - // res = masked & (~zeroMask) - int32x2_t res = vbic_s32(masked, zeroMask); - - return vreinterpret_m64_s32(res); -} - -// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer -// in b is negative, and store the results in dst. Element in dst are zeroed out -// when the corresponding element in b is zero. -// -// FOR j := 0 to 7 -// i := j*8 -// IF b[i+7:i] < 0 -// dst[i+7:i] := -(a[i+7:i]) -// ELSE IF b[i+7:i] == 0 -// dst[i+7:i] := 0 -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 -FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) -{ - int8x8_t a = vreinterpret_s8_m64(_a); - int8x8_t b = vreinterpret_s8_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFF : 0 - uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); - - // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) - int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); -#else - int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); -#endif - - // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') - // based on ltMask - int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); - // res = masked & (~zeroMask) - int8x8_t res = vbic_s8(masked, zeroMask); - - return vreinterpret_m64_s8(res); -} - -// Average packed unsigned 16-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 -FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) -{ - return vreinterpret_m64_u16( - vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); -} - -// Average packed unsigned 8-bit integers in a and b, and store the results in -// dst. 
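// [editor note] Minimal scalar sketch (not part of the file being removed):
// reference for the _mm_sign_* family above. The NEON versions build the
// "b < 0" mask with a signed shift, pick a or -a with vbsl, then clear the
// "b == 0" lanes with vbic. sign_epi8_ref is a hypothetical helper; the same
// pattern holds for the 16-bit, 32-bit, and __m64 variants.
#include <stdint.h>

static inline void sign_epi8_ref(const int8_t a[16], const int8_t b[16],
                                 int8_t dst[16])
{
    for (int i = 0; i < 16; ++i) {
        if (b[i] < 0)
            dst[i] = (int8_t) -a[i]; // -(-128) wraps back to -128, no saturation
        else if (b[i] == 0)
            dst[i] = 0;
        else
            dst[i] = a[i];
    }
}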
-// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 -FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) -{ - return vreinterpret_m64_u8( - vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); -} - -// Average packed unsigned 8-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb -#define _m_pavgb(a, b) _mm_avg_pu8(a, b) - -// Average packed unsigned 16-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw -#define _m_pavgw(a, b) _mm_avg_pu16(a, b) - -// Computes the average of the 16 unsigned 8-bit integers in a and the 16 -// unsigned 8-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r15 := (a15 + b15) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Computes the average of the 8 unsigned 16-bit integers in a and the 8 -// unsigned 16-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r7 := (a7 + b7) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) -{ - return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), - vreinterpretq_u16_m128i(b)); -} - -// Adds the four single-precision, floating-point values of a and b. -// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Add packed double-precision (64-bit) floating-point elements in a and b, and -// store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd -FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); -#else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; - return vld1q_f32((float32_t *) c); -#endif -} - -// Add 64-bit integers a and b, and store the result in dst. -// -// dst[63:0] := a[63:0] + b[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 -FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) -{ - return vreinterpret_m64_s64( - vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); -} - -// adds the scalar single-precision floating point values of a and b. -// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); - float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); - // the upper values in the result must be the remnants of . 
- return vreinterpretq_m128_f32(vaddq_f32(a, value)); -} - -// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s64( - vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); -} - -// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. -// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or -// unsigned 16-bit integers in b. -// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or -// unsigned 8-bit integers in b. -// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) -FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b -// and saturates. -// -// r0 := SignedSaturate(a0 + b0) -// r1 := SignedSaturate(a1 + b1) -// ... -// r7 := SignedSaturate(a7 + b7) -// -// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Add packed signed 8-bit integers in a and b using saturation, and store the -// results in dst. -// -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 -FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in -// b and saturates.. -// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx -FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or -// unsigned 16-bit integers from b. -// -// r0 := (a0 * b0)[15:0] -// r1 := (a1 * b1)[15:0] -// ... -// r7 := (a7 * b7)[15:0] -// -// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or -// unsigned 32-bit integers from b. 
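The saturating adds above (_mm_adds_epi16, _mm_adds_epi8, _mm_adds_epu8) map one-to-one onto NEON's vqaddq family; per lane, signed saturation is just widen, add, clamp. A scalar sketch of the 16-bit case (illustrative helper, not part of the header):

#include <stdint.h>

/* One lane of _mm_adds_epi16 / vqaddq_s16. */
static int16_t adds_lane_s16(int16_t a, int16_t b)
{
    int32_t sum = (int32_t)a + (int32_t)b;  /* widen so the add cannot overflow */
    if (sum > INT16_MAX) return INT16_MAX;  /* clamp high */
    if (sum < INT16_MIN) return INT16_MIN;  /* clamp low  */
    return (int16_t)sum;
}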
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Multiply the packed unsigned 16-bit integers in a and b, producing -// intermediate 32-bit integers, and store the high 16 bits of the intermediate -// integers in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// tmp[31:0] := a[i+15:i] * b[i+15:i] -// dst[i+15:i] := tmp[31:16] -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw -#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) - -// Multiplies the four single-precision, floating-point values of a and b. -// -// r0 := a0 * b0 -// r1 := a1 * b1 -// r2 := a2 * b2 -// r3 := a3 * b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx -FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Multiply the lower single-precision (32-bit) floating-point element in a and -// b, store the result in the lower element of dst, and copy the upper 3 packed -// elements from a to the upper elements of dst. -// -// dst[31:0] := a[31:0] * b[31:0] -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss -FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_mul_ps(a, b)); -} - -// Multiply the low unsigned 32-bit integers from each packed 64-bit element in -// a and b, and store the unsigned 64-bit results in dst. -// -// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) -// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) -FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) -{ - // vmull_u32 upcasts instead of masking, so we downcast. - uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); - uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); - return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); -} - -// Multiply the low unsigned 32-bit integers from a and b, and store the -// unsigned 64-bit result in dst. -// -// dst[63:0] := a[31:0] * b[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 -FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) -{ - return vreinterpret_m64_u64(vget_low_u64( - vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); -} - -// Multiply the low signed 32-bit integers from each packed 64-bit element in -// a and b, and store the signed 64-bit results in dst. -// -// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 -// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 -FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) -{ - // vmull_s32 upcasts instead of masking, so we downcast. - int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); - int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); -} - -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. 
-// -// r0 := (a0 * b0) + (a1 * b1) -// r1 := (a2 * b2) + (a3 * b3) -// r2 := (a4 * b4) + (a5 * b5) -// r3 := (a6 * b6) + (a7 * b7) -// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx -FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) -{ - int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), - vget_low_s16(vreinterpretq_s16_m128i(b))); - int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), - vget_high_s16(vreinterpretq_s16_m128i(b))); - - int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); - int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); - - return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); -} - -// Multiply packed signed 16-bit integers in a and b, producing intermediate -// signed 32-bit integers. Shift right by 15 bits while rounding up, and store -// the packed 16-bit integers in dst. -// -// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) -// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) -// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) -// ... -// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) -FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) -{ - // Has issues due to saturation - // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); - - // Multiply - int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), - vget_low_s16(vreinterpretq_s16_m128i(b))); - int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), - vget_high_s16(vreinterpretq_s16_m128i(b))); - - // Rounding narrowing shift right - // narrow = (int16_t)((mul + 16384) >> 15); - int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); - int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); - - // Join together - return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); -} - -// Vertically multiply each unsigned 8-bit integer from a with the corresponding -// signed 8-bit integer from b, producing intermediate signed 16-bit integers. -// Horizontally add adjacent pairs of intermediate signed 16-bit integers, -// and pack the saturated results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + -// a[i+7:i]*b[i+7:i] ) -// ENDFOR -FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) -{ -#if defined(__aarch64__) - uint8x16_t a = vreinterpretq_u8_m128i(_a); - int8x16_t b = vreinterpretq_s8_m128i(_b); - int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vmovl_s8(vget_low_s8(b))); - int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), - vmovl_s8(vget_high_s8(b))); - return vreinterpretq_m128i_s16( - vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); -#else - // This would be much simpler if x86 would choose to zero extend OR sign - // extend, not both. This could probably be optimized better. - uint16x8_t a = vreinterpretq_u16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - - // Zero extend a - int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); - int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); - - // Sign extend by shifting left then shifting right. - int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); - int16x8_t b_odd = vshrq_n_s16(b, 8); - - // multiply - int16x8_t prod1 = vmulq_s16(a_even, b_even); - int16x8_t prod2 = vmulq_s16(a_odd, b_odd); - - // saturated add - return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); -#endif -} - -// Computes the fused multiple add product of 32-bit floating point numbers. 
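The _mm_mulhrs_epi16 emulation above deliberately does the multiply at 32-bit width and narrows with a rounding shift (vrshrn_n_s32) instead of using vqrdmulhq_s16, whose doubling saturates. Per lane the operation is the following (scalar sketch; assumes arithmetic right shift of negative values, as on mainstream compilers):

#include <stdint.h>

/* One lane of _mm_mulhrs_epi16: 32-bit product, rounding bias, keep bits [30:15]. */
static int16_t mulhrs_lane_s16(int16_t a, int16_t b)
{
    int32_t prod = (int32_t)a * (int32_t)b;
    return (int16_t)((prod + 0x4000) >> 15);  /* same as (int16_t)((mul + 16384) >> 15) */
}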
-// -// Return Value -// Multiplies A and B, and adds C to the temporary result before returning it. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd -FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); -#else - return _mm_add_ps(_mm_mul_ps(a, b), c); -#endif -} - -// Alternatively add and subtract packed single-precision (32-bit) -// floating-point elements in a to/from packed elements in b, and store the -// results in dst. -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps -FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) -{ - __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; - return _mm_fmadd_ps(b, mask, a); -} - -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce two -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of 64-bit elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 -FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) -{ - uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); - uint16_t r0 = t[0] + t[1] + t[2] + t[3]; - uint16_t r4 = t[4] + t[5] + t[6] + t[7]; - uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); - return (__m128i) vsetq_lane_u16(r4, r, 4); -} - -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce four -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 -FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) -{ - uint16x4_t t = - vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); - uint16_t r0 = t[0] + t[1] + t[2] + t[3]; - return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); -} - -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce four -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of dst. -// -// FOR j := 0 to 7 -// i := j*8 -// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -// ENDFOR -// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + -// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw -#define _m_psadbw(a, b) _mm_sad_pu8(a, b) - -// Divides the four single-precision, floating-point values of a and b. -// -// r0 := a0 / b0 -// r1 := a1 / b1 -// r2 := a2 / b2 -// r3 := a3 / b3 -// -// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b)); - float32x4_t recip1 = - vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1)); -#endif -} - -// Divides the scalar single-precision floating point value of a by b. 
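On ARMv7 the _mm_div_ps path above has no vector divide, so it refines a reciprocal estimate: vrecpeq_f32 gives a rough 1/b, and vrecpsq_f32(x, b) returns (2 - b*x), i.e. the factor for one Newton-Raphson step x' = x*(2 - b*x), which roughly doubles the number of correct bits. A scalar sketch of that step (names are illustrative):

/* One Newton-Raphson refinement of a reciprocal estimate x0 ~ 1/d. */
static float recip_refine(float d, float x0)
{
    return x0 * (2.0f - d * x0);  /* x0 * vrecpsq_f32(x0, d) */
}

/* ARMv7 _mm_div_ps above is then, per lane: a * recip_refine(b, estimate_of(1/b)). */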
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) -{ - float32_t value = - vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); -} - -// Computes the approximations of reciprocals of the four single-precision, -// floating-point values of a. -// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) -{ - float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); - return vreinterpretq_m128_f32(recip); -} - -// Compute the approximate reciprocal of the lower single-precision (32-bit) -// floating-point element in a, store the result in the lower element of dst, -// and copy the upper 3 packed elements from a to the upper elements of dst. The -// maximum relative error for this approximation is less than 1.5*2^-12. -// -// dst[31:0] := (1.0 / a[31:0]) -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss -FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) -{ - return _mm_move_ss(a, _mm_rcp_ps(a)); -} - -// Computes the approximations of square roots of the four single-precision, -// floating-point values of a. First computes reciprocal square roots and then -// reciprocals of the four values. -// -// r0 := sqrt(a0) -// r1 := sqrt(a1) -// r2 := sqrt(a2) -// r3 := sqrt(a3) -// -// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); -#else - float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); - float32x4_t sq = vrecpeq_f32(recipsq); - // ??? use step versions of both sqrt and recip for better accuracy? - return vreinterpretq_m128_f32(sq); -#endif -} - -// Computes the approximation of the square root of the scalar single-precision -// floating point value of in. -// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) -{ - float32_t value = - vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); -} - -// Computes the approximations of the reciprocal square roots of the four -// single-precision floating point values of in. -// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) -{ - return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in))); -} - -// Compute the approximate reciprocal square root of the lower single-precision -// (32-bit) floating-point element in a, store the result in the lower element -// of dst, and copy the upper 3 packed elements from a to the upper elements of -// dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss -FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) -{ - return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); -} - -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. 
-// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 -FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) -{ - return vreinterpret_m64_s16( - vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); -} - -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 -#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) - -// Computes the maximums of the four single-precision, floating-point values of -// a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) -{ -#if SSE2NEON_PRECISE_MINMAX - float32x4_t _a = vreinterpretq_f32_m128(a); - float32x4_t _b = vreinterpretq_f32_m128(b); - return vbslq_f32(vcltq_f32(_b, _a), _a, _b); -#else - return vreinterpretq_m128_f32( - vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#endif -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed maximum -// values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 -FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) -{ - return vreinterpret_m64_u8( - vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed maximum -// values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 -#define _m_pmaxub(a, b) _mm_max_pu8(a, b) - -// Compare packed signed 16-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 -FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) -{ - return vreinterpret_m64_s16( - vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); -} - -// Compare packed signed 16-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 -#define _m_pminsw(a, b) _mm_min_pi16(a, b) - -// Computes the minima of the four single-precision, floating-point values of a -// and b. -// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) -{ -#if SSE2NEON_PRECISE_MINMAX - float32x4_t _a = vreinterpretq_f32_m128(a); - float32x4_t _b = vreinterpretq_f32_m128(b); - return vbslq_f32(vcltq_f32(_a, _b), _a, _b); -#else - return vreinterpretq_m128_f32( - vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#endif -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed minimum -// values in dst. 
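The SSE2NEON_PRECISE_MINMAX paths above avoid vmaxq_f32/vminq_f32 because x86 maxps/minps return the second operand whenever the comparison is false, including when either input is NaN (and for signed zeros), which NEON's max/min do not reproduce. Per lane the x86 behaviour is simply (scalar sketch, illustrative helpers):

/* Comparison is false for NaN, so b falls through, matching x86 maxps/minps. */
static float max_ps_lane(float a, float b) { return (b < a) ? a : b; }
static float min_ps_lane(float a, float b) { return (a < b) ? a : b; }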
-// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 -FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) -{ - return vreinterpret_m64_u8( - vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 -#define _m_pminub(a, b) _mm_min_pu8(a, b) - -// Computes the maximum of the two lower scalar single-precision floating point -// values of a and b. -// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) -{ - float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); -} - -// Computes the minimum of the two lower scalar single-precision floating point -// values of a and b. -// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) -{ - float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); -} - -// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. -// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. -// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx -FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Compare packed signed 8-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 -FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// epi versions of min/max -// Computes the pariwise maximums of the four signed 32-bit integer values of a -// and b. -// -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 > b0) ? a0 : b0 -// r1 := (a1 > b1) ? a1 : b1 -// r2 := (a2 > b2) ? a2 : b2 -// r3 := (a3 > b3) ? 
a3 : b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Computes the pariwise minima of the four signed 32-bit integer values of a -// and b. -// -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 < b0) ? a0 : b0 -// r1 := (a1 < b1) ? a1 : b1 -// r2 := (a2 < b2) ? a2 : b2 -// r3 := (a3 < b3) ? a3 : b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compare packed unsigned 32-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 -FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); -} - -// Compare packed unsigned 32-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 -FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); -} - -// Multiply the packed unsigned 16-bit integers in a and b, producing -// intermediate 32-bit integers, and store the high 16 bits of the intermediate -// integers in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 -FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) -{ - return vreinterpret_m64_u16(vshrn_n_u32( - vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); -} - -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. -// -// r0 := (a0 * b0)[31:16] -// r1 := (a1 * b1)[31:16] -// ... -// r7 := (a7 * b7)[31:16] -// -// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) -{ - /* FIXME: issue with large values because of result saturation */ - // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), - // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return - // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); - int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ - int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); - int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); - return vreinterpretq_m128i_u16(r.val[1]); -} - -// Computes pairwise add of each argument as single-precision, floating-point -// values a and b. 
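The _mm_mulhi_epi16 path above avoids vqdmulhq_s16 (its doubling saturates for -32768 * -32768, the issue the FIXME notes) and instead forms full 32-bit products with vmull_s16 and de-interleaves the high halves with vuzpq_u16. Per lane the intended result is just the high 16 bits of the product (scalar sketch; assumes arithmetic right shift of negative values):

#include <stdint.h>

/* One lane of _mm_mulhi_epi16. */
static int16_t mulhi_lane_s16(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}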
-// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx -FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32( - vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); -#endif -} - -// Computes pairwise add of each argument as a 16-bit signed or unsigned integer -// values a and b. -FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) -{ - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) - return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); -#else - return vreinterpretq_m128i_s16( - vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), - vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); -#endif -} - -// Horizontally substract adjacent pairs of single-precision (32-bit) -// floating-point elements in a and b, and pack the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps -FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vsubq_f32( - vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), - vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); -#else - float32x4x2_t c = - vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); - return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); -#endif -} - -// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the -// signed 16-bit results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 -FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) -{ - return vreinterpret_m64_s16( - vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); -} - -// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the -// signed 32-bit results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 -FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) -{ - return vreinterpret_m64_s32( - vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); -} - -// Computes pairwise difference of each argument as a 16-bit signed or unsigned -// integer values a and b. -FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) -{ - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Subtract - return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); -} - -// Computes saturated pairwise sub of each argument as a 16-bit signed -// integer values a and b. 
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) -{ -#if defined(__aarch64__) - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - return vreinterpretq_s64_s16( - vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); -#else - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Saturated add - return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); -#endif -} - -// Computes saturated pairwise difference of each argument as a 16-bit signed -// integer values a and b. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 -FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) -{ -#if defined(__aarch64__) - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - return vreinterpretq_s64_s16( - vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); -#else - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Saturated subtract - return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); -#endif -} - -// Computes pairwise add of each argument as a 32-bit signed or unsigned integer -// values a and b. -FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) -{ - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - return vreinterpretq_m128i_s32( - vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), - vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); -} - -// Computes pairwise difference of each argument as a 32-bit signed or unsigned -// integer values a and b. -FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) -{ - int64x2_t a = vreinterpretq_s64_m128i(_a); - int64x2_t b = vreinterpretq_s64_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|b0|b2] - // [a1|a2|b1|b3] - int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); - int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); - // Subtract - return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); -} - -// Kahan summation for accurate summation of floating-point numbers. -// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html -FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y) -{ - y -= *c; - float t = *sum + y; - *c = (t - *sum) - y; - *sum = t; -} - -// Conditionally multiply the packed single-precision (32-bit) floating-point -// elements in a and b using the high 4 bits in imm8, sum the four products, -// and conditionally store the sum in dst using the low 4 bits of imm. 
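sse2neon_kadd_f32 above is one step of Kahan (compensated) summation, and the _mm_dp_ps emulation that follows uses it so the masked products are accumulated with less rounding error than a plain sum. Applied over an array, the same update looks like this (illustrative helper, not part of the header; note that aggressive fast-math optimization can defeat the compensation):

/* Kahan-compensated sum, using the same update as sse2neon_kadd_f32. */
static float kahan_sum(const float *v, int n)
{
    float sum = 0.0f, c = 0.0f;      /* c accumulates the lost low-order bits */
    for (int i = 0; i < n; ++i) {
        float y = v[i] - c;
        float t = sum + y;
        c = (t - sum) - y;           /* what rounding discarded when forming t */
        sum = t;
    }
    return sum;
}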
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps -FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) -{ -#if defined(__aarch64__) - /* shortcuts */ - if (imm == 0xFF) { - return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); - } - if (imm == 0x7F) { - float32x4_t m = _mm_mul_ps(a, b); - m[3] = 0; - return _mm_set1_ps(vaddvq_f32(m)); - } -#endif - - float s = 0, c = 0; - float32x4_t f32a = vreinterpretq_f32_m128(a); - float32x4_t f32b = vreinterpretq_f32_m128(b); - - /* To improve the accuracy of floating-point summation, Kahan algorithm - * is used for each operation. - */ - if (imm & (1 << 4)) - sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); - if (imm & (1 << 5)) - sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); - if (imm & (1 << 6)) - sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); - if (imm & (1 << 7)) - sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); - s += c; - - float32x4_t res = { - (imm & 0x1) ? s : 0, - (imm & 0x2) ? s : 0, - (imm & 0x4) ? s : 0, - (imm & 0x8) ? s : 0, - }; - return vreinterpretq_m128_f32(res); -} - -/* Compare operations */ - -// Compares for less than -// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for less than -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) -FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmplt_ps(a, b)); -} - -// Compares for greater than. -// -// r0 := (a0 > b0) ? 0xffffffff : 0x0 -// r1 := (a1 > b1) ? 0xffffffff : 0x0 -// r2 := (a2 > b2) ? 0xffffffff : 0x0 -// r3 := (a3 > b3) ? 0xffffffff : 0x0 -// -// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) -FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); -} - -// Compares for greater than or equal. -// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) -FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpge_ps(a, b)); -} - -// Compares for less than or equal. -// -// r0 := (a0 <= b0) ? 0xffffffff : 0x0 -// r1 := (a1 <= b1) ? 0xffffffff : 0x0 -// r2 := (a2 <= b2) ? 0xffffffff : 0x0 -// r3 := (a3 <= b3) ? 0xffffffff : 0x0 -// -// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) -FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmple_ps(a, b)); -} - -// Compares for equality. 
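For reference, the imm8 handling in _mm_dp_ps above follows the Intel pseudocode: bits 7:4 select which products contribute to the sum, and bits 3:0 select which output lanes receive it (the rest are zeroed). A scalar model (hypothetical helper name):

/* Scalar model of _mm_dp_ps semantics. */
static void dp_ps_scalar(const float a[4], const float b[4], int imm8, float dst[4])
{
    float sum = 0.0f;
    for (int j = 0; j < 4; ++j)
        if (imm8 & (1 << (4 + j)))     /* high nibble: product mask */
            sum += a[j] * b[j];
    for (int j = 0; j < 4; ++j)
        dst[j] = (imm8 & (1 << j)) ? sum : 0.0f;  /* low nibble: broadcast mask */
}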
-// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for equality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) -FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); -} - -// Compares for inequality. -// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32(vmvnq_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); -} - -// Compares for inequality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) -FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); -} - -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) -{ - return _mm_cmplt_ps(a, b); -} - -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) -{ - return _mm_cmplt_ss(a, b); -} - -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) -FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) -{ - return _mm_cmple_ps(a, b); -} - -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) -FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) -{ - return _mm_cmple_ss(a, b); -} - -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) -{ - return _mm_cmpgt_ps(a, b); -} - -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) -{ - return _mm_cmpgt_ss(a, b); -} - -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) -{ - return _mm_cmpge_ps(a, b); -} - -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) -{ - return _mm_cmpge_ss(a, b); -} - -// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or -// unsigned 8-bit integers in b for equality. -// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx -FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or -// unsigned 16-bit integers in b for equality. 
-// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Compare packed 32-bit integers in a and b for equality, and store the results -// in dst -FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compare packed 64-bit integers in a and b for equality, and store the results -// in dst -FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_u64( - vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); -#else - // ARMv7 lacks vceqq_u64 - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); -#endif -} - -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for lesser than. -// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx -FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for greater than. -// -// r0 := (a0 > b0) ? 0xff : 0x0 -// r1 := (a1 > b1) ? 0xff : 0x0 -// ... -// r15 := (a15 > b15) ? 0xff : 0x0 -// -// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for less than. -// -// r0 := (a0 < b0) ? 0xffff : 0x0 -// r1 := (a1 < b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 < b7) ? 0xffff : 0x0 -// -// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for greater than. -// -// r0 := (a0 > b0) ? 0xffff : 0x0 -// r1 := (a1 > b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 > b7) ? 0xffff : 0x0 -// -// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for less than. -// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for greater than. 
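The ARMv7 fallback in _mm_cmpeq_epi64 above has no 64-bit vector compare to lean on, so it compares 32-bit halves and ANDs each half with its swapped neighbour (vrev64q_u32): a 64-bit lane is all-ones only if both of its 32-bit halves matched. Scalar equivalent (illustrative helper):

#include <stdint.h>

/* 64-bit equality mask built from two 32-bit equality masks. */
static uint64_t cmpeq64_from_32(uint64_t a, uint64_t b)
{
    uint32_t lo = ((uint32_t)a == (uint32_t)b) ? 0xFFFFFFFFu : 0;
    uint32_t hi = ((uint32_t)(a >> 32) == (uint32_t)(b >> 32)) ? 0xFFFFFFFFu : 0;
    uint32_t both = lo & hi;                  /* vandq_u32(cmp, vrev64q_u32(cmp)) */
    return ((uint64_t)both << 32) | both;     /* all-ones or all-zeros */
}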
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers -// in b for greater than. -FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_u64( - vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); -#else - // ARMv7 lacks vcgtq_s64. - // This is based off of Clang's SSE2 polyfill: - // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) - - // Mask the sign bit out since we need a signed AND an unsigned comparison - // and it is ugly to try and split them. - int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); - int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); - int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); - // Check if a > b - int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); - // Copy upper mask to lower mask - // a_hi > b_hi - int64x2_t gt_hi = vshrq_n_s64(greater, 63); - // Copy lower mask to upper mask - // a_lo > b_lo - int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); - // Compare for equality - int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); - // Copy upper mask to lower mask - // a_hi == b_hi - int64x2_t eq_hi = vshrq_n_s64(equal, 63); - // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) - int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); - return vreinterpretq_m128i_s64(ret); -#endif -} - -// Compares the four 32-bit floats in a and b to check if any values are NaN. -// Ordered compare between each value returns true for "orderable" and false for -// "not orderable" (NaN). -// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see -// also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) -{ - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - uint32x4_t ceqaa = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t ceqbb = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); -} - -// Compares for ordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) -FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpord_ps(a, b)); -} - -// Compares for unordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) -FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) -{ - uint32x4_t f32a = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t f32b = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); -} - -// Compares for unordered. 
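NEON has no ordered/unordered compare, so _mm_cmpord_ps and _mm_cmpunord_ps above rely on the fact that NaN never compares equal to itself. Per lane (scalar sketch, illustrative helpers):

/* Ordered iff neither operand is NaN; a self-compare detects NaN. */
static int is_ordered(float a, float b)   { return (a == a) && (b == b); }
static int is_unordered(float a, float b) { return !((a == a) && (b == b)); }

The _mm_comi*_ss emulations that follow use the same idea, ANDing the not-NaN masks with the actual comparison so a NaN operand forces a 0 result.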
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) -FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a less than operation. : -// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important -// note!! The documentation on MSDN is incorrect! If either of the values is a -// NAN the docs say you will get a one, but in fact, it will return a zero!! -FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) -{ - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_lt_b = - vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than operation. : -// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = - vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a less than or equal operation. : -// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_le_b = - vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than or equal operation. : -// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = - vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using an equality operation. 
: -// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_eq_b = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using an inequality operation. : -// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) -{ - // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_neq_b = vmvnq_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; -} - -// according to the documentation, these intrinsics behave the same as the -// non-'u' versions. We'll just alias them here. -#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomineq_ss _mm_comineq_ss - -/* Conversions */ - -// Convert packed signed 32-bit integers in b to packed single-precision -// (32-bit) floating-point elements, store the results in the lower 2 elements -// of dst, and copy the upper 2 packed elements from a to the upper elements of -// dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps -FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), - vget_high_f32(vreinterpretq_f32_m128(a)))); -} - -// Convert the signed 32-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss -FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) -{ - __m128 ret = a; - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0)); -} - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer, and store the result in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si -FORCE_INLINE int _mm_cvt_ss2si(__m128 a) -{ -#if defined(__aarch64__) - return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); -#else - float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t diff = data - floor(data); - if (diff > 0.5) - return (int32_t) ceil(data); - if (diff == 0.5) { - int32_t f = (int32_t) floor(data); - int32_t c = (int32_t) ceil(data); - return c & 1 ? f : c; - } - return (int32_t) floor(data); -#endif -} - -// Convert packed 16-bit integers in a to packed single-precision (32-bit) -// floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps -FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) -{ - return vreinterpretq_m128_f32( - vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); -} - -// Convert packed 32-bit integers in b to packed single-precision (32-bit) -// floating-point elements, store the results in the lower 2 elements of dst, -// and copy the upper 2 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps -FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), - vget_high_f32(vreinterpretq_f32_m128(a)))); -} - -// Convert packed signed 32-bit integers in a to packed single-precision -// (32-bit) floating-point elements, store the results in the lower 2 elements -// of dst, then covert the packed signed 32-bit integers in b to -// single-precision (32-bit) floating-point element, and store the results in -// the upper 2 elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) -// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps -FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32( - vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); -} - -// Convert the lower packed 8-bit integers in a to packed single-precision -// (32-bit) floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps -FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32( - vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); -} - -// Convert packed unsigned 16-bit integers in a to packed single-precision -// (32-bit) floating-point elements, and store the results in dst. 
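SSE's default rounding mode is round-to-nearest-even, which the ARMv7 fallback of _mm_cvt_ss2si above reproduces by hand (AArch64 simply uses vcvtnq_s32_f32). The same tie-to-even logic as a standalone helper (illustrative sketch; ignores out-of-range inputs):

#include <stdint.h>
#include <math.h>

/* Round to nearest, ties to even. */
static int32_t round_half_even(float x)
{
    float diff = x - floorf(x);
    if (diff > 0.5f) return (int32_t)ceilf(x);
    if (diff == 0.5f) {                    /* tie: pick the even integer */
        int32_t f = (int32_t)floorf(x);
        int32_t c = (int32_t)ceilf(x);
        return (c & 1) ? f : c;
    }
    return (int32_t)floorf(x);
}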
-// -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps -FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) -{ - return vreinterpretq_m128_f32( - vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); -} - -// Convert the lower packed unsigned 8-bit integers in a to packed -// single-precision (32-bit) floating-point elements, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps -FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_u32( - vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); -} - -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values using truncate. -// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) -{ - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); -} - -// Converts the four signed 32-bit integer values of a to single-precision, -// floating-point values -// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); -} - -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - return vreinterpretq_m128i_u16(u16x8); -} - -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_m128i_u32(u32x4); -} - -// Converts the two unsigned 8-bit integers in the lower 16 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_u64(u64x2); -} - -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) -{ - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - return vreinterpretq_m128i_s16(s16x8); -} - -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. 
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) -{ - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_m128i_s32(s32x4); -} - -// Converts the two signed 8-bit integers in the lower 32 bits to four -// signed 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) -{ - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_s64(s64x2); -} - -// Converts the four signed 16-bit integers in the lower 64 bits to four signed -// 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) -{ - return vreinterpretq_m128i_s32( - vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); -} - -// Converts the two signed 16-bit integers in the lower 32 bits two signed -// 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) -{ - int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_s64(s64x2); -} - -// Converts the four unsigned 16-bit integers in the lower 64 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) -{ - return vreinterpretq_m128i_u32( - vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); -} - -// Converts the two unsigned 16-bit integers in the lower 32 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) -{ - uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_u64(u64x2); -} - -// Converts the two unsigned 32-bit integers in the lower 64 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) -{ - return vreinterpretq_m128i_u64( - vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); -} - -// Converts the two signed 32-bit integers in the lower 64 bits to two signed -// 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) -{ - return vreinterpretq_m128i_s64( - vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); -} - -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values. -// -// r0 := (int) a0 -// r1 := (int) a1 -// r2 := (int) a2 -// r3 := (int) a3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A -// does not support! It is supported on ARMv8-A however. 
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); -#else - uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = - vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ - int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( - vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ - int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), - vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ - float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ - uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); -#endif -} - -// Copy the lower 32-bit integer in a to dst. -// -// dst[31:0] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) -{ - return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); -} - -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 -FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) -{ - return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); -} - -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x -#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) - -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, -// zero extending the upper bits. -// -// r0 := a -// r1 := 0x0 -// r2 := 0x0 -// r3 := 0x0 -// -// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) -{ - return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); -} - -// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, -// zero extending the upper bits. -// -// r0 := a -// r1 := 0x0 -FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) -{ - return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); -} - -// Cast vector of type __m128 to type __m128d. This intrinsic is only used for -// compilation and does not generate any instructions, thus it has zero latency. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd -FORCE_INLINE __m128d _mm_castps_pd(__m128 a) -{ - return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); -} - -// Applies a type cast to reinterpret four 32-bit floating point values passed -// in as a 128-bit parameter as packed 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb514099.aspx -FORCE_INLINE __m128i _mm_castps_si128(__m128 a) -{ - return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); -} - -// Applies a type cast to reinterpret four 32-bit integers passed in as a -// 128-bit parameter as packed 32-bit floating point values. -// https://msdn.microsoft.com/en-us/library/bb514029.aspx -FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) -{ - return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); -} - -// Loads 128-bit value. 
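// A minimal sketch (not from the original header) contrasting the two
// float->int conversions defined above, assuming standard SSE semantics:
// _mm_cvttps_epi32 truncates toward zero, while _mm_cvtps_epi32 uses the SSE
// default round-to-nearest-even mode that the ARMv7-A fallback above emulates.
//
//   __m128 v = _mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f); // lanes low->high: -0.5, 0.5, 1.5, 2.5
//   _mm_cvttps_epi32(v);  // -> {0, 0, 1, 2}  (truncate toward zero)
//   _mm_cvtps_epi32(v);   // -> {0, 0, 2, 2}  (ties round to the even integer)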
: -// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx -FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); -} - -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd -FORCE_INLINE __m128d _mm_load1_pd(const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); -#else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); -#endif -} - -// Load a double-precision (64-bit) floating-point element from memory into the -// upper element of dst, and copy the lower element from a to dst. mem_addr does -// not need to be aligned on any particular boundary. -// -// dst[63:0] := a[63:0] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd -FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); -#else - return vreinterpretq_m128d_f32(vcombine_f32( - vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); -#endif -} - -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 -#define _mm_load_pd1 _mm_load1_pd - -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd -#define _mm_loaddup_pd _mm_load1_pd - -// Loads 128-bit value. : -// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); -} - -// Load unaligned 32-bit integer from memory into the first element of dst. -// -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[MAX:32] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 -FORCE_INLINE __m128i _mm_loadu_si32(const void *p) -{ - return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); -} - -// Convert packed double-precision (64-bit) floating-point elements in a to -// packed single-precision (32-bit) floating-point elements, and store the -// results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// k := 64*j -// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) -// ENDFOR -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps -FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) -{ -#if defined(__aarch64__) - float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); - return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); -#else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); -#endif -} - -// Copy the lower double-precision (64-bit) floating-point element of a to dst. 
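// A small usage sketch (an assumption, not from the original header) for the
// unaligned scalar load above: _mm_loadu_si32 reads one 32-bit value and
// zero-extends it into the low lane of the vector.
//
//   int32_t x = 42;
//   __m128i v = _mm_loadu_si32(&x);  // lanes low->high: {42, 0, 0, 0}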
-// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 -FORCE_INLINE double _mm_cvtsd_f64(__m128d a) -{ -#if defined(__aarch64__) - return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); -#else - return ((double *) &a)[0]; -#endif -} - -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed double-precision (64-bit) floating-point elements, and store the -// results in dst. -// -// FOR j := 0 to 1 -// i := 64*j -// k := 32*j -// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd -FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); -#else - double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); - return _mm_set_pd(a1, a0); -#endif -} - -// Cast vector of type __m128d to type __m128i. This intrinsic is only used for -// compilation and does not generate any instructions, thus it has zero latency. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 -FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) -{ - return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); -} - -// Blend packed single-precision (32-bit) floating-point elements from a and b -// using mask, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps -FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask) -{ - return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); -} - -// Round the packed single-precision (32-bit) floating-point elements in a using -// the rounding parameter, and store the results as packed single-precision -// floating-point elements in dst. 
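// A small usage sketch (an assumption, not from the original header) for the
// blend above. _mm_blendv_ps is typically fed a full compare mask, in which
// case the per-bit NEON select used here behaves like the SSE blend. Example
// that zeroes the negative lanes of x:
//
//   __m128 x    = _mm_set_ps(3.0f, -1.0f, 2.0f, -4.0f);
//   __m128 mask = _mm_cmplt_ps(x, _mm_setzero_ps());        // all-ones where x < 0
//   __m128 r    = _mm_blendv_ps(x, _mm_setzero_ps(), mask); // take 0 where masked
//   // r lanes low->high: {0, 2, 0, 3}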
-// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps -FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) -{ -#if defined(__aarch64__) - switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); - default: //_MM_FROUND_CUR_DIRECTION - return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); - } -#else - float *v_float = (float *) &a; - __m128 zero, neg_inf, pos_inf; - - switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): - return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): - return (__m128){floorf(v_float[0]), floorf(v_float[1]), - floorf(v_float[2]), floorf(v_float[3])}; - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): - return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), - ceilf(v_float[3])}; - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): - zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); - neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), - floorf(v_float[2]), floorf(v_float[3])); - pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), - ceilf(v_float[2]), ceilf(v_float[3])); - return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); - default: //_MM_FROUND_CUR_DIRECTION - return (__m128){roundf(v_float[0]), roundf(v_float[1]), - roundf(v_float[2]), roundf(v_float[3])}; - } -#endif -} - -// Round the packed single-precision (32-bit) floating-point elements in a up to -// an integer value, and store the results as packed single-precision -// floating-point elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps -FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) -{ - return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); -} - -// Round the packed single-precision (32-bit) floating-point elements in a down -// to an integer value, and store the results as packed single-precision -// floating-point elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps -FORCE_INLINE __m128 _mm_floor_ps(__m128 a) -{ - return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); -} - - -// Load 128-bits of integer data from unaligned memory into dst. This intrinsic -// may perform better than _mm_loadu_si128 when the data crosses a cache line -// boundary. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 -#define _mm_lddqu_si128 _mm_loadu_si128 - -/* Miscellaneous Operations */ - -// Shifts the 8 signed 16-bit integers in a right by count bits while shifting -// in the sign bit. -// -// r0 := a0 >> count -// r1 := a1 >> count -// ... 
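// A minimal sketch (an assumption, not from the original header) of how the
// rounding wrappers above relate to _mm_round_ps:
//
//   __m128 v = _mm_set1_ps(1.25f);
//   _mm_ceil_ps(v);   // same as _mm_round_ps(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) -> 2.0f per lane
//   _mm_floor_ps(v);  // same as _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) -> 1.0f per lane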
-// r7 := a7 >> count -// -// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) -{ - int64_t c = (int64_t) vget_low_s64((int64x2_t) count); - if (c > 15) - return _mm_cmplt_epi16(a, _mm_setzero_si128()); - return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); -} - -// Shifts the 4 signed 32-bit integers in a right by count bits while shifting -// in the sign bit. -// -// r0 := a0 >> count -// r1 := a1 >> count -// r2 := a2 >> count -// r3 := a3 >> count -// -// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx -FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) -{ - int64_t c = (int64_t) vget_low_s64((int64x2_t) count); - if (c > 31) - return _mm_cmplt_epi32(a, _mm_setzero_si128()); - return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-(int32_t)c))); -} - -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and -// saturates. -// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), - vqmovn_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned -// integers and saturates. -// -// r0 := UnsignedSaturate(a0) -// r1 := UnsignedSaturate(a1) -// ... -// r7 := UnsignedSaturate(a7) -// r8 := UnsignedSaturate(b0) -// r9 := UnsignedSaturate(b1) -// ... -// r15 := UnsignedSaturate(b7) -// -// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) -{ - return vreinterpretq_m128i_u8( - vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), - vqmovun_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers -// and saturates. -// -// r0 := SignedSaturate(a0) -// r1 := SignedSaturate(a1) -// r2 := SignedSaturate(a2) -// r3 := SignedSaturate(a3) -// r4 := SignedSaturate(b0) -// r5 := SignedSaturate(b1) -// r6 := SignedSaturate(b2) -// r7 := SignedSaturate(b3) -// -// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), - vqmovn_s32(vreinterpretq_s32_m128i(b)))); -} - -// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit -// integers and saturates. -// -// r0 := UnsignedSaturate(a0) -// r1 := UnsignedSaturate(a1) -// r2 := UnsignedSaturate(a2) -// r3 := UnsignedSaturate(a3) -// r4 := UnsignedSaturate(b0) -// r5 := UnsignedSaturate(b1) -// r6 := UnsignedSaturate(b2) -// r7 := UnsignedSaturate(b3) -FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), - vqmovun_s32(vreinterpretq_s32_m128i(b)))); -} - -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower -// 8 signed or unsigned 8-bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// ... 
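// A small sketch (an assumption, not from the original header) showing the
// saturation behavior of the pack intrinsics above:
//
//   __m128i a = _mm_set1_epi16(300);
//   _mm_packus_epi16(a, a);  // every byte saturates to 255 (unsigned 8-bit range)
//   _mm_packs_epi16(a, a);   // every byte saturates to 127 (signed 8-bit range)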
-// r14 := a7 -// r15 := b7 -// -// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s8( - vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -#else - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -#endif -} - -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the -// lower 4 signed or unsigned 16-bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// r4 := a2 -// r5 := b2 -// r6 := a3 -// r7 := b3 -// -// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s16( - vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -#else - int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -#endif -} - -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the -// lower 2 signed or unsigned 32 - bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s32( - vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -#else - int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -#endif -} - -FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) -{ - int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); - int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); -} - -// Selects and interleaves the lower two single-precision, floating-point values -// from a and b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -#endif -} - -// Selects and interleaves the upper two single-precision, floating-point values -// from a and b. 
-// -// r0 := a2 -// r1 := b2 -// r2 := a3 -// r3 := b3 -// -// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper -// 8 signed or unsigned 8-bit integers in b. -// -// r0 := a8 -// r1 := b8 -// r2 := a9 -// r3 := b9 -// ... -// r14 := a15 -// r15 := b15 -// -// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s8( - vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -#else - int8x8_t a1 = - vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = - vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the -// upper 4 signed or unsigned 16-bit integers in b. -// -// r0 := a4 -// r1 := b4 -// r2 := a5 -// r3 := b5 -// r4 := a6 -// r5 := b6 -// r6 := a7 -// r7 := b7 -// -// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s16( - vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -#else - int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the -// upper 2 signed or unsigned 32-bit integers in b. -// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s32( - vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -#else - int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper signed or unsigned 64-bit integer in a with the -// upper signed or unsigned 64-bit integer in b. -// -// r0 := a1 -// r1 := b1 -FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) -{ - int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); - int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); -} - -// Horizontally compute the minimum amongst the packed unsigned 16-bit integers -// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
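// A minimal sketch (an assumption, not from the original header) of the
// interleave pattern shared by the unpack intrinsics above, using 32-bit
// lanes listed low->high:
//
//   __m128i a = _mm_set_epi32(3, 2, 1, 0);   // a = {0, 1, 2, 3}
//   __m128i b = _mm_set_epi32(7, 6, 5, 4);   // b = {4, 5, 6, 7}
//   _mm_unpacklo_epi32(a, b);  // -> {0, 4, 1, 5}
//   _mm_unpackhi_epi32(a, b);  // -> {2, 6, 3, 7}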
-// -// index[2:0] := 0 -// min[15:0] := a[15:0] -// FOR j := 0 to 7 -// i := j*16 -// IF a[i+15:i] < min[15:0] -// index[2:0] := j -// min[15:0] := a[i+15:i] -// FI -// ENDFOR -// dst[15:0] := min[15:0] -// dst[18:16] := index[2:0] -// dst[127:19] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 -FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) -{ - __m128i dst; - uint16_t min, idx = 0; - // Find the minimum value -#if defined(__aarch64__) - min = vminvq_u16(vreinterpretq_u16_m128i(a)); -#else - __m64 tmp; - tmp = vreinterpret_m64_u16( - vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), - vget_high_u16(vreinterpretq_u16_m128i(a)))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); -#endif - // Get the index of the minimum value - int i; - for (i = 0; i < 8; i++) { - if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { - idx = (uint16_t) i; - break; - } - a = _mm_srli_si128(a, 2); - } - // Generate result - dst = _mm_setzero_si128(); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); - return dst; -} - -// shift to right -// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx -// http://blog.csdn.net/hemmingway/article/details/44828303 -// Clang requires a macro here, as it is extremely picky about c being a -// literal. -#define _mm_alignr_epi8(a, b, c) \ - ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c))) - -// Compute the bitwise AND of 128 bits (representing integer data) in a and b, -// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the -// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, -// otherwise set CF to 0. Return the CF value. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 -FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) -{ - int64x2_t s64 = - vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), - vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); -} - -// Compute the bitwise AND of 128 bits (representing integer data) in a and b, -// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the -// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, -// otherwise set CF to 0. Return the ZF value. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 -FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) -{ - int64x2_t s64 = - vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); -} - -// Extracts the selected signed or unsigned 8-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) -#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) - -// Inserts the least significant 8 bits of b into the selected 8-bit integer -// of a. 
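// A small usage sketch (an assumption, not from the original header) for the
// horizontal minimum above:
//
//   __m128i v = _mm_setr_epi16(9, 3, 7, 3, 8, 6, 5, 4);  // lanes low->high
//   __m128i r = _mm_minpos_epu16(v);
//   // lane 0 of r == 3 (the minimum), lane 1 == 1 (index of its first occurrence),
//   // remaining bits are zero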
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, -// __constrange(0,16) int imm) -#define _mm_insert_epi8(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s8( \ - vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 16-bit integer from a and zero -// extends. -// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx -// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) -#define _mm_extract_epi16(a, imm) \ - vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) - -// Inserts the least significant 16 bits of b into the selected 16-bit integer -// of a. -// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx -// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, -// __constrange(0,8) int imm) -#define _mm_insert_epi16(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s16( \ - vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 32-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) -#define _mm_extract_epi32(a, imm) \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) - -// Extracts the selected single-precision (32-bit) floating-point from a. -// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) -#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) - -// Inserts the least significant 32 bits of b into the selected 32-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, -// __constrange(0,4) int imm) -#define _mm_insert_epi32(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 64-bit integer from a and zero -// extends. -// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) -#define _mm_extract_epi64(a, imm) \ - vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) - -// Inserts the least significant 64 bits of b into the selected 64-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, -// __constrange(0,2) int imm) -#define _mm_insert_epi64(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s64( \ - vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ - }) - -// Count the number of bits set to 1 in unsigned 32-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 -FORCE_INLINE int _mm_popcnt_u32(unsigned int a) -{ -#if defined(__aarch64__) -#if __has_builtin(__builtin_popcount) - return __builtin_popcount(a); -#else - return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); -#endif -#else - uint32_t count = 0; - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - - input_val = vld1_u8((uint8_t *) &a); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - - vst1_u32(&count, count32x2_val); - return count; -#endif -} - -// Count the number of bits set to 1 in unsigned 64-bit integer a, and -// return that count in dst. 
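// A minimal sketch (an assumption, not from the original header) for the
// population count above:
//
//   _mm_popcnt_u32(0xF0F0u);      // == 8
//   _mm_popcnt_u32(0xFFFFFFFFu);  // == 32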
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 -FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) -{ -#if defined(__aarch64__) -#if __has_builtin(__builtin_popcountll) - return __builtin_popcountll(a); -#else - return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); -#endif -#else - uint64_t count = 0; - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - uint64x1_t count64x1_val; - - input_val = vld1_u8((uint8_t *) &a); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - count64x1_val = vpaddl_u32(count32x2_val); - vst1_u64(&count, count64x1_val); - return count; -#endif -} - -// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision -// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the -// transposed matrix in these vectors (row0 now contains column 0, etc.). -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ - float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ - row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ - vget_low_f32(ROW23.val[0])); \ - row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ - vget_low_f32(ROW23.val[1])); \ - row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ - vget_high_f32(ROW23.val[0])); \ - row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ - vget_high_f32(ROW23.val[1])); \ - } while (0) - -/* Crypto Extensions */ - -#if defined(__ARM_FEATURE_CRYPTO) -// Wraps vmull_p64 -FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) -{ - poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); - poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); - return vreinterpretq_u64_p128(vmull_p64(a, b)); -} -#else // ARMv7 polyfill -// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. -// -// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a -// 64-bit->128-bit polynomial multiply. -// -// It needs some work and is somewhat slow, but it is still faster than all -// known scalar methods. 
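// A small usage sketch (an assumption, not from the original header) for the
// transpose macro above; the four row vectors are transposed in place:
//
//   __m128 r0 = _mm_setr_ps( 0.f,  1.f,  2.f,  3.f);
//   __m128 r1 = _mm_setr_ps( 4.f,  5.f,  6.f,  7.f);
//   __m128 r2 = _mm_setr_ps( 8.f,  9.f, 10.f, 11.f);
//   __m128 r3 = _mm_setr_ps(12.f, 13.f, 14.f, 15.f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   // r0 now holds {0, 4, 8, 12}, r1 {1, 5, 9, 13}, r2 {2, 6, 10, 14}, r3 {3, 7, 11, 15}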
-// -// Algorithm adapted to C from -// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted -// from "Fast Software Polynomial Multiplication on ARM Processors Using the -// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab -// (https://hal.inria.fr/hal-01506572) -static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) -{ - poly8x8_t a = vreinterpret_p8_u64(_a); - poly8x8_t b = vreinterpret_p8_u64(_b); - - // Masks - uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), - vcreate_u8(0x00000000ffffffff)); - uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), - vcreate_u8(0x0000000000000000)); - - // Do the multiplies, rotating with vext to get all combinations - uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 - uint8x16_t e = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 - uint8x16_t f = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 - uint8x16_t g = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 - uint8x16_t h = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 - uint8x16_t i = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 - uint8x16_t j = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 - uint8x16_t k = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 - - // Add cross products - uint8x16_t l = veorq_u8(e, f); // L = E + F - uint8x16_t m = veorq_u8(g, h); // M = G + H - uint8x16_t n = veorq_u8(i, j); // N = I + J - - // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL - // instructions. -#if defined(__aarch64__) - uint8x16_t lm_p0 = vreinterpretq_u8_u64( - vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); - uint8x16_t lm_p1 = vreinterpretq_u8_u64( - vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); - uint8x16_t nk_p0 = vreinterpretq_u8_u64( - vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); - uint8x16_t nk_p1 = vreinterpretq_u8_u64( - vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); -#else - uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); - uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); - uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); - uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); -#endif - // t0 = (L) (P0 + P1) << 8 - // t1 = (M) (P2 + P3) << 16 - uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); - uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); - uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); - - // t2 = (N) (P4 + P5) << 24 - // t3 = (K) (P6 + P7) << 32 - uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); - uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); - uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); - - // De-interleave -#if defined(__aarch64__) - uint8x16_t t0 = vreinterpretq_u8_u64( - vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); - uint8x16_t t1 = vreinterpretq_u8_u64( - vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); - uint8x16_t t2 = vreinterpretq_u8_u64( - vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); - uint8x16_t t3 = vreinterpretq_u8_u64( - vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); -#else - uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); - uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); - uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); - uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); -#endif - // Shift the cross products - uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 - uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 - uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 - uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 - - // Accumulate the products - uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); - uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); - uint8x16_t mix = veorq_u8(d, cross1); - uint8x16_t r = veorq_u8(mix, cross2); - return vreinterpretq_u64_u8(r); -} -#endif // ARMv7 polyfill - -FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) -{ - uint64x2_t a = vreinterpretq_u64_m128i(_a); - uint64x2_t b = vreinterpretq_u64_m128i(_b); - switch (imm & 0x11) { - case 0x00: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); - case 0x01: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); - case 0x10: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); - case 0x11: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); - default: - abort(); - } -} - -#if !defined(__ARM_FEATURE_CRYPTO) -/* clang-format off */ -#define SSE2NEON_AES_DATA(w) \ - { \ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ - w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ - w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ - w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ - w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ - w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ - w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ - w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ - w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ - w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ - w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ - w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ - w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ - w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ - w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ - w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ - w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ - w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ - w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ - w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ - w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ - w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ - w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ - w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ - w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ - w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ - w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ - w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ - w(0x35), w(0x57), w(0xb9), w(0x86), 
w(0xc1), w(0x1d), w(0x9e), \ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ - w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ - w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ - w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ - w(0xb0), w(0x54), w(0xbb), w(0x16) \ - } -/* clang-format on */ - -/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ -#define SSE2NEON_AES_H0(x) (x) -static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); -#undef SSE2NEON_AES_H0 - -// In the absence of crypto extensions, implement aesenc using regular neon -// intrinsics instead. See: -// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ -// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and -// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 -// for more information Reproduced with permission of the author. -FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) -{ -#if defined(__aarch64__) - static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, - 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, - 0xc, 0x1, 0x6, 0xb}; - static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, - 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; - - uint8x16_t v; - uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); - - // shift rows - w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); - - // sub bytes - v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w); - v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); - - // mix columns - w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); - w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); - w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); - - // add round key - return vreinterpretq_m128i_u8(w) ^ RoundKey; - -#else /* ARMv7-A NEON implementation */ -#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ - (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ - (b0)) -#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) -#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) -#define SSE2NEON_AES_U0(p) \ - SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) -#define SSE2NEON_AES_U1(p) \ - SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) -#define SSE2NEON_AES_U2(p) \ - SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) -#define SSE2NEON_AES_U3(p) \ - SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) - static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { - SSE2NEON_AES_DATA(SSE2NEON_AES_U0), - SSE2NEON_AES_DATA(SSE2NEON_AES_U1), - SSE2NEON_AES_DATA(SSE2NEON_AES_U2), - SSE2NEON_AES_DATA(SSE2NEON_AES_U3), - }; -#undef SSE2NEON_AES_B2W -#undef SSE2NEON_AES_F2 -#undef SSE2NEON_AES_F3 -#undef SSE2NEON_AES_U0 -#undef SSE2NEON_AES_U1 -#undef SSE2NEON_AES_U2 -#undef SSE2NEON_AES_U3 - - uint32_t x0 = _mm_cvtsi128_si32(EncBlock); - uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); - uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); - uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); - - __m128i out = _mm_set_epi32( - (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ - aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), - (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ 
- aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), - (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ - aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), - (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ - aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); - - return _mm_xor_si128(out, RoundKey); -#endif -} - -FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) -{ - /* FIXME: optimized for NEON */ - uint8_t v[4][4] = { - [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, - [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, - [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, - [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, - }; - for (int i = 0; i < 16; i++) - vreinterpretq_nth_u8_m128i(a, i) = - v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); - return a; -} - -// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. -// This instruction generates a round key for AES encryption. See -// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ -// for details. -// -// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx -FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) -{ - uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); - uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); - for (int i = 0; i < 4; ++i) { - ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; - ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; - } - return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, - ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); -} -#undef SSE2NEON_AES_DATA - -#else /* __ARM_FEATURE_CRYPTO */ -// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and -// AESMC and then manually applying the real key as an xor operation. This -// unfortunately means an additional xor op; the compiler should be able to -// optimize this away for repeated calls however. See -// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a -// for more details. 
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ - vreinterpretq_u8_m128i(b)); -} - -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 -FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) -{ - return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( - vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), - RoundKey); -} - -FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) -{ - // AESE does ShiftRows and SubBytes on A - uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); - - uint8x16_t dest = { - // Undo ShiftRows step from AESE and extract X1 and X3 - u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) - u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) - u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) - u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) - }; - uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; - return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); -} -#endif - -/* Streaming Extensions */ - -// Guarantees that every preceding store is globally visible before any -// subsequent store. -// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx -FORCE_INLINE void _mm_sfence(void) -{ - __sync_synchronize(); -} - -// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- -// point elements) from a into memory using a non-temporal memory hint. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps -FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) -{ -#if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, (float32x4_t *) p); -#else - vst1q_f32(p, vreinterpretq_f32_m128(a)); -#endif -} - -// Stores the data in a to the address p without polluting the caches. If the -// cache line containing address p is already in the cache, the cache will be -// updated. -// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx -FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) -{ -#if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, p); -#else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); -#endif -} - -// Load 128-bits of integer data from memory into dst using a non-temporal -// memory hint. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 -FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) -{ -#if __has_builtin(__builtin_nontemporal_store) - return __builtin_nontemporal_load(p); -#else - return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); -#endif -} - -// Cache line containing p is flushed and invalidated from all caches in the -// coherency domain. : -// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx -FORCE_INLINE void _mm_clflush(void const *p) -{ - (void) p; - // no corollary for Neon? -} - -/* conflicts with mm_malloc.h -// Allocate aligned blocks of memory. 
-// https://software.intel.com/en-us/ -// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks -FORCE_INLINE void *_mm_malloc(size_t size, size_t align) -{ - void *ptr; - if (align == 1) - return malloc(size); - if (align == 2 || (sizeof(void *) == 8 && align == 4)) - align = sizeof(void *); - if (!posix_memalign(&ptr, align, size)) - return ptr; - return NULL; -} - -FORCE_INLINE void _mm_free(void *addr) -{ - free(addr); -} -*/ - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 8-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc ^= v; - for (int bit = 0; bit < 8; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 16-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u8(crc, v & 0xff); - crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 32-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u16(crc, v & 0xffff); - crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 64-bit integer v. 
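// A minimal usage sketch (an assumption, not from the original header) for the
// CRC32-C helpers above, using the common convention of seeding with ~0 and
// inverting the final value; data and len are a hypothetical byte buffer:
//
//   uint32_t crc = 0xFFFFFFFFu;
//   for (size_t i = 0; i < len; ++i)
//       crc = _mm_crc32_u8(crc, data[i]);
//   crc = ~crc;  // final CRC32-C of data[0..len)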
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) -FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); - crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); -#endif - return crc; -} - -#if defined(__GNUC__) || defined(__clang__) -#pragma pop_macro("ALIGN_STRUCT") -#pragma pop_macro("FORCE_INLINE") -#endif - -#if defined(__GNUC__) -//#pragma GCC pop_options -#endif - -#endif diff --git a/libkram/kram/win_mmap.h b/libkram/kram/win_mmap.h index 68511b28..b21c1c9f 100644 --- a/libkram/kram/win_mmap.h +++ b/libkram/kram/win_mmap.h @@ -18,52 +18,60 @@ */ #include -#include #include -#define PROT_READ 0x1 -#define PROT_WRITE 0x2 -#define PROT_EXEC 0x4 +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif +#include + +#define PROT_READ 0x1 +#define PROT_WRITE 0x2 +#define PROT_EXEC 0x4 + +#define MAP_SHARED 0x01 +#define MAP_PRIVATE 0x02 +#define MAP_ANON 0x20 +#define MAP_FAILED ((void *)-1) -#define MAP_SHARED 0x01 -#define MAP_PRIVATE 0x02 -#define MAP_ANON 0x20 -#define MAP_FAILED ((void *) -1) +// off_t is 32-bit, which isn't great +using myoff_t = int64_t; // TODO: find out which path this takes, want 64-bit mmsp -#ifdef __USE_FILE_OFFSET64 -# define DWORD_HI(x) ((x) >> (uint64_t)32) -# define DWORD_LO(x) ((x) & (uint64_t)0xffffffff) -#else -# define DWORD_HI(x) (0) -# define DWORD_LO(x) (x) -#endif +//#ifdef __USE_FILE_OFFSET64 +#define DWORD_HI(x) ((x) >> (myoff_t)32) +#define DWORD_LO(x) ((x) & (myoff_t)0xffffffff) +//#else +//#define DWORD_HI(x) (0) +//#define DWORD_LO(x) (x) +//#endif -static void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset) +static void *mmap(void *start, size_t length, int prot, int flags, int fd, myoff_t offset) { if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return MAP_FAILED; if (fd == -1) { if (!(flags & MAP_ANON) || offset) return MAP_FAILED; - } else if (flags & MAP_ANON) + } + else if (flags & MAP_ANON) return MAP_FAILED; - DWORD flProtect; + DWORD flProtect = PAGE_READONLY; if (prot & PROT_WRITE) { if (prot & PROT_EXEC) flProtect = PAGE_EXECUTE_READWRITE; else flProtect = PAGE_READWRITE; - } else if (prot & PROT_EXEC) { + } + else if (prot & PROT_EXEC) { if (prot & PROT_READ) flProtect = PAGE_EXECUTE_READ; else if (prot & PROT_EXEC) flProtect = PAGE_EXECUTE; - } else - flProtect = PAGE_READONLY; + } - off_t end = length + offset; + myoff_t end = length + offset; HANDLE mmap_fd, h; if (fd == -1) mmap_fd = INVALID_HANDLE_VALUE; @@ -83,23 +91,20 @@ static void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t if (flags & MAP_PRIVATE) dwDesiredAccess |= FILE_MAP_COPY; void *ret = MapViewOfFile(h, dwDesiredAccess, DWORD_HI(offset), DWORD_LO(offset), length); + + // can free the file mapping, mmap will hold it + CloseHandle(h); + if (ret == NULL) { - CloseHandle(h); ret = MAP_FAILED; } - - // TODO: can CreateFileMapping handle be closed here? View will keep file open. - // even if the file handle (fd) is closed. That would prevent handle leak? - + return ret; } static void munmap(void *addr, size_t length) { UnmapViewOfFile(addr); - - // Is this a TODO? - /* ruh-ro, we leaked handle from CreateFileMapping() ... 
*/ } #undef DWORD_HI diff --git a/libkram/lodepng/lodepng.cpp b/libkram/lodepng/lodepng.cpp index 23bb89c9..1e08be4c 100644 --- a/libkram/lodepng/lodepng.cpp +++ b/libkram/lodepng/lodepng.cpp @@ -6301,7 +6301,7 @@ const char* lodepng_error_text(unsigned code) { #ifdef LODEPNG_COMPILE_CPP namespace lodepng { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; #ifdef LODEPNG_COMPILE_DISK unsigned load_file(vector& buffer, const string& filename) { diff --git a/libkram/lodepng/lodepng.h b/libkram/lodepng/lodepng.h index 524bca4f..782150eb 100644 --- a/libkram/lodepng/lodepng.h +++ b/libkram/lodepng/lodepng.h @@ -28,7 +28,7 @@ freely, subject to the following restrictions: #include /*for size_t*/ -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; extern const char* LODEPNG_VERSION_STRING; @@ -37,8 +37,9 @@ extern const char* LODEPNG_VERSION_STRING; // don't need io #define LODEPNG_NO_COMPILE_DISK -// using miniz now, but this was failing using miniz so switched off -//#define LODEPNG_NO_COMPILE_ZLIB +// using miniz or libCompression now, but this was failing using miniz so switched off +// This disables the code that inits the array for zlib_decompress. +// #define LODEPNG_NO_COMPILE_ZLIB // was not doing png encodes, but to strip blocks now need to #define LODEPNG_COMPILE_ENCODER diff --git a/libkram/miniz/miniz.cpp b/libkram/miniz/miniz.cpp index a62263fc..7ecaf980 100644 --- a/libkram/miniz/miniz.cpp +++ b/libkram/miniz/miniz.cpp @@ -348,7 +348,7 @@ mz_ulong mz_compressBound(mz_ulong source_len) return mz_deflateBound(NULL, source_len); } -typedef struct +typedef struct inflate_state { tinfl_decompressor m_decomp; mz_uint m_dict_ofs, m_dict_avail, m_first_call, m_has_flushed; @@ -3201,7 +3201,7 @@ typedef struct mz_uint m_element_size; } mz_zip_array; -struct mz_zip_internal_state_tag +typedef struct mz_zip_internal_state { mz_zip_array m_central_dir; mz_zip_array m_central_dir_offsets; @@ -3223,7 +3223,7 @@ struct mz_zip_internal_state_tag void *m_pMem; size_t m_mem_size; size_t m_mem_capacity; -}; +} mz_zip_internal_state; #define MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(array_ptr, element_size) (array_ptr)->m_element_size = element_size diff --git a/libkram/miniz/miniz.h b/libkram/miniz/miniz.h index 8867c3c6..295a8b6a 100644 --- a/libkram/miniz/miniz.h +++ b/libkram/miniz/miniz.h @@ -112,7 +112,12 @@ */ #pragma once +// Alec add this for now (move to define on projects?) #if 1 + +// Make sure large file calls are used. Should be set across app. +#define _LARGEFILE64_SOURCE 1 + // Alec add this for now (move to define on projects?) // skip crc read checks to speed up reads @@ -124,6 +129,9 @@ // handling file io separately #define MINIZ_NO_STDIO +// These defines annoying conflict with everything (f.e. compress) +#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES + #endif @@ -291,7 +299,7 @@ enum struct mz_internal_state; /* Compression/decompression stream struct. */ -typedef struct mz_stream_s +typedef struct mz_stream { const unsigned char *next_in; /* pointer to next byte to read */ unsigned int avail_in; /* number of bytes available at next_in */ @@ -521,7 +529,7 @@ typedef int mz_bool; #endif /* #ifdef MINIZ_NO_STDIO */ #ifdef MINIZ_NO_TIME -typedef struct mz_dummy_time_t_tag +typedef struct mz_dummy_time_t { int m_dummy; } mz_dummy_time_t; @@ -707,7 +715,7 @@ typedef enum { } tdefl_flush; /* tdefl's compression state structure. 
*/ -typedef struct +typedef struct tdefl_compressor { tdefl_put_buf_func_ptr m_pPut_buf_func; void *m_pPut_buf_user; @@ -808,8 +816,8 @@ size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const voi typedef int (*tinfl_put_buf_func_ptr)(const void *pBuf, int len, void *pUser); int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags); -struct tinfl_decompressor_tag; -typedef struct tinfl_decompressor_tag tinfl_decompressor; +//struct tinfl_decompressor_tag; +typedef struct tinfl_decompressor tinfl_decompressor; #ifndef MINIZ_NO_MALLOC /* Allocate the tinfl_decompressor structure in C so that */ @@ -823,7 +831,7 @@ void tinfl_decompressor_free(tinfl_decompressor *pDecomp); #define TINFL_LZ_DICT_SIZE 32768 /* Return status. */ -typedef enum { +typedef enum tinfl_status { /* This flags indicates the inflator needs 1 or more input bytes to make forward progress, but the caller is indicating that no more are available. The compressed data */ /* is probably corrupted. If you call the inflator again with more bytes it'll try to continue processing the input but this is a BAD sign (either the data is corrupted or you called it incorrectly). */ /* If you call it again with no input you'll just get TINFL_STATUS_FAILED_CANNOT_MAKE_PROGRESS again. */ @@ -880,7 +888,7 @@ enum TINFL_FAST_LOOKUP_SIZE = 1 << TINFL_FAST_LOOKUP_BITS }; -typedef struct +typedef struct tinfl_huff_table { mz_uint8 m_code_size[TINFL_MAX_HUFF_SYMBOLS_0]; mz_int16 m_look_up[TINFL_FAST_LOOKUP_SIZE], m_tree[TINFL_MAX_HUFF_SYMBOLS_0 * 2]; @@ -900,14 +908,14 @@ typedef mz_uint32 tinfl_bit_buf_t; #define TINFL_BITBUF_SIZE (32) #endif -struct tinfl_decompressor_tag +typedef struct tinfl_decompressor { mz_uint32 m_state, m_num_bits, m_zhdr0, m_zhdr1, m_z_adler32, m_final, m_type, m_check_adler32, m_dist, m_counter, m_num_extra, m_table_sizes[TINFL_MAX_HUFF_TABLES]; tinfl_bit_buf_t m_bit_buf; size_t m_dist_from_out_buf_start; tinfl_huff_table m_tables[TINFL_MAX_HUFF_TABLES]; mz_uint8 m_raw_header[4], m_len_codes[TINFL_MAX_HUFF_SYMBOLS_0 + TINFL_MAX_HUFF_SYMBOLS_1 + 137]; -}; +} tinfl_decompressor; #ifdef __cplusplus } @@ -932,7 +940,7 @@ enum MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE = 512 }; -typedef struct +typedef struct mz_zip_archive_file_stat { /* Central directory file index. 
*/ mz_uint32 m_file_index; @@ -992,8 +1000,8 @@ typedef size_t (*mz_file_read_func)(void *pOpaque, mz_uint64 file_ofs, void *pBu typedef size_t (*mz_file_write_func)(void *pOpaque, mz_uint64 file_ofs, const void *pBuf, size_t n); typedef mz_bool (*mz_file_needs_keepalive)(void *pOpaque); -struct mz_zip_internal_state_tag; -typedef struct mz_zip_internal_state_tag mz_zip_internal_state; +//struct mz_zip_internal_state_tag; +typedef struct mz_zip_internal_state mz_zip_internal_state; typedef enum { MZ_ZIP_MODE_INVALID = 0, @@ -1088,7 +1096,7 @@ typedef struct mz_zip_archive } mz_zip_archive; -typedef struct +typedef struct mz_zip_reader_extract_iter_state { mz_zip_archive *pZip; mz_uint flags; @@ -1109,7 +1117,7 @@ typedef struct } mz_zip_reader_extract_iter_state; // this is purely for looking at struct in debugger -typedef struct { +typedef struct mz_local_file_header { mz_uint32 local_file_signature; // 0x04034b50 read as LE number mz_uint16 version; mz_uint16 bit_flags; diff --git a/libkram/simdjson/simdjson.h b/libkram/simdjson/simdjson.h index 4ad510d3..9588ff55 100644 --- a/libkram/simdjson/simdjson.h +++ b/libkram/simdjson/simdjson.h @@ -27249,7 +27249,7 @@ inline void log_headers() noexcept { printf("|%.*s", 5+2, DASHES); printf("|--------"); printf("|\n"); - fflush(stdout); + // fflush(stdout); } } @@ -27287,7 +27287,7 @@ inline void log_line(const json_iterator &iter, token_position index, depth_t de printf("| %5i ", depth); printf("| %.*s ", int(detail.size()), detail.data()); printf("|\n"); - fflush(stdout); + // fflush(stdout); } } diff --git a/libkram/squish/maths.cpp b/libkram/squish/maths.cpp index 79c08c5c..2c34fd81 100644 --- a/libkram/squish/maths.cpp +++ b/libkram/squish/maths.cpp @@ -34,7 +34,7 @@ //#include namespace squish { -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights ) { diff --git a/libkram/squish/maths.h b/libkram/squish/maths.h index 43f37c4a..2b6be1fb 100644 --- a/libkram/squish/maths.h +++ b/libkram/squish/maths.h @@ -357,10 +357,10 @@ class Vec4 */ -using namespace simd; +using namespace SIMD_NAMESPACE; using Vec4 = float4; // default ctor for float4(1) sets 1,0,0,0 in simd, but impls like Vec4 expect float4(repeating: x) -#define VEC4_CONST(x) Vec4(makeVec4(x,x,x,x)) +#define VEC4_CONST(x) float4m(x) #define makeVec4(x,y,z,w) float4m(x,y,z,w) inline bool CompareAnyLessThan(Vec4 x, Vec4 y) { return any(x < y); } diff --git a/libkram/transcoder/basisu_transcoder.cpp b/libkram/transcoder/basisu_transcoder.cpp index 6579060f..6398affe 100644 --- a/libkram/transcoder/basisu_transcoder.cpp +++ b/libkram/transcoder/basisu_transcoder.cpp @@ -185,7 +185,7 @@ namespace basisu namespace basist { - using namespace NAMESPACE_STL; + using namespace STL_NAMESPACE; #if BASISD_ENABLE_DEBUG_FLAGS static uint32_t g_debug_flags = 0; diff --git a/libkram/transcoder/basisu_transcoder.h b/libkram/transcoder/basisu_transcoder.h index 9cf29a63..7c63b340 100644 --- a/libkram/transcoder/basisu_transcoder.h +++ b/libkram/transcoder/basisu_transcoder.h @@ -42,7 +42,7 @@ namespace basist { - using namespace NAMESPACE_STL; + using namespace STL_NAMESPACE; // High-level composite texture formats supported by the transcoder. // Each of these texture formats directly correspond to OpenGL/D3D/Vulkan etc. texture formats. 
diff --git a/libkram/vectormath/README.md b/libkram/vectormath/README.md
new file mode 100644
index 00000000..2f98f71a
--- /dev/null
+++ b/libkram/vectormath/README.md
@@ -0,0 +1,106 @@
+vectormath
+==========
+
+A small vector math library for float and double vectors, matrices, and quaternions. There are also types for int/long used for the float/double comparisons. Each type can be disabled (SIMD_FLOAT, SIMD_DOUBLE) to limit usage. This should be built as an optimized library to keep debug builds running fast. Most of the calls are inlined, so selectively optimizing the included code in a debug build will also help.
+
+Small implementation kernel (just using float2/4 and double2/4 simd ops). I would not recommend using it with C. All types have base C types for ObjC, and these provide default math and comparison ops.
+
+You can also bury the impls with a little work, and avoid the simd headers getting pulled into code, but the whole point is to inline the calls for speed and stay in register. So can drop to SSE4.2, but give up F16C. And AVX2 provides fma to line up with arm64. So going between arm64 and AVX2 seems like a good parallel if your systems support it.
+
+Based around the gcc/clang vector extensions. These provide a lot of optimized base ops. The vecs extend to 2, 4, 8, 16, 32 operands. On larger types, these use multiple 4 operand instructions to do so. I've limited vector counts to 32B for AVX2 for now. These are C types for the vectors, so no ctors or member functions. You can further wrap these under your own vector math code, but you then have a lot of forwarding and conversion. I recommend using the make ctors for the vectors. But the curly brace init does not do what it looks like it should:
+
+```
+float4 v = {1.0f}; v = 1,xxx
+float4 v = float4m(1.0f); v = 1,1,1,1
+float4 v = 1.0f; v = 1,1,1,1
+```
+
+Matrices are 2x2, 3x3, 3x4, and 4x4, column only. Matrices have a C++ type with operators and calls. Chop out float, double, and half with defines, but keep int for the conditional tests. Easy to add more types with the macros - u/char, u/long, u/short.
+
+I gutted the armv7 stuff from sse2neon.h so that's readable. But this is only needed for an _mm_shuffle_ps. Updated sse_mathfun for the cos/sin/log ops, but it's currently only reference fp32 SSE. I added the fp16 <-> fp32 calls, since that's all Android has.
+
+Apple Accelerate has similar calls and structs. The lib holds the optimized calls for sin, cos, log, inverse, but you only get them if you're on a new enough iOS/macOS. And that api is so much code that for some things it's not using the best methods.
+
+---
+
+* Simd: arm64 Neon, x64 AVX2/AVX/SSE4.1
+* Compiler: Clang mainly
+* Language: C types/ops, C++ matrix type and ops, can pass to ObjC
+* Features: Clang/GCC vector extensions (no MSVC)
+* C++ usage: C++11 but compiled as C++20
+* Platforms: macOS/iOS, Win, Linux, others
+
+Small vector simd kernel based around 2 and 4 element int, float, double ops.
+
+Half (fp16) conversions in case _Float16 not supported (f.e. Android)
+
+Clang vector extensions provide:
+* swizzles (f.e. .xzy)
+* array ops (f.e. v[0], v[1])
+* rgba and xyzw access
+* built-in conversion ops
+* .even/.odd are even/odd elements of vector (reduced)
+* .hi/.lo vector chunks (f.e. float8 provides 2x float4 vecs)
+* math ops that vectorize to simd ops (+=, *=, +, -, ...)
+* comparison ops that generate an int2/3/4/8 op
+* can only use extension on C typedef
+* C++ vector is typedef to C typedef, no member functions
+* Can cast to system simd libs, no conversions needed
+* Converts to smaller vectors via swizzles. v.xyzw -> v.x, v.xy, v.xyz
+* Splats constants to stay in register v.x -> v.xxxx
+* Auto converts to __m128 and float32x4_t
+* Neon has to emulate 32B with 2 registers (f.e. double4)
+
+---
+
+Types
+
+* all types come in three flavors
+* float4a - aligned type
+* float4p - packed type
+* float4 - C++ type omits the "a"
+*
+* int2/3/4/8
+* long2/3/4
+*
+* half2/3/4/8/16
+* float2/3/4/8
+* float2x2/3x3/3x4/4x4
+* double2/3/4
+* double2x2/3x3/3x4/4x4
+*
+* - u/char2...32
+* - u/short2...16
+* - ulong2...16
+
+---
+
+max vec size per register
+* 16B 32B
+* char16 char32
+* short8 short16
+* uint4 uint8
+* float4 float8
+* double2 double4
+
+---
+
+* DONE: Add double2 ops and double2x2...
+* DONE: Add quatf, and conversions
+* DONE: Add affine inverses
+* DONE: Split file into float and double sub-files
+* DONE: Add float2/double2 Neon vec ops.
+* DONE: Add double2 SSE ops
+* DONE: Add double4 AVX2 ops
+* DONE: Formatting in print support
+* DONE: Move to release Xcode library project and CMakeLists.txt file.
+
+* TODO: Tests of the calls.
+* TODO: Add row vector support (vs. columns)
+* TODO: Consider adding ISPC optimized calls for log, exp, sin, cos, etc
+* DONE: Add debugger natvis and lldb formatters
+* SOME: Disassembly of the calls (MSVC?)
+
+---
+
+
diff --git a/libkram/vectormath/bounds234.cpp b/libkram/vectormath/bounds234.cpp
new file mode 100644
index 00000000..29253482
--- /dev/null
+++ b/libkram/vectormath/bounds234.cpp
@@ -0,0 +1,347 @@
+#include "vectormath234.h"
+
+#if SIMD_FLOAT && SIMD_INT
+
+namespace SIMD_NAMESPACE {
+
+culler::culler() : _planesCount(0)
+{
+}
+
+void culler::update(const float4x4& projView)
+{
+    // build a worldspace camera volume
+    // https://fgiesen.wordpress.com/2010/10/17/view-frustum-culling/
+    // but don't test farZ plane if infFarZ
+
+    float4x4 m = transpose(projView);
+    const float4& x = m[0];
+    const float4& y = m[1];
+    const float4& z = m[2];
+    const float4& w = m[3];
+
+    // x < w    0 < w - x
+    // x > -w   0 < w + x
+
+    _planes[0] = normalize(w + x);
+    _planes[1] = normalize(w - x);
+    _planes[2] = normalize(w + y);
+    _planes[3] = normalize(w - y);
+
+    // This uses 0 to 1
+
+    // revZ
+    _planes[4] = normalize(w - z);
+
+    bool isInfFarPlane = projView[2][2] == 0;
+    if (isInfFarPlane)
+        _planes[5] = 0;
+    else
+        _planes[5] = normalize(z);
+
+    // any way to always use 6 for unrolling?
+    // f.e. above use 0,0,-1,FLT_MAX, instead of 0
+    _planesCount = isInfFarPlane ? 5 : 6;
+
+    // select min or max based on normal direction
+    for (int i = 0; i < _planesCount; ++i) {
+        _selectionMasks[i] = _planes[i] < 0;
+    }
+
+    // Nathan Reed - If you represent the frustum corners in homogeneous coordinates,
+    // with w=0 for points at infinity, this just falls out of the usual
+    // point vs plane test, where you dot the homogeneous point against the plane equation.
+
+    // generate 8 corners of camera volume from the inverse
+    float4x4 projViewInv = inverse(projView); // TODO: can pass down
+    float nearClip = 1;
+
+    // inset so division can occur
+    float farClip = isInfFarPlane ?
1e-6f : 0; + + static float4 clipCorners[8] = { + {-1, -1, nearClip, 1}, + {-1, 1, nearClip, 1}, + {1, -1, nearClip, 1}, + {1, 1, nearClip, 1}, + + {-1, -1, farClip, 1}, + {-1, 1, farClip, 1}, + {1, -1, farClip, 1}, + {1, 1, farClip, 1}, + }; + + // These are homogenous coords, so w may be 0 + for (int i = 0; i < 8; ++i) { + float4 cornerHomog = projViewInv * clipCorners[i]; + _corners[i] = cornerHomog / cornerHomog.w; + _corners[i].w = 1; + } +} + +bool culler::cullBox(float3 min, float3 max) const +{ + // Note: make sure box min <= max, or this call will fail + + // TODO: convert this from dot to a mul of 4, then finish plane 5,6 + // could precompute/store the selection masks. + // Also may need to do 4 boxes at a time. + + // TODO: also if frustum is finite farZ, then may need to test for + // frustum in box. This is a rather expensive test though + // of the 8 frustum corners. + + float4 min1 = float4m(min, 1); + float4 max1 = float4m(max, 1); + + // test the min/max against the x planes + int count = 0; + + for (int i = 0; i < _planesCount; ++i) { + count += dot(_planes[i], select(min1, max1, _selectionMasks[i])) > 0; + } + + return count == _planesCount; +} + +bool culler::cullSphere(float4 sphere) const +{ + // TODO: convert this from dot to a mul of 4, then finish plane 5,6 + // keep everything in simd reg. + // Also may need to do 4 spheres at a time. + + float4 sphere1 = float4m(sphere.xyz, 1); + float radius = sphere.w; + + int count = 0; + for (int i = 0; i < _planesCount; ++i) { + count += dot(_planes[i], sphere1) > radius; + } + + return count == _planesCount; +} + +void culler::cullBoxes(const float3* boxes, int count, uint8_t shift, uint8_t* results) const +{ + // box array is 2x count + uint8_t bit = (1 << shift); + uint8_t skipBit = (1 << 7); + + for (int i = 0; i < count; ++i) { + uint8_t& res8 = results[i]; + if ((res8 & skipBit) != 0) + continue; + + float3 min = boxes[2 * i]; + float3 max = boxes[2 * i + 1]; + + if (cullBox(min, max)) + res8 |= bit; + } +} + +void culler::cullSpheres(const float4* sphere, int count, uint8_t shift, uint8_t* results) const +{ + uint8_t bit = (1 << shift); + uint8_t skipBit = (1 << 7); + + for (int i = 0; i < count; ++i) { + uint8_t& res8 = results[i]; + if ((res8 & skipBit) != 0) + continue; + + if (cullSphere(sphere[i])) + res8 |= bit; + } +} + +bool culler::isCameraInBox(bbox box) const +{ + // See if all 8 verts of the frustum are in the box. + // This becomes a false negative for non-inf far (skips box while inside) + const float3* corners = cameraCorners(); + + int3 count = 0; + for (int i = 0; i < 8; ++i) { + float3 c = corners[i]; + count += c >= box.min & + c <= box.max; + } + + // high-bit set is -1 + return all(count == (int3)-8); +} + +bool culler::isCameraOutsideBox(bbox box) const +{ + // See if all 8 verts of the camera are outside box. + // This becomes a false positive (draws box even though outside) + const float3* corners = cameraCorners(); + + int3 countMin = 0; + int3 countMax = 0; + for (int i = 0; i < 8; ++i) { + float3 c = corners[i]; + countMin += c < box.min; + countMax += c > box.max; + } + + // high-bit set is -1 + return any(countMin == (int3)-8 | countMax == (int3)-8); +} + +bsphere culler::transformSphereTRS(bsphere sphere, const float4x4& modelTfm) +{ + // May be better to convert to box with non-uniform scale + // sphere gets huge otherwise. Cache these too. 
+ +#if 1 + // not sure which code is smaller, still have to add t + float size = decompose_scale_max(modelTfm); + float radius = sphere.radius() * size; + float4 sphereCenter = float4m(sphere.center(), 1); + sphereCenter = modelTfm * sphereCenter; + + sphere = bsphere(sphereCenter.xyz, radius); + return sphere; +#else + // really just a 3x3 and translation + const float3x3& m = as_float3x3(modelTfm); + float3 t = m[3]; + + float size = decompose_scale_max(modelTfm); + float radius = sphere.radius() * size; + float3 sphereCenter = m * sphere.center(); + sphereCenter += t; + + sphere = bsphere(sphereCenter, radius); + return sphere; +#endif +} + +// Note: if doing infFar, may want float4 in homogenous space w = 0 +// then the points are accurate. + +void culler::boxCorners(bbox box, float3 pt[8]) const +{ + // TODO: fix these so order is 000 to 111 in bits + + float3 min1 = box.min; + float3 max1 = box.max; + + pt[0] = min1; + pt[1] = max1; + + pt[2] = float3m(min1.xy, max1.z); + pt[3] = float3m(max1.xy, min1.z); + + pt[4] = min1; + pt[4].y = max1.y; + pt[5] = max1; + pt[5].x = min1.x; + + pt[6] = max1; + pt[6].y = min1.y; + pt[7] = min1; + pt[7].x = max1.x; +} + +void culler::boxCorners(bbox box, float4 pt[8]) const +{ + float4 min1 = float4m(box.min, 1); + float4 max1 = float4m(box.max, 1); + + pt[0] = min1; + pt[1] = max1; + + pt[2] = float4m(min1.xy, max1.zw); + pt[3] = float4m(max1.xy, min1.zw); + + pt[4] = min1; + pt[4].y = max1.y; // float4m(min1.x, max1.y, min1.zw), + pt[5] = max1; + pt[5].x = min1.x; // float4m(min1.x, max1.yzw), + + pt[6] = max1; + pt[6].y = min1.y; // float4m(max1.x, min1.y, max1.zw), + pt[7] = min1; + pt[7].x = max1.x; // float4m(max1.x, min1.yzw), +} + +bbox culler::transformBoxTRS(bbox box, const float4x4& modelTfm) +{ + // Woth doing on cpu and caching. So can still process an array + // but should transform only ones thatt didn't change transform or bound. + +#if 0 + // This is for a full general 4x4, but want a simpler affine version + // convert the box to 8 pts first + float4 pt[8]; + boxCorners(box, pt) + + box.setInvalid(); + for (int i = 0; i < 8; ++i) { + float3 v = (modelTfm * pt[i]).xyz; + box.unionWith(v); + } + +#elif 0 + // really just a 3x3 and translation + const float3x3& m = as_float3x3(modelTfm); + float3 t = m[3]; + + // convert the box to 8 pts first + float3 pt[8]; + boxCorners(box, ptr); + + box.setInvalid(); + for (int i = 0; i < 8; ++i) { + float3 v = m * pt[i]; + box.unionWith(v); + } + box.offset(t); + +#else + // This is way less setup on the points. + + const float3x3& m = as_float3x3(modelTfm); + float3 t = m[3]; + + // what about this + // box.min = m * box.min; + // box.max = m * box.max; + // swap back extrema that flipped due to rot/invert + // box.fix(); + // box.offset(t); + + // Inspiration for code below. 
+ // https://github.com/erich666/GraphicsGems/blob/master/gems/TransBox.c + float3 min1 = box.min; + float3 max1 = box.max; + + box.min = t; + box.max = t; + + float3 a, b; + for (int i = 0; i < 3; ++i) { + // these are muls, not dots + a = m[i] * min1; + b = m[i] * max1; + + int3 test = a < b; + + box.min += select(0, a, test); + box.max += select(0, a, !test); + + box.max += select(0, b, test); + box.min += select(0, b, !test); + } + +#endif + + return box; +} + +} //namespace SIMD_NAMESPACE + +#endif diff --git a/libkram/vectormath/bounds234.h b/libkram/vectormath/bounds234.h new file mode 100644 index 00000000..cec15486 --- /dev/null +++ b/libkram/vectormath/bounds234.h @@ -0,0 +1,166 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +#if USE_SIMDLIB && SIMD_FLOAT + +namespace SIMD_NAMESPACE { + +// TODO: may want a rect, circle, capsule as well. + +struct bbox { + bbox() {} // nothing + bbox(float3 minv, float3 maxv) : min(minv), max(maxv) {} + + // TODO: add a unit radius and unit diameter box + + // can use this to accumulate points into a box + void setInvalid() + { + min = (float3)FLT_MAX; + max = -(float3)FLT_MAX; + } + bool isInvalid() const { return any(min > max); } + + void unionWith(float3 v) + { + min = SIMD_NAMESPACE::min(min, v); + max = SIMD_NAMESPACE::max(max, v); + } + void unionWith(bbox b) + { + min = SIMD_NAMESPACE::min(min, b.min); + max = SIMD_NAMESPACE::max(max, b.max); + } + + // TODO: call to intersect or combine bbox + + float3 center() const { return 0.5f * (min + max); } + float3 dimensions() const { return max - min; } + + float width() const { return dimensions().x; } + float height() const { return dimensions().y; } + float depth() const { return dimensions().z; } + + float diameter() const { return length(dimensions()); } + float radius() const { return 0.5f * diameter(); } + float radiusSquared() const { return length_squared(dimensions() * 0.5f); } + + void scale(float3 s) + { + min *= s; + max *= s; + } + void offset(float3 o) + { + min += o; + max += o; + } + + // after transforms (f.e. rotate, min/max can swap) + void fix() + { + // don't call this on invalid box, or it will be huge + float3 tmp = SIMD_NAMESPACE::max(min, max); + min = SIMD_NAMESPACE::min(min, max); + max = tmp; + } + +public: + float3 min; + float3 max; +}; + +// center + radius +struct bsphere { + bsphere() {} // nothing + bsphere(float3 center, float radius) : centerRadius(float4m(center, radius)) {} + + // TODO: add a unit radius and unit diameter + + float3 center() const { return centerRadius.xyz; } + float radius() const { return centerRadius.w; } + float radiusSquared() const { return centerRadius.w * centerRadius.w; } + +public: + float4 centerRadius; +}; + +// Fast cpu culler per frustum. Easy port to gpu which can do occlusion. +// This only tests 5 or 6 planes. +struct culler { + culler(); + + void update(const float4x4& projView); + + // can use the helper types instead + bool cullSphere(float4 sphere) const; + bool cullBox(float3 min, float3 max) const; + + bool cullBox(const bbox& box) const + { + return cullBox(box.min, box.max); + } + bool cullSphere(const bsphere& sphere) const + { + return cullSphere(sphere.centerRadius); + } + + // Caller must zero the results array, and visible state sets only low bit + // These store a bit (with shift) in the results array. + // If high bit is set in results, then test is skipped. 
+ void cullBoxes(const float3* boxes, int count, uint8_t shift, uint8_t* results) const; + void cullSpheres(const float4* sphere, int count, uint8_t shift, uint8_t* results) const; + + void cullBoxes(const bbox* boxes, int count, uint8_t shift, uint8_t* results) const + { + cullBoxes((const float3*)boxes, count, shift, results); + } + void cullSpheres(const bsphere* spheres, int count, uint8_t shift, uint8_t* results) const + { + cullSpheres((const float4*)spheres, count, shift, results); + } + + // move these out? + static bsphere transformSphereTRS(bsphere sphere, const float4x4& modelTfm); + static bbox transformBoxTRS(bbox box, const float4x4& modelTfm); + + // bbox corners + void boxCorners(bbox box, float3 pt[8]) const; + void boxCorners(bbox box, float4 pt[8]) const; + + bool isCameraInBox(bbox box) const; + bool isCameraOutsideBox(bbox box) const; + + // Camera corners in world space + const float3* cameraCorners() const + { + return as_float3(_corners); + } + int cameraCornersCount() const { return 8; } + + // Camera clip planes in world space + const float4* cameraPlanes() const { return _planes; } + int cameraPlanesCount() const { return _planesCount; } + +private: + // camera planes in world space + float4 _planes[6]; + + // This won't work if SIMD_INT is not defined. +#if SIMD_INT + // cached tests of which planes are positive/negative + int4 _selectionMasks[6]; +#endif + + uint32_t _planesCount; + + // 8 corners of camera volume + float4 _corners[8]; +}; + +} // namespace SIMD_NAMESPACE + +#endif // USE_SIMDLIB && SIMD_FLOAT diff --git a/libkram/vectormath/double234.cpp b/libkram/vectormath/double234.cpp new file mode 100644 index 00000000..0efa841e --- /dev/null +++ b/libkram/vectormath/double234.cpp @@ -0,0 +1,683 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. +#include "vectormath234.h" + +// This has to include this, not double234.h +#if USE_SIMDLIB && SIMD_DOUBLE + +#if SIMD_ACCELERATE_MATH +// TODO: reduce this header to just calls use (f.e. 
geometry, etc) +#include +#endif // SIMD_ACCELERATE_MATH + +namespace SIMD_NAMESPACE { + +// clang-format off + +#if SIMD_ACCELERATE_MATH +// These will get inlined here from the template +macroVectorRepeatFnImpl(double, log) +macroVectorRepeatFnImpl(double, exp) + +macroVectorRepeatFnImpl(double, sin) +macroVectorRepeatFnImpl(double, cos) +macroVectorRepeatFnImpl(double, tan) + +macroVectorRepeatFnImpl(double, asin) +macroVectorRepeatFnImpl(double, acos) +macroVectorRepeatFnImpl(double, atan) + +macroVectorRepeatFn2Impl(double, atan2) + +#endif // SIMD_ACCELERATE_MATH + +#if SIMD_CMATH_MATH +macroVectorRepeatFnImpl(double, log, ::log) +macroVectorRepeatFnImpl(double, exp, ::exp) + +macroVectorRepeatFnImpl(double, sin, ::sin) +macroVectorRepeatFnImpl(double, cos, ::cos) +macroVectorRepeatFnImpl(double, tan, ::tan) + +macroVectorRepeatFnImpl(double, asin, ::asin) +macroVectorRepeatFnImpl(double, acos, ::acos) +macroVectorRepeatFnImpl(double, atan, ::atan) + +#endif // SIMD_CMATH_MATH + + // clang-format on + + //--------------------------- + + static const double2 kdouble2_posx = {1.0f, 0.0f}; +static const double2 kdouble2_posy = kdouble2_posx.yx; + +static const double2 kdouble2_negx = {-1.0f, 0.0f}; +static const double2 kdouble2_negy = kdouble2_negx.yx; + +static const double2 kdouble2_ones = kdouble2_posx.xx; +static const double2 kdouble2_zero = {}; + +//---- + +static const double3 kdouble3_posx = {1.0f, 0.0f, 0.0f}; +static const double3 kdouble3_posy = kdouble3_posx.yxy; +static const double3 kdouble3_posz = kdouble3_posx.yyx; + +static const double3 kdouble3_negx = {-1.0f, 0.0f, 0.0f}; +static const double3 kdouble3_negy = kdouble3_negx.yxy; +static const double3 kdouble3_negz = kdouble3_negx.yyx; + +static const double3 kdouble3_ones = kdouble3_posx.xxx; +static const double3 kdouble3_zero = {}; + +//---- + +static const double4 kdouble4_posx = {1.0f, 0.0f, 0.0f, 0.0f}; +static const double4 kdouble4_posy = kdouble4_posx.yxyy; +static const double4 kdouble4_posz = kdouble4_posx.yyxy; +static const double4 kdouble4_posw = kdouble4_posx.yyyx; + +static const double4 kdouble4_negxw = {-1.0f, 0.0f, 0.0f, 1.0f}; +static const double4 kdouble4_negyw = kdouble4_negxw.yxyw; +static const double4 kdouble4_negzw = kdouble4_negxw.yyxw; + +static const double4 kdouble4_posxw = {1.0f, 0.0f, 0.0f, 1.0f}; +static const double4 kdouble4_posyw = kdouble4_posxw.yxyw; +static const double4 kdouble4_poszw = kdouble4_posxw.yyxw; + +static const double4 kdouble4_negx = {-1.0f, 0.0f, 0.0f, 0.0f}; +static const double4 kdouble4_negy = kdouble4_negx.yxyy; +static const double4 kdouble4_negz = kdouble4_negx.yyxy; +static const double4 kdouble4_negw = kdouble4_negx.yyyx; + +static const double4 kdouble4_ones = kdouble4_posx.xxxx; +static const double4 kdouble4_zero = {}; + +//--------------------------- + +static const double2x2 kdouble2x2_zero = {}; // what is this value 0, or default ctor +static const double3x3 kdouble3x3_zero = {}; +static const double3x4 kdouble3x4_zero = {}; +static const double4x4 kdouble4x4_zero = {}; + +static const double2x2 kdouble2x2_identity = diagonal_matrix((double2)1); +static const double3x3 kdouble3x3_identity = diagonal_matrix((double3)1); +static const double3x4 kdouble3x4_identity = diagonal_matrix3x4((double3)1); +static const double4x4 kdouble4x4_identity = diagonal_matrix((double4)1); + +//---- + +const double2& double2_zero() { return kdouble2_zero; } +const double2& double2_ones() { return kdouble2_ones; } + +const double2& double2_posx() { return kdouble2_posx; } 
+const double2& double2_posy() { return kdouble2_posy; } + +const double2& double2_negx() { return kdouble2_negx; } +const double2& double2_negy() { return kdouble2_negy; } + +//---- + +const double3& double3_zero() { return kdouble3_zero; } +const double3& double3_ones() { return kdouble3_ones; } + +const double3& double3_posx() { return kdouble3_posx; } +const double3& double3_posy() { return kdouble3_posy; } +const double3& double3_posz() { return kdouble3_posz; } + +const double3& double3_negx() { return kdouble3_negx; } +const double3& double3_negy() { return kdouble3_negy; } +const double3& double3_negz() { return kdouble3_negz; } + +//---- + +const double4& double4_zero() { return kdouble4_zero; } +const double4& double4_ones() { return kdouble4_ones; } + +const double4& double4_posx() { return kdouble4_posx; } +const double4& double4_posy() { return kdouble4_posy; } +const double4& double4_posz() { return kdouble4_posz; } +const double4& double4_posw() { return kdouble4_posw; } + +const double4& double4_negx() { return kdouble4_negx; } +const double4& double4_negy() { return kdouble4_negy; } +const double4& double4_negz() { return kdouble4_negz; } +const double4& double4_negw() { return kdouble4_negw; } + +const double4& double4_posxw() { return kdouble4_posxw; } +const double4& double4_posyw() { return kdouble4_posyw; } +const double4& double4_poszw() { return kdouble4_poszw; } + +const double4& double4_negxw() { return kdouble4_negxw; } +const double4& double4_negyw() { return kdouble4_negyw; } +const double4& double4_negzw() { return kdouble4_negzw; } + +//--------------------------- + +const double2x2& double2x2::zero() { return kdouble2x2_zero; } +const double2x2& double2x2::identity() { return kdouble2x2_identity; } + +const double3x3& double3x3::zero() { return kdouble3x3_zero; } +const double3x3& double3x3::identity() { return kdouble3x3_identity; } + +const double3x4& double3x4::zero() { return kdouble3x4_zero; } +const double3x4& double3x4::identity() { return kdouble3x4_identity; } + +const double4x4& double4x4::zero() { return kdouble4x4_zero; } +const double4x4& double4x4::identity() { return kdouble4x4_identity; } + +//--------------------------- + +// These should not be used often. So can stay buried +double2x2::double2x2(double2 diag) + : base((const base&)diagonal_matrix(diag)) {} +double3x3::double3x3(double3 diag) + : base((const base&)diagonal_matrix(diag)) {} +double3x4::double3x4(double3 diag) + : base((const base&)diagonal_matrix3x4(diag)) {} +double4x4::double4x4(double4 diag) + : base((const base&)diagonal_matrix(diag)) {} + +//--------------------------- + +double2x2 diagonal_matrix(double2 x) +{ + double4 xx = zeroext(x); + return double2x2(xx.xw, xx.wy); +} +double3x3 diagonal_matrix(double3 x) +{ + double4 xx = zeroext(x); + return double3x3(xx.xww, xx.wyw, xx.wwz); +} +double3x4 diagonal_matrix3x4(double3 x) +{ + double4 xx = zeroext(x); + return double3x4(xx.xwww, xx.wyww, xx.wwzw); +} +double4x4 diagonal_matrix(double4 x) +{ + double4 xx = x; + xx.w = 0.0f; + double4 ww = xx; + ww.z = x.w; + return double4x4(xx.xwww, xx.wyww, xx.wwzw, ww.wwwz); +} + +//--------------------------- + +// textbook transpose +double2x2 transpose(const double2x2& x) +{ + // std::swap would seem faster here? 
+#if SIMD_SSE +#if SIMD_AVX2 + double4 x0, x1; + x0.xy = x[0]; + x1.xy = x[1]; + + double4 r01 = _mm256_unpacklo_pd(x0, x1); + return (double2x2){r01.lo, r01.hi}; +#else + double2 x0, x1; + x0.xy = x[0]; + x1.xy = x[1]; + + // super slow transpose + double2 r0 = {x0[0], x1[0]}; + double2 r1 = {x0[1], x1[1]}; + return (double2x2){r0, r1}; +#endif +#endif // SIMD_SSE + +#if SIMD_NEON + double2 r0 = vzip1q_f64(x[0], x[1]); + double2 r1 = vzip2q_f64(x[0], x[1]); + return (double2x2){r0, r1}; +#endif // SIMD_NEON +} + +double3x3 transpose(const double3x3& x) +{ + double4 x0, x1, x2; + x0.xyz = x[0]; + x1.xyz = x[1]; + x2.xyz = x[2]; + +#if SIMD_SSE +#if SIMD_AVX2 && 0 + double4 t0 = _mm256_unpacklo_pd(x0, x1); + double4 t1 = _mm256_unpackhi_pd(x0, x1); + + double4 r0 = t0; + r0.hi = x2.lo; + // TODO: fix shuffle, 222 outside 15 range + // looks like op was changed to 4-bit bitmask + // lookup shuffle 4 values, and convert this + // + // 0xde = _MM_SHUFFLE(x,y,z,w) + // #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + // fp0 to fp3 = 2, 3, 1, 3 + + double4 r1 = _mm256_shuffle_pd(t0, x2, 0xde); + double4 r2 = x2; + r2.lo = t1.lo; +#else + // super slow transpose + double3 r0 = {x0[0], x1[0], x2[0]}; + double3 r1 = {x0[1], x1[1], x2[1]}; + double3 r2 = {x0[2], x1[2], x2[2]}; +#endif +#endif // SIMD_SSE + +#if SIMD_NEON + double2 padding = {0}; + double4 r0, r1, r2; + r0.lo = vzip1q_f64(x0.lo, x1.lo); + r1.lo = vzip2q_f64(x0.lo, x1.lo); + r2.lo = vzip1q_f64(x0.hi, x1.hi); + r0.hi = vzip1q_f64(x2.lo, padding); + r1.hi = vzip2q_f64(x2.lo, padding); + r2.hi = vzip1q_f64(x2.hi, padding); +#endif // SIMD_NEON + return (double3x3){r0.xyz, r1.xyz, r2.xyz}; +} + +double4x4 transpose(const double4x4& x) +{ +#if SIMD_SSE +#if SIMD_AVX2 + + // NOTE: similar to _MM_TRANSPOSE4_PS using shuffles + // but old Neon didn't really have shuffle. 
+ + // using shuffles + permute + // unpack runs slower + double4 tmp0, tmp1, tmp2, tmp3; + tmp0 = _mm256_shuffle_pd(x[0], x[1], 0x0); + tmp2 = _mm256_shuffle_pd(x[0], x[1], 0xF); + tmp1 = _mm256_shuffle_pd(x[2], x[3], 0x0); + tmp3 = _mm256_shuffle_pd(x[2], x[3], 0xF); + + double4 r0, r1, r2, r3; + r0 = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); + r1 = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); + r2 = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); + r3 = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); + +#else + // super slow transpose + double4 x0, x1, x2, x3; + x0 = x[0]; + x1 = x[1]; + x2 = x[2]; + x3 = x[3]; + + double4 r0 = {x0[0], x1[0], x2[0], x3[0]}; + double4 r1 = {x0[1], x1[1], x2[1], x3[1]}; + double4 r2 = {x0[2], x1[2], x2[2], x3[2]}; + double4 r3 = {x0[3], x1[3], x2[3], x3[3]}; +#endif +#endif // SIMD_SSE + +#if SIMD_NEON + double4 r0, r1, r2, r3; + r0.lo = vzip1q_f64(x[0].lo, x[1].lo); + r1.lo = vzip2q_f64(x[0].lo, x[1].lo); + r2.lo = vzip1q_f64(x[0].hi, x[1].hi); + r3.lo = vzip2q_f64(x[0].hi, x[1].hi); + r0.hi = vzip1q_f64(x[2].lo, x[3].lo); + r1.hi = vzip2q_f64(x[2].lo, x[3].lo); + r2.hi = vzip1q_f64(x[2].hi, x[3].hi); + r3.hi = vzip2q_f64(x[2].hi, x[3].hi); +#endif + return (double4x4){r0, r1, r2, r3}; +} + +// inverse +double2x2 inverse(const double2x2& x) +{ + double invDet = 1.0f / determinant(x); + if (invDet == 0.0f) return kdouble2x2_zero; + + double2x2 r = transpose(x); + r[0] *= invDet; + r[1] *= invDet; + return r; +} + +double3x3 inverse(const double3x3& x) +{ + double invDet = 1.0f / determinant(x); + if (invDet == 0.0f) return kdouble3x3_zero; + + double3x3 r; + + // this forms the adjoint + r[0] = cross(x[1], x[2]) * invDet; + r[1] = cross(x[2], x[0]) * invDet; + r[2] = cross(x[0], x[1]) * invDet; + return r; +} + +// std::swap has warning on aligned data +inline void swap(double4& a, double4& b) +{ + double4 temp = a; + a = b; + b = temp; +} + +double4x4 inverse(const double4x4& x) +{ + // This is a full gje inverse + + double4x4 a(x), b(kdouble4x4_identity); + bool inversionSucceeded = true; + + // As a evolves from original mat into identity - + // b evolves from identity into inverse(a) + int cols = double4x4::col; + int rows = double4x4::row; + + // Loop over cols of a from left to right, eliminating above and below diag + for (int j = 0; j < rows; j++) { + // Find largest pivot in column j among rows j..2 + int i1 = j; // Row with largest pivot candidate + for (int i = j + 1; i < cols; i++) { + if (fabs(a[i][j]) > fabs(a[i1][j])) { + i1 = i; + } + } + + // Swap rows i1 and j in a and b to put pivot on diagonal + SIMD_NAMESPACE::swap(a[i1], a[j]); + SIMD_NAMESPACE::swap(b[i1], b[j]); + + // Scale row j to have a unit diagonal + double s = a[j][j]; + if (s == 0.0) { + inversionSucceeded = false; + break; + } + + s = 1.0 / s; + b[j] *= s; + a[j] *= s; + + // Eliminate off-diagonal elems in col j of a, doing identical ops to b + for (int i = 0; i < cols; i++) { + if (i != j) { + s = a[i][j]; + b[i] -= b[j] * s; + a[i] -= a[j] * s; + } + } + } + + if (!inversionSucceeded) { + b = kdouble4x4_zero; + } + + return b; +} + +// determinant +// internal only ops +// TODO: could just be macros +inline double3 rotate1(double3 x) { return x.yzx; } +inline double3 rotate2(double3 x) { return x.zxy; } +inline double4 rotate1(double4 x) { return x.yzwx; } +inline double4 rotate2(double4 x) { return x.zwxy; } +inline double4 rotate3(double4 x) { return x.wxyz; } + +double determinant(const double2x2& x) +{ + return cross(x[0], x[1]); +} + +double determinant(const double3x3& x) +{ + 
return reduce_add( + x[0] * (rotate1(x[1]) * rotate2(x[2]) - rotate2(x[1]) * rotate1(x[2]))); +} + +double determinant(const double4x4& x) +{ + double4 codet = x[0] * (rotate1(x[1]) * (rotate2(x[2]) * rotate3(x[3]) - rotate3(x[2]) * rotate2(x[3])) + + rotate2(x[1]) * (rotate3(x[2]) * rotate1(x[3]) - rotate1(x[2]) * rotate3(x[3])) + + rotate3(x[1]) * (rotate1(x[2]) * rotate2(x[3]) - rotate2(x[2]) * rotate1(x[3]))); + return reduce_add(codet.even - codet.odd); +} + +// trace +double trace(const double2x2& x) +{ + return x[0].x + x[1].y; +} + +double trace(const double3x3& x) +{ + return x[0].x + x[1].y + x[2].z; +} + +double trace(const double4x4& x) +{ + return x[0].x + x[1].y + x[2].z + x[3].w; +} + +// TODO: may want pre-transform on double3x4 since it's transposed +// 3 x m3x4 should = 3 element vec +// +// simd premul transform on left does a super expensive transpose to avoid dot +// don't use this, should just dotproducts? +//static half2 mul( half2 x, half2x2 y) { return mul(transpose(y), x); } +// +// +// Here's how to multiply matrices, since default ops won't do this. +// be careful with operator* built-in. Will do column by column mul won't it? +// Maybe that's why *= is missing on matrices. +// +// This is taking each scalar of y[0], hopfully this extends and stays in vec op + +// premul-transform has to do dots +double2 mul(double2 y, const double2x2& x) +{ + double2 r; + r.x = dot(y, x[0]); + r.y = dot(y, x[1]); + return r; +} + +double3 mul(double3 y, const double3x3& x) +{ + double3 r; + r.x = dot(y, x[0]); + r.y = dot(y, x[1]); + r.z = dot(y, x[2]); + return r; +} + +double4 mul(double4 y, const double4x4& x) +{ + double4 r; + r.x = dot(y, x[0]); + r.y = dot(y, x[1]); + r.z = dot(y, x[2]); + r.w = dot(y, x[3]); + return r; +} + +// post-transform at least does a mul madd +double2 mul(const double2x2& x, double2 y) +{ + double2 r = x[0] * y[0]; // no mul(v,v) + r = muladd(x[1], y[1], r); + return r; +} + +double3 mul(const double3x3& x, double3 y) +{ + double3 r = x[0] * y[0]; + r = muladd(x[1], y[1], r); + r = muladd(x[2], y[2], r); + return r; +} + +double4 mul(const double4x4& x, double4 y) +{ + double4 r = x[0] * y[0]; + r = muladd(x[1], y[1], r); + r = muladd(x[2], y[2], r); + r = muladd(x[3], y[3], r); + return r; +} + +// matrix muls using mul madd +double2x2 mul(const double2x2& x, const double2x2& y) +{ + double2x2 r; + + // m * columns + r[0] = mul(x, y[0]); + r[1] = mul(x, y[1]); + + return r; +} + +double3x3 mul(const double3x3& x, const double3x3& y) +{ + double3x3 r; + r[0] = mul(x, y[0]); + r[1] = mul(x, y[1]); + r[2] = mul(x, y[2]); + return r; +} + +double4x4 mul(const double4x4& x, const double4x4& y) +{ + double4x4 r; + r[0] = mul(x, y[0]); + r[1] = mul(x, y[1]); + r[2] = mul(x, y[2]); + r[3] = mul(x, y[3]); + return r; +} + +// sub +double2x2 sub(const double2x2& x, const double2x2& y) +{ + double2x2 r(x); + r[0] -= y[0]; + r[1] -= y[1]; + return r; +} +double3x3 sub(const double3x3& x, const double3x3& y) +{ + double3x3 r(x); + r[0] -= y[0]; + r[1] -= y[1]; + r[2] -= y[2]; + return r; +} +double4x4 sub(const double4x4& x, const double4x4& y) +{ + double4x4 r(x); + r[0] -= y[0]; + r[1] -= y[1]; + r[2] -= y[2]; + r[3] -= y[3]; + return r; +} + +// add +double2x2 add(const double2x2& x, const double2x2& y) +{ + double2x2 r(x); + r[0] += y[0]; + r[1] += y[1]; + return r; +} +double3x3 add(const double3x3& x, const double3x3& y) +{ + double3x3 r(x); + r[0] += y[0]; + r[1] += y[1]; + r[2] += y[2]; + return r; +} +double4x4 add(const double4x4& x, const 
double4x4& y) +{ + double4x4 r(x); + r[0] += y[0]; + r[1] += y[1]; + r[2] += y[2]; + r[3] += y[3]; + return r; +} + +// equal +bool equal(const double2x2& x, const double2x2& y) +{ + return all(x[0] == y[0] & + x[1] == y[1]); +} +bool equal(const double3x3& x, const double3x3& y) +{ + return all(x[0] == y[0] & + x[1] == y[1] & + x[2] == y[2]); +} +bool equal(const double4x4& x, const double4x4& y) +{ + return all(x[0] == y[0] & + x[1] == y[1] & + x[2] == y[2] & + x[3] == y[3]); +} + +// equal_abs +bool equal_abs(const double2x2& x, const double2x2& y, double tol) +{ + return all((abs(x[0] - y[0]) <= tol) & + (abs(x[1] - y[1]) <= tol)); +} +bool equal_abs(const double3x3& x, const double3x3& y, double tol) +{ + return all((abs(x[0] - y[0]) <= tol) & + (abs(x[1] - y[1]) <= tol) & + (abs(x[2] - y[2]) <= tol)); +} +bool equal_abs(const double4x4& x, const double4x4& y, double tol) +{ + return all((abs(x[0] - y[0]) <= tol) & + (abs(x[1] - y[1]) <= tol) & + (abs(x[2] - y[2]) <= tol) & + (abs(x[3] - y[3]) <= tol)); +} + +// equal_rel +bool equal_rel(const double2x2& x, const double2x2& y, double tol) +{ + return all((abs(x[0] - y[0]) <= tol * abs(x[0])) & + (abs(x[1] - y[1]) <= tol * abs(x[1]))); +} +bool equal_rel(const double3x3& x, const double3x3& y, double tol) +{ + return all((abs(x[0] - y[0]) <= tol * abs(x[0])) & + (abs(x[1] - y[1]) <= tol * abs(x[1])) & + (abs(x[2] - y[2]) <= tol * abs(x[2]))); +} +bool equal_rel(const double4x4& x, const double4x4& y, double tol) +{ + return all((abs(x[0] - y[0]) <= tol * abs(x[0])) & + (abs(x[1] - y[1]) <= tol * abs(x[1])) & + (abs(x[2] - y[2]) <= tol * abs(x[2])) & + (abs(x[3] - y[3]) <= tol * abs(x[3]))); +} + +} // namespace SIMD_NAMESPACE +#endif // SIMD_DOUBLE diff --git a/libkram/vectormath/double234.h b/libkram/vectormath/double234.h new file mode 100644 index 00000000..4d899595 --- /dev/null +++ b/libkram/vectormath/double234.h @@ -0,0 +1,921 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +// This is not yet standalone. vectormath234.h includes it. 
+#if USE_SIMDLIB && SIMD_DOUBLE + +// clang-format off + +#ifdef __cplusplus +extern "C" { +#endif + +// define c vector/matrix types +macroVector8TypesAligned(double, double) +macroVector8TypesPacked(double, double) + +// storage type for matrix +typedef struct { double2a columns[2]; } double2x2a; +typedef struct { double3a columns[3]; } double3x3a; +typedef struct { double4a columns[3]; } double3x4a; +typedef struct { double4a columns[4]; } double4x4a; + +// glue to Accelerate +#if SIMD_ACCELERATE_MATH_NAMES +macroVector8TypesStorageRenames(double, simd_double) +#endif // SIMD_ACCELERATE_MATH_NAMES + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +namespace SIMD_NAMESPACE { + +macroVector8TypesStorageRenames(double, double) + + // clang-format on + + SIMD_CALL double2 double2m(double x) +{ + return x; +} +SIMD_CALL double2 double2m(double x, double y) +{ + return {x, y}; +} + +SIMD_CALL double3 double3m(double x) +{ + return x; +} +SIMD_CALL double3 double3m(double x, double y, double z) +{ + return {x, y, z}; +} +SIMD_CALL double3 double3m(double2 v, double z) +{ + double3 r; + r.xy = v; + r.z = z; + return r; +} + +SIMD_CALL double4 double4m(double x) +{ + return x; +} +SIMD_CALL double4 double4m(double2 xy, double2 zw) +{ + double4 r; + r.xy = xy; + r.zw = zw; + return r; +} +SIMD_CALL double4 double4m(double x, double y, double z, double w = 1.0) +{ + return {x, y, z, w}; +} +SIMD_CALL double4 double4m(double3 v, double w = 1.0) +{ + double4 r; + r.xyz = v; + r.w = w; + return r; +} + +//----------------------------------- +// start of implementation + +// zeroext - internal helper +SIMD_CALL double4 zeroext(double2 x) +{ + double4 v = 0; + v.xy = x; + return v; +} +SIMD_CALL double4 zeroext(double3 x) +{ + double4 v = 0; + v.xyz = x; + return v; +} + +#if SIMD_NEON + +SIMD_CALL double reduce_min(double2 x) +{ + return vminvq_f64(x); +} +SIMD_CALL double reduce_min(double4 x) +{ + return fmin(reduce_min(x.lo), reduce_min(x.hi)); +} + +SIMD_CALL double reduce_max(double2 x) +{ + return vmaxvq_f64(x); +} +SIMD_CALL double reduce_max(double4 x) +{ + return fmax(reduce_max(x.lo), reduce_max(x.hi)); +} + +SIMD_CALL double2 min(double2 x, double2 y) +{ + // precise returns x on Nan + return vminnmq_f64(x, y); +} +SIMD_CALL double4 min(double4 x, double4 y) +{ + // precise returns x on Nan + return double4m(min(x.lo, y.lo), min(x.hi, y.hi)); +} + +SIMD_CALL double2 max(double2 x, double2 y) +{ + // precise returns x on Nan + return vmaxnmq_f64(x, y); +} +SIMD_CALL double4 max(double4 x, double4 y) +{ + // precise returns x on Nan + return double4m(max(x.lo, y.lo), max(x.hi, y.hi)); +} + +SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) +{ + // requires __ARM_VFPV4__ + // t passed first unlike sse + return vfmaq_f64(t, x, y); +} +SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) +{ + return double4m(muladd(x.lo, y.lo, t.lo), muladd(x.hi, y.hi, t.hi)); +} + +SIMD_CALL double2 sqrt(double2 x) +{ + return vsqrtq_f64(x); +} +SIMD_CALL double4 sqrt(double4 x) +{ + return double4m(sqrt(x.lo), sqrt(x.hi)); +} + +SIMD_CALL double2 reduce_addv(double2 x) +{ + // 4:1 reduction + x = vpaddq_f64(x, x); + return x.x; +} +SIMD_CALL double4 reduce_addv(double4 x) +{ + // 4:1 reduction + x.lo = vpaddq_f64(x.lo, x.lo); + x.hi = vpaddq_f64(x.hi, x.hi); + x.lo = vpaddq_f64(x.lo, x.hi); + return x.x; // repeat x to all values +} +SIMD_CALL double3 reduce_addv(double3 x) +{ + return reduce_addv(zeroext(x)).x; +} + +// round to nearest | exc +SIMD_CALL double2 round(double2 vv) +{ + return 
vrndnq_f64(vv); +} +SIMD_CALL double4 round(double4 vv) +{ + return double4m(round(vv.lo), round(vv.hi)); +} + +SIMD_CALL double2 ceil(double2 vv) +{ + return vrndpq_f64(vv); +} +SIMD_CALL double4 ceil(double4 vv) +{ + return double4m(ceil(vv.lo), ceil(vv.hi)); +} + +SIMD_CALL double2 floor(double2 vv) +{ + return vrndmq_f64(vv); +} +SIMD_CALL double4 floor(double4 vv) +{ + return double4m(floor(vv.lo), floor(vv.hi)); +} + +#endif // SIMD_NEON + +//---------------------- + +#if SIMD_SSE + +// x64 doesn't seem to have a simd op for min/max reduce +SIMD_CALL double reduce_min(double2 x) +{ + return fmin(x.x, x.y); +} +SIMD_CALL double reduce_min(double4 x) +{ + return fmin(fmin(x.x, x.y), fmin(x.z, x.w)); +} + +SIMD_CALL double reduce_max(double2 x) +{ + return fmax(x.x, x.y); +} +SIMD_CALL double reduce_max(double4 x) +{ + return fmax(fmax(x.x, x.y), fmax(x.z, x.w)); +} + +// needs SIMD_LONG +// needed for precise min/max calls below +#if SIMD_LONG +SIMD_CALL double2 bitselect_forminmax(double2 x, double2 y, long2 mask) +{ + return (double2)(((long2)x & ~mask) | ((long2)y & mask)); +} +// may only want to use this on AVX2 +SIMD_CALL double4 bitselect_forminmax(double4 x, double4 y, long4 mask) +{ + return (double4)(((long4)x & ~mask) | ((long4)y & mask)); +} +#endif // SIMD_LONG + +// precise returns x on Nan +SIMD_CALL double2 min(double2 x, double2 y) +{ + return bitselect_forminmax(_mm_min_pd(x, y), x, y != y); +} +SIMD_CALL double2 max(double2 x, double2 y) +{ + return bitselect_forminmax(_mm_max_pd(x, y), x, y != y); +} +SIMD_CALL double2 muladd(double2 x, double2 y, double2 t) +{ +#ifdef __FMA__ + return _mm_fmadd_pd(x, y, t); +#else + // fallback with not same characteristics + return x * y + t; +#endif +} + +SIMD_CALL double2 sqrt(double2 x) +{ + return _mm_sqrt_pd(x); +} +SIMD_CALL double2 reduce_addv(double2 x) +{ + x = _mm_hadd_pd(x, x); + return x.x; +} +SIMD_CALL double2 round(double2 x) +{ + return _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +SIMD_CALL double2 ceil(double2 x) +{ + return _mm_ceil_pd(x); +} +SIMD_CALL double2 floor(double2 x) +{ + return _mm_floor_pd(x); +} + +// now avx/avx2 can do 4 doubles in one call +#if SIMD_AVX2 + +SIMD_CALL double4 min(double4 x, double4 y) +{ + return bitselect_forminmax(_mm256_min_pd(x, y), x, y != y); +} +SIMD_CALL double4 max(double4 x, double4 y) +{ + return bitselect_forminmax(_mm256_max_pd(x, y), x, y != y); +} +SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) +{ +#ifdef __FMA__ + return _mm256_fmadd_pd(x, y, t); +#else + // fallback with not same characteristics + return x * y + t; +#endif +} + +SIMD_CALL double4 sqrt(double4 x) +{ + return _mm256_sqrt_pd(x); +} + +SIMD_CALL double4 reduce_addv(double4 x) +{ + x = _mm256_hadd_ps(x, x); // xy = x+y,z+w + x = _mm256_hadd_ps(x, x); // x = x+y + return x.x; // repeat x to all values +} +SIMD_CALL double3 reduce_addv(double3 x) +{ + return reduce_addv(zeroext(x)).x; +} + +SIMD_CALL double4 round(double4 vv) +{ + return _mm256_round_pd(vv, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +SIMD_CALL double4 ceil(double4 vv) +{ + return _mm256_ceil_pd(vv); +} +SIMD_CALL double4 floor(double4 vv) +{ + return _mm256_floor_pd(vv); +} + +#else + +// SSE4 ops as a fallback. These have to make 2+ calls. 
+SIMD_CALL double4 min(double4 x, double4 y) +{ + return double4m(min(x.lo, y.lo), min(x.hi, y.hi)); +} +SIMD_CALL double4 max(double4 x, double4 y) +{ + return double4m(max(x.lo, y.lo), max(x.hi, y.hi)); +} +SIMD_CALL double4 muladd(double4 x, double4 y, double4 t) +{ +#ifdef __FMA__ + return double4m(muladd(x.lo, y.lo, t.lo), + muladd(x.hi, y.hi, t.hi)); +#else + // fallback with not same characteristics + return x * y + t; +#endif +} + +SIMD_CALL double4 sqrt(double4 x) +{ + return double4m(sqrt(x.lo), sqrt(x.hi)); +} + +SIMD_CALL double4 reduce_addv(double4 x) +{ + // 4:1 reduction + x.lo = _mm_hadd_pd(x.lo, x.lo); + x.hi = _mm_hadd_pd(x.hi, x.hi); + x.lo = _mm_hadd_pd(x.lo, x.hi); + return x.x; // repeat x to all values +} +SIMD_CALL double3 reduce_addv(double3 x) +{ + return reduce_addv(zeroext(x)).x; +} + +SIMD_CALL double4 round(double4 vv) +{ + return double4m(round(vv.lo), round(vv.hi)); +} +SIMD_CALL double4 ceil(double4 vv) +{ + return double4m(ceil(vv.lo), ceil(vv.hi)); +} +SIMD_CALL double4 floor(double4 vv) +{ + return double4m(floor(vv.lo), floor(vv.hi)); +} + +#endif +#endif // SIMD_SSE + +// end of implementation +//----------------------------------- + +#if SIMD_LONG + +// bitselect +SIMD_CALL double2 bitselect(double2 x, double2 y, long2 mask) +{ + return (double2)bitselect((long2)x, (long2)y, mask); +} +SIMD_CALL double3 bitselect(double3 x, double3 y, long3 mask) +{ + return (double3)bitselect((long3)x, (long3)y, mask); +} +SIMD_CALL double4 bitselect(double4 x, double4 y, long4 mask) +{ + return (double4)bitselect((long4)x, (long4)y, mask); +} + +// select +SIMD_CALL double2 select(double2 x, double2 y, long2 mask) +{ + return bitselect(x, y, mask >> 63); +} +SIMD_CALL double3 select(double3 x, double3 y, long3 mask) +{ + return bitselect(x, y, mask >> 63); +} +SIMD_CALL double4 select(double4 x, double4 y, long4 mask) +{ + return bitselect(x, y, mask >> 63); +} + +#endif // SIMD_LONG + +// double3 leftovers +SIMD_CALL double3 min(double3 x, double3 y) { return vec4to3(min(vec3to4(x), vec3to4(y))); } +SIMD_CALL double3 max(double3 x, double3 y) { return vec4to3(max(vec3to4(x), vec3to4(y))); } +SIMD_CALL double3 muladd(double3 x, double3 y, double3 t) { return vec4to3(muladd(vec3to4(x), vec3to4(y), vec3to4(t))); } +SIMD_CALL double3 sqrt(double3 x) { return vec4to3(sqrt(vec3to4(x))); } +SIMD_CALL double reduce_min(double3 x) { return reduce_min(vec3to4(x)); } +SIMD_CALL double reduce_max(double3 x) { return reduce_max(vec3to4(x)); } +SIMD_CALL double3 round(double3 x) { return vec4to3(round(vec3to4(x))); } +SIMD_CALL double3 ceil(double3 x) { return vec4to3(ceil(vec3to4(x))); } +SIMD_CALL double3 floor(double3 x) { return vec4to3(floor(vec3to4(x))); } + +SIMD_CALL double4 rsqrt(double4 x) { return 1.0 / sqrt(x); } +SIMD_CALL double2 rsqrt(double2 x) { return 1.0 / sqrt(x); } +SIMD_CALL double3 rsqrt(double3 x) { return 1.0 / sqrt(x); } + +SIMD_CALL double2 recip(double2 x) { return 1.0 / x; } +SIMD_CALL double3 recip(double3 x) { return 1.0 / x; } +SIMD_CALL double4 recip(double4 x) { return 1.0 / x; } + +SIMD_CALL double reduce_add(double2 x) { return reduce_addv(x).x; } +SIMD_CALL double reduce_add(double3 x) { return reduce_addv(x).x; } +SIMD_CALL double reduce_add(double4 x) { return reduce_addv(x).x; } + +// clamp +// order matters here for Nan, left op returned on precise min/max +SIMD_CALL double2 clamp(double2 x, double2 minv, double2 maxv) +{ + return min(maxv, max(minv, x)); +} +SIMD_CALL double3 clamp(double3 x, double3 minv, double3 maxv) +{ + return 
min(maxv, max(minv, x)); +} +SIMD_CALL double4 clamp(double4 x, double4 minv, double4 maxv) +{ + return min(maxv, max(minv, x)); +} + +// saturate +SIMD_CALL double2 saturate(double2 x) { return clamp(x, 0, (double2)1); } +SIMD_CALL double3 saturate(double3 x) { return clamp(x, 0, (double3)1); } +SIMD_CALL double4 saturate(double4 x) { return clamp(x, 0, (double4)1); } + +// lerp - another easy one +SIMD_CALL double2 lerp(double2 x, double2 y, double2 t) { return x + t * (y - x); } +SIMD_CALL double3 lerp(double3 x, double3 y, double3 t) { return x + t * (y - x); } +SIMD_CALL double4 lerp(double4 x, double4 y, double4 t) { return x + t * (y - x); } + +// dot +SIMD_CALL double dot(double2 x, double2 y) { return reduce_add(x * y); } +SIMD_CALL double dot(double3 x, double3 y) { return reduce_add(x * y); } +SIMD_CALL double dot(double4 x, double4 y) { return reduce_add(x * y); } + +// length_squared +SIMD_CALL double length_squared(double2 x) { return reduce_add(x * x); } +SIMD_CALL double length_squared(double3 x) { return reduce_add(x * x); } +SIMD_CALL double length_squared(double4 x) { return reduce_add(x * x); } + +// length +// worth using simd_sqrt? +SIMD_CALL double length(double2 x) { return ::sqrt(reduce_add(x * x)); } +SIMD_CALL double length(double3 x) { return ::sqrt(reduce_add(x * x)); } +SIMD_CALL double length(double4 x) { return ::sqrt(reduce_add(x * x)); } + +// distance +SIMD_CALL double distance(double2 x, double2 y) { return length(x - y); } +SIMD_CALL double distance(double3 x, double3 y) { return length(x - y); } +SIMD_CALL double distance(double4 x, double4 y) { return length(x - y); } + +// normalize +// optimized by staying in reg +// x * invlength(x) +SIMD_CALL double4 normalize(double4 x) { return x / sqrt(reduce_addv(x * x)).x; } +SIMD_CALL double2 normalize(double2 x) { return x / sqrt(reduce_addv(x * x)).x; } +SIMD_CALL double3 normalize(double3 x) { return x / sqrt(reduce_addv(x * x)).x; } + +// abs +SIMD_CALL double2 abs(double2 x) +{ + return bitselect(0.0, x, 0x7fffffffffffffff); +} +SIMD_CALL double3 abs(double3 x) +{ + return bitselect(0.0, x, 0x7fffffffffffffff); +} +SIMD_CALL double4 abs(double4 x) +{ + return bitselect(0.0, x, 0x7fffffffffffffff); +} + +// cross +SIMD_CALL double cross(double2 x, double2 y) +{ + return x.x * y.y - x.y * y.x; +} +SIMD_CALL double3 cross(double3 x, double3 y) +{ + return x.yzx * y.zxy - x.zxy * y.yzx; +} + +// equal +// == and != return a int234 vector, so need these to match other vecs +SIMD_CALL bool equal(double2 x, double2 y) +{ + return all(x == y); +} +SIMD_CALL bool equal(double3 x, double3 y) +{ + return all(x == y); +} +SIMD_CALL bool equal(double4 x, double4 y) +{ + return all(x == y); +} + +// equal_abs +SIMD_CALL bool equal_abs(double2 x, double2 y, double tol) +{ + return all((abs(x - y) <= tol)); +} +SIMD_CALL bool equal_abs(double3 x, double3 y, double tol) +{ + return all((abs(x - y) <= tol)); +} +SIMD_CALL bool equal_abs(double4 x, double4 y, double tol) +{ + return all((abs(x - y) <= tol)); +} + +// equal_rel +SIMD_CALL bool equal_rel(double2 x, double2 y, double tol) +{ + return all((abs(x - y) <= tol * ::abs(x.x))); +} +SIMD_CALL bool equal_rel(double3 x, double3 y, double tol) +{ + return all((abs(x - y) <= tol * ::abs(x.x))); +} +SIMD_CALL bool equal_rel(double4 x, double4 y, double tol) +{ + return all((abs(x - y) <= tol * ::abs(x.x))); +} + +// step +SIMD_CALL double2 step(double2 edge, double2 x) +{ + return bitselect((double2)1, 0, x < edge); +} +SIMD_CALL double3 step(double3 edge, double3 x) +{ 
+ return bitselect((double3)1, 0, x < edge); +} +SIMD_CALL double4 step(double4 edge, double4 x) +{ + return bitselect((double4)1, 0, x < edge); +} + +// smoothstep +SIMD_CALL double2 smoothstep(double2 edge0, double2 edge1, double2 x) +{ + double2 t = saturate((x - edge0) / (edge0 - edge1)); + return t * t * (3 - 2 * t); +} +SIMD_CALL double3 smoothstep(double3 edge0, double3 edge1, double3 x) +{ + double3 t = saturate((x - edge0) / (edge0 - edge1)); + return t * t * (3 - 2 * t); +} +SIMD_CALL double4 smoothstep(double4 edge0, double4 edge1, double4 x) +{ + double4 t = saturate((x - edge0) / (edge0 - edge1)); + return t * t * (3 - 2 * t); +} + +// fract +SIMD_CALL double2 fract(double2 x) +{ + return min(x - floor(x), 0x1.fffffffffffffp-1); +} +SIMD_CALL double3 fract(double3 x) +{ + return min(x - floor(x), 0x1.fffffffffffffp-1); +} +SIMD_CALL double4 fract(double4 x) +{ + return min(x - floor(x), 0x1.fffffffffffffp-1); +} + +SIMD_CALL bool is_nan(double2 x) +{ + return any(x != x); +} +SIMD_CALL bool is_nan(double3 x) +{ + return any(x != x); +} +SIMD_CALL bool is_nan(double4 x) +{ + return any(x != x); +} + +SIMD_CALL double2 fix_nan(double2 x, double2 replace) +{ + return min(replace, x); +} +SIMD_CALL double3 fix_nan(double3 x, double3 replace) +{ + return min(replace, x); +} +SIMD_CALL double4 fix_nan(double4 x, double4 replace) +{ + return min(replace, x); +} + +// fast conversions where possible +// need non-const too +SIMD_CALL const double3& as_double3(const double4& m) +{ + return reinterpret_cast<const double3&>(m); +} +SIMD_CALL const double3* as_double3(const double4* m) +{ + return reinterpret_cast<const double3*>(m); +} + +// this one is dangerous, since w is undefined +//SIMD_CALL const double4& as_double4(const double3& m) { +// return reinterpret_cast<const double4&>(m); +//} + +//------------------- +// Functions + +// clang-format off + +// power series +macroVectorRepeatFnDecl(double, log) +macroVectorRepeatFnDecl(double, exp) + +// trig +macroVectorRepeatFnDecl(double, cos) +macroVectorRepeatFnDecl(double, sin) +macroVectorRepeatFnDecl(double, tan) + +macroVectorRepeatFnDecl(double, acos) +macroVectorRepeatFnDecl(double, asin) +macroVectorRepeatFnDecl(double, atan) + +macroVectorRepeatFn2Decl(double, atan2) + + // clang-format on + + SIMD_CALL double2 pow(double2 x, double2 y) +{ + return exp(log(x) * y); +} +SIMD_CALL double3 pow(double3 x, double3 y) { return exp(log(x) * y); } +SIMD_CALL double4 pow(double4 x, double4 y) { return exp(log(x) * y); } + +//------------------- +// constants + +const double2& double2_zero(); +const double2& double2_ones(); + +const double2& double2_posx(); +const double2& double2_posy(); + +const double2& double2_negx(); +const double2& double2_negy(); + +//---- + +const double3& double3_zero(); +const double3& double3_ones(); + +const double3& double3_posx(); +const double3& double3_posy(); +const double3& double3_posz(); + +const double3& double3_negx(); +const double3& double3_negy(); +const double3& double3_negz(); + +//---- + +const double4& double4_zero(); +const double4& double4_ones(); + +const double4& double4_posx(); +const double4& double4_posy(); +const double4& double4_posz(); +const double4& double4_posw(); + +const double4& double4_negx(); +const double4& double4_negy(); +const double4& double4_negz(); +const double4& double4_negw(); + +const double4& double4_posxw(); +const double4& double4_posyw(); +const double4& double4_poszw(); + +const double4& double4_negxw(); +const double4& double4_negyw(); +const double4& double4_negzw(); + +//------------------- +// Matrix
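+// Usage sketch (assuming the double types follow the same column-major, post-multiply conventions as the float types):
+//   double2x2 m(double2_posx(), double2_posy()); // columns (1,0) and (0,1) -> identity
+//   double2 v = mul(m, double2_ones()); // matrix * column vector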
+ +struct double2x2 : double2x2a { + // can be split out to traits + static constexpr int col = 2; + static constexpr int row = 2; + using column_t = double2; + using scalar_t = double; + using base = double2x2a; + + static const double2x2& zero(); + static const double2x2& identity(); + + double2x2() {} // no default init + explicit double2x2(double2 diag); + double2x2(double2 c0, double2 c1) + : base((base){c0, c1}) {} + double2x2(const base& m) + : base(m) {} + + // simd lacks these ops + double2& operator[](int idx) { return columns[idx]; } + const double2& operator[](int idx) const { return columns[idx]; } +}; + +struct double3x3 : double3x3a { + static constexpr int col = 3; + static constexpr int row = 3; + using column_t = double3; + using scalar_t = double; + using base = double3x3a; + + // Done as wordy c funcs otherwize. Funcs allow statics to init. + static const double3x3& zero(); + static const double3x3& identity(); + + double3x3() {} // no default init + explicit double3x3(double3 diag); + double3x3(double3 c0, double3 c1, double3 c2) + : base((base){c0, c1, c2}) {} + double3x3(const base& m) + : base(m) {} + + double3& operator[](int idx) { return columns[idx]; } + const double3& operator[](int idx) const { return columns[idx]; } +}; + +// This is mostly a transposed holder for a 4x4, so very few ops defined +// Can also serve as a SOA for some types of cpu math. +struct double3x4 : double3x4a { + static constexpr int col = 3; + static constexpr int row = 4; + using column_t = double4; + using scalar_t = double; + using base = double3x4a; + + static const double3x4& zero(); + static const double3x4& identity(); + + double3x4() {} // no default init + explicit double3x4(double3 diag); + double3x4(double4 c0, double4 c1, double4 c2) + : base((base){c0, c1, c2}) {} + double3x4(const base& m) + : base(m) {} + + double4& operator[](int idx) { return columns[idx]; } + const double4& operator[](int idx) const { return columns[idx]; } +}; + +struct double4x4 : double4x4a { + static constexpr int col = 4; + static constexpr int row = 4; + using column_t = double4; + using scalar_t = double; + using base = double4x4a; + + static const double4x4& zero(); + static const double4x4& identity(); + + double4x4() {} // no default init + explicit double4x4(double4 diag); + double4x4(double4 c0, double4 c1, double4 c2, double4 c3) + : base((base){c0, c1, c2, c3}) {} + double4x4(const base& m) + : base(m) {} + + double4& operator[](int idx) { return columns[idx]; } + const double4& operator[](int idx) const { return columns[idx]; } +}; + +double2x2 diagonal_matrix(double2 x); +double3x3 diagonal_matrix(double3 x); +double3x4 diagonal_matrix3x4(double3 x); +double4x4 diagonal_matrix(double4 x); + +// ops need to call these + +// using refs here, 3x3 and 4x4 are large to pass by value (3 simd regs) +double2x2 transpose(const double2x2& x); +double3x3 transpose(const double3x3& x); +double4x4 transpose(const double4x4& x); + +double2x2 inverse(const double2x2& x); +double3x3 inverse(const double3x3& x); +double4x4 inverse(const double4x4& x); + +double determinant(const double2x2& x); +double determinant(const double3x3& x); +double determinant(const double4x4& x); + +double trace(const double2x2& x); +double trace(const double3x3& x); +double trace(const double4x4& x); + +// premul = dot + premul +double2 mul(double2 y, const double2x2& x); +double3 mul(double3 y, const double3x3& x); +double4 mul(double4 y, const double4x4& x); + +// posmul = mul + mad +double2x2 mul(const double2x2& x, const 
double2x2& y); +double3x3 mul(const double3x3& x, const double3x3& y); +double4x4 mul(const double4x4& x, const double4x4& y); + +double2 mul(const double2x2& x, double2 y); +double3 mul(const double3x3& x, double3 y); +double4 mul(const double4x4& x, double4 y); + +double2x2 sub(const double2x2& x, const double2x2& y); +double3x3 sub(const double3x3& x, const double3x3& y); +double4x4 sub(const double4x4& x, const double4x4& y); + +double2x2 add(const double2x2& x, const double2x2& y); +double3x3 add(const double3x3& x, const double3x3& y); +double4x4 add(const double4x4& x, const double4x4& y); + +bool equal(const double2x2& x, const double2x2& y); +bool equal(const double3x3& x, const double3x3& y); +bool equal(const double4x4& x, const double4x4& y); + +// equal_abs +bool equal_abs(const double2x2& x, const double2x2& y, double tol); +bool equal_abs(const double3x3& x, const double3x3& y, double tol); +bool equal_abs(const double4x4& x, const double4x4& y, double tol); + +// equal_rel +bool equal_rel(const double2x2& x, const double2x2& y, double tol); +bool equal_rel(const double3x3& x, const double3x3& y, double tol); +bool equal_rel(const double4x4& x, const double4x4& y, double tol); + +// clang-format off + +// operators for C++ +macroMatrixOps(double2x2); +macroMatrixOps(double3x3); +// TODO: no mat ops yet on storage type double3x4 +// macroMatrixOps(double3x4); +macroMatrixOps(double4x4); + +// clang-format on + +// fast conversions where possible +SIMD_CALL const double3x3& as_double3x3(const double4x4& m) +{ + return reinterpret_cast(m); +} +SIMD_CALL const double3x3* as_double3x3(const double4x4* m) +{ + return reinterpret_cast(m); +} + +} //namespace SIMD_NAMESPACE + +#endif + +#endif // USE_SIMDLIB && SIMD_DOUBLE diff --git a/libkram/vectormath/float234.cpp b/libkram/vectormath/float234.cpp new file mode 100644 index 00000000..140dca1d --- /dev/null +++ b/libkram/vectormath/float234.cpp @@ -0,0 +1,1054 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +// This has to include this, not float234.h +#include "vectormath234.h" + +#if USE_SIMDLIB && SIMD_FLOAT + +// I don't trust the ssemathfun approximations. But providing them in case +// want to go futher. Really 3 choices - use c calls, use approximations, +// or use platform simd lib that implements these (f.e. Accelerate). +//#define SIMD_FAST_MATH 0 +//#if SIMD_FAST_MATH +//// fp32 ops only +//#include "sse_mathfun.h" +//#endif // SIMD_FAST_MATH + +#if SIMD_ACCELERATE_MATH +// TODO: reduce this header to just calls use (f.e. 
geometry, etc) +#include +#endif // SIMD_ACCELERATE_MATH + +namespace SIMD_NAMESPACE { + +// clang-format off + +#if SIMD_ACCELERATE_MATH +// These will get inlined here from the template +macroVectorRepeatFnImpl(float, log) +macroVectorRepeatFnImpl(float, exp) + +macroVectorRepeatFnImpl(float, sin) +macroVectorRepeatFnImpl(float, cos) +macroVectorRepeatFnImpl(float, tan) +#endif // SIMD_ACCELERATE_MATH + +#if SIMD_CMATH_MATH + +macroVectorRepeatFnImpl(float, log, ::logf) +macroVectorRepeatFnImpl(float, exp, ::expf) + +macroVectorRepeatFnImpl(float, sin, ::sinf) +macroVectorRepeatFnImpl(float, cos, ::cosf) +macroVectorRepeatFnImpl(float, tan, ::tanf) + +#endif // SIMD_CMATH_MATH + +// clang-format off + +// Wish cmath had this +inline void sincosf(float radians, float& s, float& c) { + s = sinf(radians); + c = cosf(radians); +} + +// These aren't embedded in function, so may have pre-init ordering issues. +// or could add pre-init order to skip using functions. +// Expose these through function calls as const& + +static const float2 kfloat2_posx = {1.0f, 0.0f}; +static const float2 kfloat2_posy = kfloat2_posx.yx; + +static const float2 kfloat2_negx = {-1.0f, 0.0f}; +static const float2 kfloat2_negy = kfloat2_negx.yx; + +static const float2 kfloat2_ones = kfloat2_posx.xx; +static const float2 kfloat2_zero = {}; + +//---- + +static const float3 kfloat3_posx = {1.0f, 0.0f, 0.0f}; +static const float3 kfloat3_posy = kfloat3_posx.yxy; +static const float3 kfloat3_posz = kfloat3_posx.yyx; + +static const float3 kfloat3_negx = {-1.0f, 0.0f, 0.0f}; +static const float3 kfloat3_negy = kfloat3_negx.yxy; +static const float3 kfloat3_negz = kfloat3_negx.yyx; + +static const float3 kfloat3_ones = kfloat3_posx.xxx; +static const float3 kfloat3_zero = {}; + +//---- + +static const float4 kfloat4_posx = {1.0f, 0.0f, 0.0f, 0.0f}; +static const float4 kfloat4_posy = kfloat4_posx.yxyy; +static const float4 kfloat4_posz = kfloat4_posx.yyxy; +static const float4 kfloat4_posw = kfloat4_posx.yyyx; + +static const float4 kfloat4_negxw = {-1.0f, 0.0f, 0.0f, 1.0f}; +static const float4 kfloat4_negyw = kfloat4_negxw.yxyw; +static const float4 kfloat4_negzw = kfloat4_negxw.yyxw; + +static const float4 kfloat4_posxw = {1.0f, 0.0f, 0.0f, 1.0f}; +static const float4 kfloat4_posyw = kfloat4_posxw.yxyw; +static const float4 kfloat4_poszw = kfloat4_posxw.yyxw; + +static const float4 kfloat4_negx = {-1.0f, 0.0f, 0.0f, 0.0f}; +static const float4 kfloat4_negy = kfloat4_negx.yxyy; +static const float4 kfloat4_negz = kfloat4_negx.yyxy; +static const float4 kfloat4_negw = kfloat4_negx.yyyx; + +static const float4 kfloat4_ones = kfloat4_posx.xxxx; +static const float4 kfloat4_zero = {}; + +//--------------------------- + +static const float2x2 kfloat2x2_zero = {}; // what is this value 0, or default ctor +static const float3x3 kfloat3x3_zero = {}; +static const float3x4 kfloat3x4_zero = {}; +static const float4x4 kfloat4x4_zero = {}; + +static const float2x2 kfloat2x2_identity = diagonal_matrix(kfloat2_ones); +static const float3x3 kfloat3x3_identity = diagonal_matrix(kfloat3_ones); +static const float3x4 kfloat3x4_identity = diagonal_matrix3x4(kfloat3_ones); +static const float4x4 kfloat4x4_identity = diagonal_matrix(kfloat4_ones); + +//---- + +const float2& float2_zero(){ return kfloat2_zero; } +const float2& float2_ones(){ return kfloat2_ones; } + +const float2& float2_posx(){ return kfloat2_posx; } +const float2& float2_posy(){ return kfloat2_posy; } + +const float2& float2_negx(){ return kfloat2_negx; } +const float2& 
float2_negy(){ return kfloat2_negy; } + +//---- + +const float3& float3_zero(){ return kfloat3_zero; } +const float3& float3_ones(){ return kfloat3_ones; } + +const float3& float3_posx(){ return kfloat3_posx; } +const float3& float3_posy(){ return kfloat3_posy; } +const float3& float3_posz(){ return kfloat3_posz; } + +const float3& float3_negx(){ return kfloat3_negx; } +const float3& float3_negy(){ return kfloat3_negy; } +const float3& float3_negz(){ return kfloat3_negz; } + +//---- + +const float4& float4_zero(){ return kfloat4_zero; } +const float4& float4_ones(){ return kfloat4_ones; } + +const float4& float4_posx(){ return kfloat4_posx; } +const float4& float4_posy(){ return kfloat4_posy; } +const float4& float4_posz(){ return kfloat4_posz; } +const float4& float4_posw(){ return kfloat4_posw; } + +const float4& float4_negx(){ return kfloat4_negx; } +const float4& float4_negy(){ return kfloat4_negy; } +const float4& float4_negz(){ return kfloat4_negz; } +const float4& float4_negw(){ return kfloat4_negw; } + +const float4& float4_posxw(){ return kfloat4_posxw; } +const float4& float4_posyw(){ return kfloat4_posyw; } +const float4& float4_poszw(){ return kfloat4_poszw; } + +const float4& float4_negxw(){ return kfloat4_negxw; } +const float4& float4_negyw(){ return kfloat4_negyw; } +const float4& float4_negzw(){ return kfloat4_negzw; } + +//--------------- + +const float2x2& float2x2::zero() { return kfloat2x2_zero; } +const float2x2& float2x2::identity() { return kfloat2x2_identity; } + +const float3x3& float3x3::zero() { return kfloat3x3_zero; } +const float3x3& float3x3::identity() { return kfloat3x3_identity; } + +const float3x4& float3x4::zero() { return kfloat3x4_zero; } +const float3x4& float3x4::identity() { return kfloat3x4_identity; } + +const float4x4& float4x4::zero() { return kfloat4x4_zero; } +const float4x4& float4x4::identity() { return kfloat4x4_identity; } + +//---------------------------------- + +static quatf quatf_zero(0.0f, 0.0f, 0.0f, 0.0f); +static quatf quatf_identity(0.0f, 0.0f, 0.0f, 1.0f); + +const quatf& quatf::zero() { return quatf_zero; } +const quatf& quatf::identity() { return quatf_identity; } + +//--------------------------- + +float4x4 float4x4m(const float3x4& m) { + float4x4 m44; + m44[0] = m[0]; + m44[1] = m[1]; + m44[2] = m[2]; + m44[3] = float4_posw(); + + return transpose(m44); +} + +float3x4 float3x4m(const float4x4& m) { + float4x4 m44(transpose(m)); + return (const float3x4&)m44; +} + +//--------------------------- + +// These should not be used often. 
So can stay buried +float2x2::float2x2(float2 diag) +: base((const base&)diagonal_matrix(diag)) { } +float3x3::float3x3(float3 diag) +: base((const base&)diagonal_matrix(diag)) { } +float3x4::float3x4(float3 diag) +: base((const base&)diagonal_matrix3x4(diag)) { } +float4x4::float4x4(float4 diag) +: base((const base&)diagonal_matrix(diag)) { } + +//--------------------------- + +float2x2 diagonal_matrix(float2 x) { + float4 xx = zeroext(x); + return float2x2(xx.xw, xx.wy); +} +float3x3 diagonal_matrix(float3 x) { + float4 xx = zeroext(x); + return float3x3(xx.xww, xx.wyw, xx.wwz); +} +float3x4 diagonal_matrix3x4(float3 x) { + float4 xx = zeroext(x); + return float3x4(xx.xwww, xx.wyww, xx.wwzw); +} +float4x4 diagonal_matrix(float4 x) { + float4 xx = x; xx.w = 0.0f; + float4 ww = xx; ww.z = x.w; + return float4x4(xx.xwww, xx.wyww, xx.wwzw, ww.wwwz); +} + +//-------------------------------------- + +// textbook transpose +float2x2 transpose(const float2x2& x) { + float4 x0, x1; + x0.xy = x[0]; + x1.xy = x[1]; +#if SIMD_SSE + float4 r01 = _mm_unpacklo_ps(x0, x1); +#else + float4 r01 = vzip1q_f32(x0, x1); +#endif + return (float2x2){r01.lo, r01.hi}; +} + +float3x3 transpose(const float3x3& x) { + float4 x0, x1, x2; + x0.xyz = x[0]; + x1.xyz = x[1]; + x2.xyz = x[2]; +#if SIMD_SSE + float4 t0 = _mm_unpacklo_ps(x0, x1); + float4 t1 = _mm_unpackhi_ps(x0, x1); + float4 r0 = t0; r0.hi = x2.lo; + float4 r1 = _mm_shuffle_ps(t0, x2, 0xde); + float4 r2 = x2; r2.lo = t1.lo; +#else + float4 padding = { 0 }; + float4 t0 = vzip1q_f32(x0,x2); + float4 t1 = vzip2q_f32(x0,x2); + float4 t2 = vzip1q_f32(x1,padding); + float4 t3 = vzip2q_f32(x1,padding); + float4 r0 = vzip1q_f32(t0,t2); + float4 r1 = vzip2q_f32(t0,t2); + float4 r2 = vzip1q_f32(t1,t3); +#endif + return (float3x3){r0.xyz, r1.xyz, r2.xyz}; +} + +float4x4 transpose(const float4x4& x) { +#if SIMD_SSE + // shuffles are faster than unpack + float4x4 xt(x); + _MM_TRANSPOSE4_PS(xt[0], xt[1], xt[2], xt[3]); + return xt; +#else + float4 t0 = vzip1q_f32(x[0],x[2]); + float4 t1 = vzip2q_f32(x[0],x[2]); + float4 t2 = vzip1q_f32(x[1],x[3]); + float4 t3 = vzip2q_f32(x[1],x[3]); + float4 r0 = vzip1q_f32(t0,t2); + float4 r1 = vzip2q_f32(t0,t2); + float4 r2 = vzip1q_f32(t1,t3); + float4 r3 = vzip2q_f32(t1,t3); + return (float4x4){r0,r1,r2,r3}; +#endif +} + +// inverse +float2x2 inverse(const float2x2& x) { + float det = determinant(x); + if (det == 0.0f) return kfloat2x2_zero; + float invDet = 1.0f / det; + + float2x2 r = transpose(x); + r[0] *= invDet; + r[1] *= invDet; + return r; +} + +float3x3 inverse(const float3x3& x) { + float det = determinant(x); + if (det == 0.0f) return kfloat3x3_zero; + float invDet = 1.0f / det; + + float3x3 r; + + // this forms the adjoint + r[0] = cross(x[1], x[2]) * invDet; + r[1] = cross(x[2], x[0]) * invDet; + r[2] = cross(x[0], x[1]) * invDet; + return r; +} + +// std::swap has warning on aligned data +inline void swap(float4& a, float4& b) { + float4 temp = a; + a = b; + b = temp; +} + +float4x4 inverse(const float4x4& x) { + // This is a full Gauss-Jordan elimination (gje) inverse + + float4x4 a(x), b(kfloat4x4_identity); + bool inversionSucceeded = true; + + // As a evolves from original mat into identity - + // b evolves from identity into inverse(a) + int cols = float4x4::col; + int rows = float4x4::row; + + // Loop over cols of a from left to right, eliminating above and below diag + for (int j=0; j<cols; ++j) { + // Find the row i1 with the largest magnitude entry in col j, at or below the diag + int i1 = j; + for (int i=j+1; i<rows; ++i) { + if ( fabsf(a[i][j]) > fabsf(a[i1][j]) ) { + i1 = i; + } + } + + // Swap rows i1 and j in a and b to put pivot on diagonal + SIMD_NAMESPACE::swap(a[i1], a[j]); + SIMD_NAMESPACE::swap(b[i1], b[j]); + + // Scale
row j to have a unit diagonal + float s = a[j][j]; + if ( s == 0.0f ) { + inversionSucceeded = false; + break; + } + + s = 1.0f/s; + b[j] *= s; + a[j] *= s; + + // Eliminate off-diagonal elems in col j of a, doing identical ops to b + for (int i=0; i= 0.995f) + { + return lerp(q0,q1,t); + } + else + { + // expensive + float thetaHalf = acosf(cosThetaHalf); + float sinThetaHalf = sinf(thetaHalf); + + float s0 = sinf(thetaHalf * (1-t)) / sinThetaHalf; // at t=0, s0 = 1 + float s1 = sinf(thetaHalf * t) / sinThetaHalf; // at t=1, s1 = 1 + + return quatf(s0 * q0.v+ s1 * q1.v); + } +} + +// compute control points for a bezier spline segment +inline void quat_bezier_cp_impl(quatf q0, quatf q1, quatf q2, + quatf& a1, quatf& b1) +{ + // TODO: find out of these were quat or vec mul? + // Double(q0, q1); + a1.v = 2.0f * dot(q0.v,q1.v) * q1.v - q0.v; + + // Bisect(a1, q2); + a1.v = (a1.v + q2.v); + a1.v = normalize(a1.v); + + // Double(a1, q1); + b1.v = 2.0f * dot(a1.v,q1.v) * q1.v - a1.v; +} + + +// compute control points for a cubic bezier spline segment (quats must be smallest angle) +void quat_bezier_cp(quatf q0, quatf q1, quatf q2, quatf q3, + quatf& a1, quatf& b2) +{ + quatf b1, a2; // b1 unused calc + quat_bezier_cp_impl(q0,q1,q1, a1,b1); + quat_bezier_cp_impl(q1,q2,q3, a2,b2); +} + + +// spherical cubic bezier spline interpolation +// takes in contol points +quatf quat_bezer_slerp(quatf q0, quatf b, quatf c, quatf q1, float t) +{ + // deCastljau interpolation of the control points + quatf mid(slerp(b, c, t)); + + return slerp(slerp(slerp(q0, b, t), mid, t), + slerp(mid, slerp(c, q1, t), t), + t); +} + +// spherical cubic bezier spline interpolation +// takes in contol points +quatf quat_bezer_lerp(quatf q0, quatf b, quatf c, quatf q1, float t) +{ + // deCastljau interpolation of the control points + quatf mid(lerp(b, c, t)); + + return lerp( + lerp(lerp(q0, b, t), mid, t), + lerp(mid, lerp(c, q1, t), t), + t); +} + +// ---------------------- + +void transpose_affine(float4x4& m) +{ + // TODO: see other tranpsose not using shuffles and do that. 
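+ // In place: writes the transposed first three columns of m; m[3] is deliberately left for the caller (see note at the end).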
+ // TODO: use platform shuffles on Neon + + // avoid copy and one shuffle + float4 tmp3, tmp2, tmp1, tmp0; + + // using sse2neon to port this + tmp0 = _mm_shuffle_ps(m[0], m[1], 0x44); + tmp2 = _mm_shuffle_ps(m[0], m[1], 0xEE); + + tmp1 = _mm_shuffle_ps(m[2], m[3], 0x44); + tmp3 = _mm_shuffle_ps(m[2], m[3], 0xEE); + + m[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88); + m[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD); + m[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88); + + // skips m[3] - known 0001 +} + +float4x4 inverse_tr(const float4x4& mtx) +{ + float4x4 inverse(mtx); + inverse[3] = float4_negw(); // will be flipped by matrix mul + transpose_affine(inverse); // handle rotation (R)inv = (R)T + + inverse[3] = inverse * (-mtx[3]); // 1 mul, 3 mads + + return inverse; +} + +// invert a row vector matrix +float4x4 inverse_tru(const float4x4& mtx) +{ + bool success = false; + + float scaleX = length_squared(mtx[0]); + + float4x4 inverse(mtx); + if (scaleX > 1e-6f) { + inverse[3] = float4_negw(); + + transpose_affine(inverse); + + // all rows/columns in orthogonal tfm have same magnitude with uniform scale + float4 invScaleX = float4m(1.0f / scaleX); // inverse squared + + // scale the rotation matrix by scaling inverse + inverse[0] *= invScaleX; + inverse[1] *= invScaleX; + inverse[2] *= invScaleX; + + // handle the translation + inverse[3] = inverse * (-mtx[3]); + + success = true; + (void)(success); + } + + return inverse; +} + +float4x4 float4x4m_tr(float3 t, quatf r) { + float4x4 m(float4x4::identity()); + m[3].xyz = t; + + m *= float4x4m(r); + return m; +} + +// TODO: there are faster ways to apply post rot, post scale +float4x4 float4x4m_trs(float3 t, quatf r, float3 scale) { + return translation(t) * float4x4m(r) * float4x4(float4m(scale,1.0f)); +} + +float4x4 float4x4m_inverse_trs(float3 t, quatf r, float3 scale) { + // 1/S * RT * -T + return float4x4(recip(float4m(scale,1.0f))) * transpose(float4x4m(r)) * translation(-t); +} + + +// leaving this in here, since it can be further optimized +float4x4 float4x4m_tru(float3 t, quatf r, float scale) { + return float4x4m_trs(t, r, float3m(scale)); +} + +float4x4 float4x4m_tru_inverse(float3 t, quatf r, float scale) { + return float4x4m_inverse_trs(t, r, float3m(scale)); +} + +float4x4 inverse_trs(const float4x4& mtx) +{ + bool success = false; + + float4x4 inverse(mtx); + + // TODO: fix handling of failure + // compute the scaling terms (4 dots) + // float3 scale = calcScaleSquaredRowTfm(m); + // if (all(scale > float3(1e-6f)) { + inverse[3] = float4_negw(); // neccessary for simple inverse to hold + + transpose_affine(inverse); + + // this is cheaper than 3 dot's above, just mul/add + float4 invScale = recip(inverse[0]*inverse[0] + + inverse[1]*inverse[1] + + inverse[2]*inverse[2]); + + // scale the rotation matrix by scaling inverse + inverse[0] *= invScale; + inverse[1] *= invScale; + inverse[2] *= invScale; + inverse[3] = inverse * (-mtx[3]); + + success = true; + (void)(success); + //} + + return inverse; +} + +float4x4 float4x4m(char axis, float radians) +{ + float sinTheta, cosTheta; + sincosf(radians, sinTheta, cosTheta); + + float4x4 m; + m[3] = float4_posw(); + + switch(axis) { + case 'x': + { + m[0] = float4_posx(); + m[1] = float4m(0.0f, cosTheta, sinTheta, 0.0f); + m[2] = float4m(0.0f, -sinTheta, cosTheta, 0.0f); + break; + } + + case 'y': + { + m[0] = float4m(cosTheta, 0.0f, -sinTheta, 0.0f); + m[1] = float4_posy(); + m[2] = float4m(sinTheta, 0.0f, cosTheta, 0.0f); + break; + } + + case 'z': + { + m[0] = float4m( cosTheta, sinTheta, 0.0f, 0.0f); + 
m[1] = float4m(-sinTheta, cosTheta, 0.0f, 0.0f); + m[2] = float4_posz(); + break; + } + } + return m; +} + +float4x4 perspective_rhcs(float fovyRadians, float aspectXtoY, float nearZ, float farZ) +{ + // form tangents + float tanY = tanf(fovyRadians * 0.5f); + float tanX = tanY * aspectXtoY; + + // currently symmetric + // all postive values from center + float4 tangents = { -tanX, tanY, tanX, -tanY }; // l,t,r,b + + return perspective_rhcs(tangents, nearZ, farZ); +} + +float4x4 perspective_rhcs(float4 tangents, float nearZ, float farZ) +{ + tangents *= nearZ; + + float l = tangents.x; + float t = tangents.y; + float r = tangents.z; + float b = tangents.w; + + float dx = (r - l); + float dy = (t - b); + + float xs = 2.0f * nearZ / dx; + float ys = 2.0f * nearZ / dy; + + // 0.5x? + float xoff = (r + l) / dx; + float yoff = (t + b) / dy; + + // reverseZ, looking down -Z axis + float m22; + float m23; + + if (farZ == FLT_MAX) { + m22 = 0; + m23 = nearZ; + } + else { + float dz = farZ - nearZ; + m22 = nearZ / dz; + m23 = (nearZ * farZ) / dz; + + // Math looking down -Z axis + // ( z * nearZ + nearZ * farZ ) / dz + // nearZ * (farZ + z) / (farZ - nearZ) + // -nearZ -> nearZ * (farZ - nearZ ) / (farZ - nearZ) = nearZ + // when dividing by w = -(-nearZ) then get 1 + // -farZ -> nearZ * (farZ - farZ) / (farZ - nearZ) = 0 + } + + float4x4 m( + (float4){xs, 0, 0, 0}, + (float4){ 0, ys, 0, 0}, + (float4){ xoff, yoff, m22, -1}, + (float4){ 0, 0, m23, 0} + ); + + return m; +} + +float4x4 orthographic_rhcs(float4 rect, float nearZ, float farZ) +{ + // l,t,r,b + float l = rect.x; + float t = rect.y; + float r = rect.z; + float b = rect.w; + + float dx = (r-l); + float dy = (t-b); + + float m00 = 2.0f / dx; + float m11 = 2.0f / dy; + + float m03 = (r+l) / dx; + float m13 = (t+b) / dy; + + // reverzeZ looking down -z axis + float dz = (farZ - nearZ); + + float m22 = 1.0 / dz; + float m23 = farZ / dz; + + // Math looking down -Z axis + // -near -> (-nearZ + farZ) / dz = 1 + // -far -> (-farZ + farZ) / dz = 0 + + float4x4 m( + (float4){m00, 0, 0, 0}, + (float4){ 0, m11, 0, 0}, + (float4){ 0, 0, m22, 0}, + (float4){m03, m13, m23, 1} + ); + return m; +} + +} // SIMD_NAMESPACE +#endif // USE_SIMDLIB diff --git a/libkram/vectormath/float234.h b/libkram/vectormath/float234.h new file mode 100644 index 00000000..ec07bb64 --- /dev/null +++ b/libkram/vectormath/float234.h @@ -0,0 +1,1051 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +#if USE_SIMDLIB && SIMD_FLOAT + +// This is not yet standalone. vectormath234.h includes it. 
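+//
+// Rough usage sketch (assumes vectormath234.h has already set up the config macros and types):
+//   #include "vectormath234.h"
+//   using namespace SIMD_NAMESPACE;
+//   float4 v = normalize(float4m(1.0f, 2.0f, 3.0f)); // w defaults to 1
+//   float4x4 m = translation(float3m(1.0f, 0.0f, 0.0f));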
+ +#ifdef __cplusplus +extern "C" { +#endif + +// clang-format off + +// define c++ vector/matrix types +macroVector4TypesAligned(float, float) +macroVector4TypesPacked(float, float) + +// storage type for matrix +typedef struct { float2a columns[2]; } float2x2a; +typedef struct { float3a columns[3]; } float3x3a; +typedef struct { float4a columns[3]; } float3x4a; +typedef struct { float4a columns[4]; } float4x4a; + +// glue to Accelerate +#if SIMD_ACCELERATE_MATH_NAMES +macroVector4TypesStorageRenames(float, simd_float) +#endif // SIMD_ACCELERATE_MATH_NAMES + +// clang-format on + +#ifdef __cplusplus +} + +namespace SIMD_NAMESPACE { + +// clang-format off +macroVector4TypesStorageRenames(float, float) + // clang-format on + + //----------------------------------- + // start of implementation + + // zeroext - internal helper + SIMD_CALL float4 zeroext(float2 x) +{ + float4 v = 0; + v.xy = x; + return v; +} +SIMD_CALL float4 zeroext(float3 x) +{ + float4 v = 0; + v.xyz = x; + return v; +} + +#if SIMD_NEON + +// DONE: expose float2 ops on Neon. +// q = 4, nothing = 2 + +SIMD_CALL float reduce_min(float2 x) { return vminv_f32(x); } +SIMD_CALL float reduce_min(float4 x) { return vminvq_f32(x); } + +SIMD_CALL float reduce_max(float2 x) { return vmaxv_f32(x); } +SIMD_CALL float reduce_max(float4 x) { return vmaxvq_f32(x); } + +// precise returns x on Nan +SIMD_CALL float2 min(float2 x, float2 y) { return vminnm_f32(x, y); } +SIMD_CALL float4 min(float4 x, float4 y) { return vminnmq_f32(x, y); } + +// precise returns x on Nan +SIMD_CALL float2 max(float2 x, float2 y) { return vmaxnm_f32(x, y); } +SIMD_CALL float4 max(float4 x, float4 y) { return vmaxnmq_f32(x, y); } + +// requires __ARM_VFPV4__ +// t passed first unlike sse +SIMD_CALL float2 muladd(float2 x, float2 y, float2 t) { return vfma_f32(t, x, y); } +SIMD_CALL float4 muladd(float4 x, float4 y, float4 t) { return vfmaq_f32(t, x, y); } + +SIMD_CALL float2 sqrt(float2 x) { return vsqrt_f32(x); } +SIMD_CALL float4 sqrt(float4 x) { return vsqrtq_f32(x); } + +SIMD_CALL float2 reduce_addv(float2 x) +{ + x = vpadd_f32(x, x); + return x.x; // repeat x to all values +} +SIMD_CALL float4 reduce_addv(float4 x) +{ + // 4:1 reduction + x = vpaddq_f32(x, x); // xy = x+y,z+w + x = vpaddq_f32(x, x); // x = x+y + return x.x; // repeat x to all values +} +SIMD_CALL float3 reduce_addv(float3 x) +{ + return reduce_addv(zeroext(x)).x; // repeat +} + +// round to nearest | exc +SIMD_CALL float2 round(float2 vv) { return vrndn_f32(vv); } +SIMD_CALL float4 round(float4 vv) { return vrndnq_f32(vv); } + +SIMD_CALL float2 ceil(float2 vv) { return vrndp_f32(vv); } +SIMD_CALL float4 ceil(float4 vv) { return vrndpq_f32(vv); } + +SIMD_CALL float2 floor(float2 vv) { return vrndm_f32(vv); } +SIMD_CALL float4 floor(float4 vv) { return vrndmq_f32(vv); } + +#endif // SIMD_NEON + +#if SIMD_SSE + +// x64 doesn't seem to have a simd op for min/max reduce +SIMD_CALL float reduce_min(float4 x) +{ + return fmin(fmin(x.x, x.y), fmin(x.z, x.w)); +} +SIMD_CALL float reduce_min(float2 x) +{ + return reduce_min(vec2to4(x)); +} + +SIMD_CALL float reduce_max(float4 x) +{ + return fmax(fmax(x.x, x.y), fmax(x.z, x.w)); +} +SIMD_CALL float reduce_max(float2 x) +{ + return reduce_max(vec2to4(x)); +} + +// needs SIMD_INT +// needed for precise min/max calls below +#if SIMD_INT +SIMD_CALL float4 bitselect_forminmax(float4 x, float4 y, int4 mask) +{ + return (float4)(((int4)x & ~mask) | ((int4)y & mask)); +} +#endif + +SIMD_CALL float4 min(float4 x, float4 y) +{ + // precise returns x on Nan + 
return bitselect_forminmax(_mm_min_ps(x, y), x, y != y); +} +SIMD_CALL float2 min(float2 x, float2 y) +{ + return vec4to2(min(vec2to4(x), vec2to4(y))); +} + +SIMD_CALL float4 max(float4 x, float4 y) +{ + // precise returns x on Nan + return bitselect_forminmax(_mm_max_ps(x, y), x, y != y); +} +SIMD_CALL float2 max(float2 x, float2 y) +{ + return vec4to2(max(vec2to4(x), vec2to4(y))); +} + +SIMD_CALL float4 muladd(float4 x, float4 y, float4 t) +{ + // can't get Xcode to set -mfma with AVX2 set +#ifdef __FMA__ + return _mm_fmadd_ps(x, y, t); +#else + // fallback with not same characteristics + return x * y + t; +#endif +} +SIMD_CALL float2 muladd(float2 x, float2 y, float2 t) +{ + return vec4to2(muladd(vec2to4(x), vec2to4(y), vec2to4(t))); +} + +SIMD_CALL float4 sqrt(float4 x) +{ + return _mm_sqrt_ps(x); +} +SIMD_CALL float2 sqrt(float2 x) +{ + return vec4to2(sqrt(vec2to4(x))); +} + +SIMD_CALL float4 reduce_addv(float4 x) +{ + // 4:1 reduction + x = _mm_hadd_ps(x, x); // xy = x+y,z+w + x = _mm_hadd_ps(x, x); // x = x+y + return x.x; // repeat x to all values +} +SIMD_CALL float2 reduce_addv(float2 x) +{ + return reduce_addv(zeroext(x)).x; +} +SIMD_CALL float3 reduce_addv(float3 x) +{ + return reduce_addv(zeroext(x)).x; +} + +// SSE4.1 +SIMD_CALL float4 round(float4 vv) +{ + // round to nearest | exc + return _mm_round_ps(vv, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} +SIMD_CALL float2 round(float2 x) +{ + return vec4to2(round(vec2to4(x))); +} + +SIMD_CALL float4 ceil(float4 vv) +{ + return _mm_ceil_ps(vv); +} +SIMD_CALL float2 ceil(float2 x) +{ + return vec4to2(ceil(vec2to4(x))); +} + +SIMD_CALL float4 floor(float4 vv) +{ + return _mm_floor_ps(vv); +} +SIMD_CALL float2 floor(float2 x) +{ + return vec4to2(floor(vec2to4(x))); +} + +#endif // SIMD_INT && SIMD_SSE + +// end of implementation +//----------------------------------- +#if SIMD_INT + +// bitselect +// Hoping these casts float2 -> int2 don't truncate +// want this to map to _mm_cast calls +SIMD_CALL float2 bitselect(float2 x, float2 y, int2 mask) +{ + return (float2)bitselect((int2)x, (int2)y, mask); +} +SIMD_CALL float3 bitselect(float3 x, float3 y, int3 mask) +{ + return (float3)bitselect((int3)x, (int3)y, mask); +} +SIMD_CALL float4 bitselect(float4 x, float4 y, int4 mask) +{ + return (float4)bitselect((int4)x, (int4)y, mask); +} + +// select +SIMD_CALL float2 select(float2 x, float2 y, int2 mask) +{ + return bitselect(x, y, mask >> 31); +} +SIMD_CALL float3 select(float3 x, float3 y, int3 mask) +{ + return bitselect(x, y, mask >> 31); +} +SIMD_CALL float4 select(float4 x, float4 y, int4 mask) +{ + return bitselect(x, y, mask >> 31); +} + +#endif // SIMD_INT + +// TODO: consider casts instead of shuffles below, at least on inputs +// float3 same size as float4, can't use cast on reduce calls. 
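+// The float3 overloads below widen to float4 (vec3to4/vec4to3 helpers) and reuse the float4 ops above.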
+ +// float3 leftovers +SIMD_CALL float3 min(float3 x, float3 y) { return vec4to3(min(vec3to4(x), vec3to4(y))); } +SIMD_CALL float3 max(float3 x, float3 y) { return vec4to3(max(vec3to4(x), vec3to4(y))); } +SIMD_CALL float3 muladd(float3 x, float3 y, float3 t) { return vec4to3(muladd(vec3to4(x), vec3to4(y), vec3to4(t))); } +SIMD_CALL float reduce_min(float3 x) { return reduce_min(vec3to4(x)); } +SIMD_CALL float reduce_max(float3 x) { return reduce_max(vec3to4(x)); } +SIMD_CALL float3 round(float3 x) { return vec4to3(round(vec3to4(x))); } +SIMD_CALL float3 ceil(float3 x) { return vec4to3(ceil(vec3to4(x))); } +SIMD_CALL float3 floor(float3 x) { return vec4to3(floor(vec3to4(x))); } +SIMD_CALL float3 sqrt(float3 x) { return vec4to3(sqrt(vec3to4(x))); } + +// rsqrt +SIMD_CALL float4 rsqrt(float4 x) { return 1.0f / sqrt(x); } +SIMD_CALL float2 rsqrt(float2 x) { return 1.0f / sqrt(x); } +SIMD_CALL float3 rsqrt(float3 x) { return 1.0f / sqrt(x); } + +// recip +SIMD_CALL float4 recip(float4 x) { return 1.0f / x; } +SIMD_CALL float2 recip(float2 x) { return 1.0f / x; } +SIMD_CALL float3 recip(float3 x) { return 1.0f / x; } + +SIMD_CALL float reduce_add(float2 x) { return reduce_addv(x).x; } +SIMD_CALL float reduce_add(float3 x) { return reduce_addv(x).x; } +SIMD_CALL float reduce_add(float4 x) { return reduce_addv(x).x; } + +// clamp +// order matters here for Nan, left op returned on precise min/max +SIMD_CALL float2 clamp(float2 x, float2 minv, float2 maxv) +{ + return min(maxv, max(minv, x)); +} +SIMD_CALL float3 clamp(float3 x, float3 minv, float3 maxv) +{ + return min(maxv, max(minv, x)); +} +SIMD_CALL float4 clamp(float4 x, float4 minv, float4 maxv) +{ + return min(maxv, max(minv, x)); +} + +// saturate +SIMD_CALL float2 saturate(float2 x) { return clamp(x, 0, (float2)1); } +SIMD_CALL float3 saturate(float3 x) { return clamp(x, 0, (float3)1); } +SIMD_CALL float4 saturate(float4 x) { return clamp(x, 0, (float4)1); } + +// lerp - another easy one, could use muladd(t, y-x, x) +SIMD_CALL float2 lerp(float2 x, float2 y, float2 t) { return x + t * (y - x); } +SIMD_CALL float3 lerp(float3 x, float3 y, float3 t) { return x + t * (y - x); } +SIMD_CALL float4 lerp(float4 x, float4 y, float4 t) { return x + t * (y - x); } + +SIMD_CALL float2 lerp(float2 x, float2 y, float t) { return x + t * (y - x); } +SIMD_CALL float3 lerp(float3 x, float3 y, float t) { return x + t * (y - x); } +SIMD_CALL float4 lerp(float4 x, float4 y, float t) { return x + t * (y - x); } + +// dot +SIMD_CALL float dot(float2 x, float2 y) { return reduce_add(x * y); } +SIMD_CALL float dot(float3 x, float3 y) { return reduce_add(x * y); } +SIMD_CALL float dot(float4 x, float4 y) { return reduce_add(x * y); } + +// length_squared +SIMD_CALL float length_squared(float2 x) { return reduce_add(x * x); } +SIMD_CALL float length_squared(float3 x) { return reduce_add(x * x); } +SIMD_CALL float length_squared(float4 x) { return reduce_add(x * x); } + +// length +SIMD_CALL float length(float2 x) { return ::sqrt(reduce_add(x * x)); } +SIMD_CALL float length(float3 x) { return ::sqrt(reduce_add(x * x)); } +SIMD_CALL float length(float4 x) { return ::sqrt(reduce_add(x * x)); } + +// distance +SIMD_CALL float distance(float2 x, float2 y) { return length(x - y); } +SIMD_CALL float distance(float3 x, float3 y) { return length(x - y); } +SIMD_CALL float distance(float4 x, float4 y) { return length(x - y); } + +// normalize +// optimized by staying in reg +// x * invlength(x) +SIMD_CALL float4 normalize(float4 x) { return x / sqrt(reduce_addv(x * 
x)).x; } +SIMD_CALL float2 normalize(float2 x) { return x / sqrt(reduce_addv(x * x)).x; } +SIMD_CALL float3 normalize(float3 x) { return x / sqrt(reduce_addv(x * x)).x; } + +// abs +SIMD_CALL float2 abs(float2 x) +{ + return bitselect(0.0, x, 0x7fffffff); +} +SIMD_CALL float3 abs(float3 x) +{ + return bitselect(0.0, x, 0x7fffffff); +} +SIMD_CALL float4 abs(float4 x) +{ + return bitselect(0.0, x, 0x7fffffff); +} + +// cross +SIMD_CALL float cross(float2 x, float2 y) +{ + return x.x * y.y - x.y * y.x; +} +SIMD_CALL float3 cross(float3 x, float3 y) +{ + return x.yzx * y.zxy - x.zxy * y.yzx; +} + +// equal +// == and != return a int234 vector, so need these to match other vecs +SIMD_CALL bool equal(float2 x, float2 y) +{ + return all(x == y); +} +SIMD_CALL bool equal(float3 x, float3 y) +{ + return all(x == y); +} +SIMD_CALL bool equal(float4 x, float4 y) +{ + return all(x == y); +} + +// equal_abs +SIMD_CALL bool equal_abs(float2 x, float2 y, float tol) +{ + return all((abs(x - y) <= tol)); +} +SIMD_CALL bool equal_abs(float3 x, float3 y, float tol) +{ + return all((abs(x - y) <= tol)); +} +SIMD_CALL bool equal_abs(float4 x, float4 y, float tol) +{ + return all((abs(x - y) <= tol)); +} + +// equal_rel +SIMD_CALL bool equal_rel(float2 x, float2 y, float tol) +{ + return all((abs(x - y) <= tol * ::abs(x.x))); +} +SIMD_CALL bool equal_rel(float3 x, float3 y, float tol) +{ + return all((abs(x - y) <= tol * ::abs(x.x))); +} +SIMD_CALL bool equal_rel(float4 x, float4 y, float tol) +{ + return all((abs(x - y) <= tol * ::abs(x.x))); +} + +// step +SIMD_CALL float2 step(float2 edge, float2 x) +{ + return bitselect((float2)1, 0, x < edge); +} +SIMD_CALL float3 step(float3 edge, float3 x) +{ + return bitselect((float3)1, 0, x < edge); +} +SIMD_CALL float4 step(float4 edge, float4 x) +{ + return bitselect((float4)1, 0, x < edge); +} + +// smoothstep +SIMD_CALL float2 smoothstep(float2 edge0, float2 edge1, float2 x) +{ + float2 t = saturate((x - edge0) / (edge0 - edge1)); + return t * t * (3 - 2 * t); +} +SIMD_CALL float3 smoothstep(float3 edge0, float3 edge1, float3 x) +{ + float3 t = saturate((x - edge0) / (edge0 - edge1)); + return t * t * (3 - 2 * t); +} +SIMD_CALL float4 smoothstep(float4 edge0, float4 edge1, float4 x) +{ + float4 t = saturate((x - edge0) / (edge0 - edge1)); + return t * t * (3 - 2 * t); +} + +// fract +SIMD_CALL float2 fract(float2 x) +{ + return min(x - floor(x), 0x1.fffffep-1f); +} +SIMD_CALL float3 fract(float3 x) +{ + return min(x - floor(x), 0x1.fffffep-1f); +} +SIMD_CALL float4 fract(float4 x) +{ + return min(x - floor(x), 0x1.fffffep-1f); +} + +SIMD_CALL bool is_nan(float2 x) +{ + return any(x != x); +} +SIMD_CALL bool is_nan(float3 x) +{ + return any(x != x); +} +SIMD_CALL bool is_nan(float4 x) +{ + return any(x != x); +} + +SIMD_CALL float2 fix_nan(float2 x, float2 replace) +{ + return min(replace, x); +} +SIMD_CALL float3 fix_nan(float3 x, float3 replace) +{ + return min(replace, x); +} +SIMD_CALL float4 fix_nan(float4 x, float4 replace) +{ + return min(replace, x); +} + +/* this is just to show examples of extended vector types, float8 should move out + +#if SIMD_FLOAT_EXT + +// These are cpu only math. None of the gpus support these long types. +// and MSL doesn't even support double. + // need to convert float4 to 8/16 + float8 float8m(float4 x, float4 y) { + } + float16 float16m(float8 x, float8 y) { + } + +// how important are 8/16 ops for float and 8 for double? Could reduce with only doing up to 4. +// can see doing more ops on smaller types. 
Slower when these have to route through simd4. + + +SIMD_CALL float8 clamp(float8 x, float8 min, float8 max) { + return min(max(x, min), max); +} +SIMD_CALL float reduce_min(float8 x) { + return reduce_min(min(x.lo, x.hi)); +} +SIMD_CALL float reduce_max(float8 x) { + return reduce_max(max(x.lo, x.hi)); +} +SIMD_CALL float8 muladd(float8 x, float8 y, float8 t) { + return float8m(muladd(x.lo, y.lo, t.lo), muladd(x.hi, y.hi, t.hi)); +} +SIMD_CALL float8 lerp(float8 x, float8 y, float8 t) { + return x + t*(y - x); +} +SIMD_CALL float reduce_add(float8 x) { + return reduce_add(x.lo + x.hi); +} +SIMD_CALL float normalize(float8 x) { + return x / length(x); +} + +// float16 calling up to float8 +SIMD_CALL float16 clamp(float16 x, float16 min, float16 max) { + return min(max(x, min), max); +} +SIMD_CALL float reduce_min(float16 x) { + return fmin(reduce_min(x.lo), reduce_min(x.hi)); +} +SIMD_CALL float reduce_max(float16 x) { + return fmax(reduce_max(x.lo), reduce_max(x.hi)); +} +SIMD_CALL float16 muladd(float16 x, float16 y, float16 t) { + return float16m(muladd(x.lo, y.lo, t.lo), muladd(x.hi, y.hi, t.hi)); +} +SIMD_CALL float16 lerp(float16 x, float16 y, float16 t) { + return x + t*(y - x); +} +SIMD_CALL float reduce_add(float16 x) { + return reduce_add(x.lo + x.hi); +} +SIMD_CALL float normalize(float16 x) { + return x / length(x); +} + +#endif // SIMD_FLOAT_EXT +*/ + +// make "m" ctors for vecs. This avoids wrapping the type in a struct. +// vector types are C typedef, and so cannot have member functions. +// Be careful with initializers = { val }, only sets first element of vector +// and not all the values. Use = val; or one of the calls below to be safe. + +SIMD_CALL float2 float2m(float x) +{ + return x; +} +SIMD_CALL float2 float2m(float x, float y) +{ + return {x, y}; +} + +SIMD_CALL float3 float3m(float x) +{ + return x; +} +SIMD_CALL float3 float3m(float x, float y, float z) +{ + return {x, y, z}; +} +SIMD_CALL float3 float3m(float2 v, float z) +{ + float3 r; + r.xy = v; + r.z = z; + return r; +} + +SIMD_CALL float4 float4m(float x) +{ + return x; +} +SIMD_CALL float4 float4m(float2 xy, float2 zw) +{ + float4 r; + r.xy = xy; + r.zw = zw; + return r; +} +SIMD_CALL float4 float4m(float x, float y, float z, float w = 1.0f) +{ + return {x, y, z, w}; +} +SIMD_CALL float4 float4m(float3 v, float w = 1.0f) +{ + float4 r; + r.xyz = v; + r.w = w; + return r; +} + +// fast conversions where possible +// need non-const too +SIMD_CALL const float3& as_float3(const float4& m) +{ + return reinterpret_cast<const float3&>(m); +} +SIMD_CALL const float3* as_float3(const float4* m) +{ + return reinterpret_cast<const float3*>(m); +} + +// this one is dangerous, since w is undefined +//SIMD_CALL const float4& as_float4(const float3& m) { +// return reinterpret_cast<const float4&>(m); +//} + +// clang-format off + +// power series +macroVectorRepeatFnDecl(float, log) +macroVectorRepeatFnDecl(float, exp) + +// trig +// TODO: more accurate cospi, sinpi, ...
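+// (assumption: each of these Decl macros declares the float2/3/4 overloads that apply the named scalar call per element)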
+macroVectorRepeatFnDecl(float, cos) +macroVectorRepeatFnDecl(float, sin) +macroVectorRepeatFnDecl(float, tan) + +macroVectorRepeatFnDecl(float, acos) +macroVectorRepeatFnDecl(float, asin) +macroVectorRepeatFnDecl(float, atan) + +macroVectorRepeatFn2Decl(float, atan2) + + // clang-format on + + // sincos requires accel 5 lib, and takes 2 ptrs + // may need math fallback for some calls + // macroVectorRepeatFn2Decl(float, sincos) + + // pow + // can xy be <= 0 ?, no will return Nan in log/exp approx + SIMD_CALL float2 pow(float2 x, float2 y) +{ + return exp(log(x) * y); +} +SIMD_CALL float3 pow(float3 x, float3 y) +{ + return exp(log(x) * y); +} +SIMD_CALL float4 pow(float4 x, float4 y) +{ + return exp(log(x) * y); +} + +// TODO: add more math ops + +//----------------------------------- +// constants + +// TODO: better way to name these, can there be float2::zero() +// also could maybe use that for fake vector ctors. + +const float2& float2_zero(); +const float2& float2_ones(); + +const float2& float2_posx(); +const float2& float2_posy(); +const float2& float2_negx(); +const float2& float2_negy(); + +//---- + +const float3& float3_zero(); +const float3& float3_ones(); + +const float3& float3_posx(); +const float3& float3_posy(); +const float3& float3_posz(); +const float3& float3_negx(); +const float3& float3_negy(); +const float3& float3_negz(); + +//---- + +const float4& float4_zero(); +const float4& float4_ones(); + +const float4& float4_posx(); +const float4& float4_posy(); +const float4& float4_posz(); +const float4& float4_posw(); +const float4& float4_negx(); +const float4& float4_negy(); +const float4& float4_negz(); +const float4& float4_negw(); + +const float4& float4_posxw(); +const float4& float4_posyw(); +const float4& float4_poszw(); +const float4& float4_negxw(); +const float4& float4_negyw(); +const float4& float4_negzw(); + +//----------------------------------- +// matrix + +// column matrix, so postmul vectors +// (projToCamera * cameraToWorld * worldToModel) * modelVec + +struct float2x2 : float2x2a { + // can be split out to traits + static constexpr int col = 2; + static constexpr int row = 2; + using column_t = float2; + using scalar_t = float; + using base = float2x2a; + + static const float2x2& zero(); + static const float2x2& identity(); + + float2x2() {} // default uninit + explicit float2x2(float2 diag); + float2x2(float2 c0, float2 c1) + : base((base){c0, c1}) {} + float2x2(const base& m) + : base(m) {} + + // simd lacks these ops + float2& operator[](int idx) { return columns[idx]; } + const float2& operator[](int idx) const { return columns[idx]; } +}; + +struct float3x3 : float3x3a { + static constexpr int col = 3; + static constexpr int row = 3; + using column_t = float3; + using scalar_t = float; + using base = float3x3a; + + // Done as wordy c funcs otherwize. Funcs allow statics to init. + static const float3x3& zero(); + static const float3x3& identity(); + + float3x3() {} // default uninit + explicit float3x3(float3 diag); + float3x3(float3 c0, float3 c1, float3 c2) + : base((base){c0, c1, c2}) {} + float3x3(const base& m) + : base(m) {} + + float3& operator[](int idx) { return columns[idx]; } + const float3& operator[](int idx) const { return columns[idx]; } +}; + +// This is mostly a transposed holder for a 4x4, so very few ops defined +// Can also serve as a SOA for some types of cpu math. 
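+// (convert to/from a column-major 4x4 with float4x4m()/float3x4m(), declared after these structs)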
+struct float3x4 : float3x4a { + static constexpr int col = 3; + static constexpr int row = 4; + using column_t = float4; + using scalar_t = float; + using base = float3x4a; + + static const float3x4& zero(); + static const float3x4& identity(); + + float3x4() {} // default uninit + explicit float3x4(float3 diag); + float3x4(float4 c0, float4 c1, float4 c2) + : base((base){c0, c1, c2}) {} + float3x4(const float3x4a& m) + : base(m) {} + + float4& operator[](int idx) { return columns[idx]; } + const float4& operator[](int idx) const { return columns[idx]; } +}; + +struct float4x4 : float4x4a { + static constexpr int col = 4; + static constexpr int row = 4; + using column_t = float4; + using scalar_t = float; + using base = float4x4a; + + static const float4x4& zero(); + static const float4x4& identity(); + + float4x4() {} // default uninit + explicit float4x4(float4 diag); + float4x4(float4 c0, float4 c1, float4 c2, float4 c3) + : base((base){c0, c1, c2, c3}) {} + float4x4(const base& m) + : base(m) {} + + float4& operator[](int idx) { return columns[idx]; } + const float4& operator[](int idx) const { return columns[idx]; } +}; + +// transposes to convert between matrix type +float4x4 float4x4m(const float3x4& m); +float3x4 float3x4m(const float4x4& m); + +// set diagonal to vec and rest to 0 +float2x2 diagonal_matrix(float2 x); +float3x3 diagonal_matrix(float3 x); +float3x4 diagonal_matrix3x4(float3 x); +float4x4 diagonal_matrix(float4 x); + +// transpose +float2x2 transpose(const float2x2& x); +float3x3 transpose(const float3x3& x); +float4x4 transpose(const float4x4& x); + +// general inverses - faster ones for trs +float2x2 inverse(const float2x2& x); +float3x3 inverse(const float3x3& x); +float4x4 inverse(const float4x4& x); + +// determinant +float determinant(const float2x2& x); +float determinant(const float3x3& x); +float determinant(const float4x4& x); + +// diagonal sum +float trace(const float2x2& x); +float trace(const float3x3& x); +float trace(const float4x4& x); + +// m * m +float2x2 mul(const float2x2& x, const float2x2& y); +float3x3 mul(const float3x3& x, const float3x3& y); +float4x4 mul(const float4x4& x, const float4x4& y); + +// vrow * m - premul = dot + premul +float2 mul(float2 y, const float2x2& x); +float3 mul(float3 y, const float3x3& x); +float4 mul(float4 y, const float4x4& x); + +// m * vcol - postmul = mul + mad (prefer this) +float2 mul(const float2x2& x, float2 y); +float3 mul(const float3x3& x, float3 y); +float4 mul(const float4x4& x, float4 y); + +// sub +float2x2 sub(const float2x2& x, const float2x2& y); +float3x3 sub(const float3x3& x, const float3x3& y); +float4x4 sub(const float4x4& x, const float4x4& y); + +// add +float2x2 add(const float2x2& x, const float2x2& y); +float3x3 add(const float3x3& x, const float3x3& y); +float4x4 add(const float4x4& x, const float4x4& y); + +// equal +bool equal(const float2x2& x, const float2x2& y); +bool equal(const float3x3& x, const float3x3& y); +bool equal(const float4x4& x, const float4x4& y); + +// equal_abs +bool equal_abs(const float2x2& x, const float2x2& y, float tol); +bool equal_abs(const float3x3& x, const float3x3& y, float tol); +bool equal_abs(const float4x4& x, const float4x4& y, float tol); + +// equal_rel +bool equal_rel(const float2x2& x, const float2x2& y, float tol); +bool equal_rel(const float3x3& x, const float3x3& y, float tol); +bool equal_rel(const float4x4& x, const float4x4& y, float tol); + +// TODO: these think they are all member functions + +// clang-format off + +// operators for 
C++ +macroMatrixOps(float2x2); +macroMatrixOps(float3x3); +// TODO: no mat ops on storage type float3x4 +// macroMatrixOps(float3x4s); +macroMatrixOps(float4x4); + +// clang-format on + +// fast conversions where possible +SIMD_CALL const float3x3& as_float3x3(const float4x4& m) +{ + return reinterpret_cast<const float3x3&>(m); +} + +//----------------------- +// quat + +// Only need a fp32 quat. double/half are pretty worthless. +struct quatf { + // TODO: should all ctor be SIMD_CALL ? + quatf() : v{0.0f, 0.0f, 0.0f, 1.0f} {} + quatf(float x, float y, float z, float w) : v{x, y, z, w} {} + quatf(float3 vv, float angle); + explicit quatf(float4 vv) : v(vv) {} + + static const quatf& zero(); + static const quatf& identity(); + + // imag = axis * sin(theta/2), real = cos(theta/2) + const float3& imag() const { return as_float3(v); } + float real() const { return v.w; } + + float4 v; +}; + +// this is conjugate, so only axis is inverted +SIMD_CALL quatf operator-(quatf q) +{ + float4 qv = q.v; + qv.xyz = -qv.xyz; + return quatf(qv); +} + +SIMD_CALL float3 operator*(quatf q, float3 v) +{ + // see https://fgiesen.wordpress.com/2019/02/09/rotating-a-single-vector-using-a-quaternion/ + //float4 qv = q.v; + //float3 t = 2.0f * cross(qv.xyz, v); + //return v + qv.w * t + cross(qv.xyz, t); + + // simplified form of above + float4 qv = q.v; + return v + 2.0f * cross(qv.xyz, cross(qv.xyz, v) + qv.w * v); +} + +SIMD_CALL bool equal(quatf x, quatf y) +{ + return all(x.v == y.v); +} +SIMD_CALL bool operator==(quatf x, quatf y) +{ + return all(x.v == y.v); +} + +float4x4 float4x4m(quatf q); + +// how many quatf ops are needed? +// TODO: need matrix into quatf +// TODO: need shortest arc correction (dot(q0.v, q1.v) < 0) negate +// TODO: need negate (or conjugate?) +// TODO: what about math ops + +SIMD_CALL quatf lerp(quatf q0, quatf q1, float t) +{ + if (dot(q0.v, q1.v) < 0.0f) + q1 = -q1; // conjugate + + float4 v = lerp(q0.v, q1.v, t); + return quatf(v); +} +quatf slerp(quatf q0, quatf q1, float t); + +void quat_bezier_cp(quatf q0, quatf q1, quatf q2, quatf q3, + quatf& a1, quatf& b2); +quatf quat_bezer_lerp(quatf a, quatf b, quatf c, quatf d, float t); +quatf quat_bezer_slerp(quatf a, quatf b, quatf c, quatf d, float t); + +quatf inverse(quatf q); + +SIMD_CALL quatf normalize(quatf q) +{ + return quatf(normalize(q.v)); +} + +//---------------- +// affine and convenience ctors + +// in-place affine transpose +void transpose_affine(float4x4& m); + +// fast inverses for translate, rotate, scale +float4x4 inverse_tr(const float4x4& mtx); +float4x4 inverse_tru(const float4x4& mtx); +float4x4 inverse_trs(const float4x4& mtx); + +float4x4 float4x4m(char axis, float radians); + +SIMD_CALL float4x4 float4x4m(float3 axis, float radians) +{ + return float4x4m(quatf(axis, radians)); +} + +// These sizes are positive and do not include inversion +SIMD_CALL float decompose_size(const float4x4& m) +{ + // assumes m[0].w is 0 + return length(m[0]); +} +SIMD_CALL float3 decompose_scale(const float4x4& m) +{ + // assumes m[i].w is 0 + return sqrt(float3m(length_squared(m[0]), + length_squared(m[1]), + length_squared(m[2]))); +} +SIMD_CALL float decompose_scale_max(const float4x4& m) +{ + return reduce_max(decompose_scale(m)); +} + +float3x3 float3x3m(quatf qq); + +// m in here?
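+// Sketch of the TRS helpers below (given some float3 t, quatf r, float3 s; assumes column-major, post-multiplied transforms):
+//   float4x4 M  = float4x4m_trs(t, r, s); // translation * rotation * scale
+//   float4x4 Mi = float4x4m_inverse_trs(t, r, s); // built from the same data, avoids a general inverse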
+float4x4 float4x4m_tr(float3 t, quatf r); +float4x4 float4x4m_tru(float3 t, quatf r, float scale); +float4x4 float4x4m_trs(float3 t, quatf r, float3 scale); + +// can build inverse from same data +float4x4 float4x4m_trs(float3 t, quatf r, float3 scale); +float4x4 float4x4m_inverse_trs(float3 t, quatf r, float3 scale); + +float4x4 perspective_rhcs(float fovyRadians, float aspectXtoY, float nearZ, float farZ = FLT_MAX); +float4x4 perspective_rhcs(float4 tangents, float nearZ, float farZ = FLT_MAX); +float4x4 orthographic_rhcs(float4 rect, float nearZ, float farZ); + +SIMD_CALL float4x4 rotation(float3 axis, float radians) +{ + quatf q(axis, radians); + return float4x4m(q); +} +SIMD_CALL float4x4 scale(float3 scale) +{ + return float4x4(float4m(scale, 1.0f)); +} +SIMD_CALL float4x4 translation(float3 t) +{ + float4x4 m(float4x4::identity()); + m[3] = float4m(t, 1); + return m; +} + +} //namespace SIMD_NAMESPACE + +#endif // __cplusplus + +#endif // USE_SIMDLIB && SIMD_FLOAT diff --git a/libkram/vectormath/float4a.cpp b/libkram/vectormath/float4a.cpp new file mode 100644 index 00000000..e89155dd --- /dev/null +++ b/libkram/vectormath/float4a.cpp @@ -0,0 +1,72 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#if !USE_SIMDLIB + +// https://patchwork.ozlabs.org/project/gcc/patch/559BC75A.1080606@arm.com/ +// https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gcc/Half-Precision.html +// https://developer.arm.com/documentation/dui0491/i/Using-NEON-Support/Converting-vectors + +#include "float4a.h" + +// Bury these for now. They required -mf16c for Intel to be +// defined, and that's kind of a pain right now. +namespace SIMD_NAMESPACE { + +#if SIMD_SSE + +// using casts instead of vv.reg, so these calls work with Apple SIMD too + +float4 float4m(half4 vv) +{ + // https://patchwork.ozlabs.org/project/gcc/patch/559BC75A.1080606@arm.com/ + // https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gcc/Half-Precision.html + // https://developer.arm.com/documentation/dui0491/i/Using-NEON-Support/Converting-vectors + __m128i reg16 = _mm_setzero_si128(); + + // TODO: switch to load low 64-bits, but don't know which one _mm_cvtsi32_si128(&vv.reg); ? + // want 0 extend here, sse overuses int32_t when really unsigned and zero extended value + reg16 = _mm_insert_epi16(reg16, vv[0], 0); + reg16 = _mm_insert_epi16(reg16, vv[1], 1); + reg16 = _mm_insert_epi16(reg16, vv[2], 2); + reg16 = _mm_insert_epi16(reg16, vv[3], 3); + + return simd::float4(_mm_cvtph_ps(reg16)); +} + +half4 half4m(float4 vv) +{ + __m128i reg16 = _mm_cvtps_ph(*(const __m128*)&vv, 0); // 4xfp32-> 4xfp16, round to nearest-even + + // TODO: switch to store/steam, but don't know which one _mm_storeu_epi16 ? 
+    half4 val; // = 0;
+
+    // 0 extended
+    val[0] = (half)_mm_extract_epi16(reg16, 0);
+    val[1] = (half)_mm_extract_epi16(reg16, 1);
+    val[2] = (half)_mm_extract_epi16(reg16, 2);
+    val[3] = (half)_mm_extract_epi16(reg16, 3);
+    return val;
+}
+
+#endif
+
+#if SIMD_NEON
+
+// using casts instead of vv.reg, so these calls work with Apple SIMD too
+// Note: could just use the sse2 neon version
+
+float4 float4m(half4 vv)
+{
+    return float4(vcvt_f32_f16(*(const float16x4_t*)&vv));
+}
+half4 half4m(float4 vv)
+{
+    return half4(vcvt_f16_f32(*(const float32x4_t*)&vv));
+}
+#endif
+
+} //namespace SIMD_NAMESPACE
+
+#endif
diff --git a/libkram/vectormath/float4a.h b/libkram/vectormath/float4a.h
new file mode 100644
index 00000000..946bb926
--- /dev/null
+++ b/libkram/vectormath/float4a.h
@@ -0,0 +1,110 @@
+// kram - Copyright 2020-2025 by Alec Miller. - MIT License
+// The license and copyright notice shall be included
+// in all copies or substantial portions of the Software.
+
+#pragma once
+
+#if !USE_SIMDLIB
+
+// This is Apple simd (it's huuuggge!)
+// Also can't use the half4 type until iOS18 + macOS15 minspec, so need fallback.
+#include <simd/simd.h>
+
+// only support avx2 and Neon, no avx-512 at first
+#if defined __ARM_NEON
+#define SIMD_SSE 0
+#define SIMD_NEON 1
+#elif defined __AVX2__ // x64 AVX2 or higher, can lower to AVX
+#define SIMD_SSE 1
+#define SIMD_NEON 0
+#else
+#warning unsupported simd arch
+#endif
+
+#define SIMD_NAMESPACE simd
+
+#if !__is_identifier(_Float16)
+#define SIMD_HALF_FLOAT16 1
+#else
+#define SIMD_HALF_FLOAT16 0
+#endif
+
+namespace SIMD_NAMESPACE {
+
+// functional ctor
+inline float4 float4m(float3 v, float w)
+{
+    return vector4(v, w);
+}
+
+inline float2 float2m(float x, float y)
+{
+    return {x, y};
+}
+inline float3 float3m(float x, float y, float z)
+{
+    return {x, y, z};
+}
+inline float4 float4m(float x, float y, float z, float w)
+{
+    return {x, y, z, w};
+}
+
+inline float2 float2m(float x)
+{
+    return x;
+}
+
+inline float3 float3m(float x)
+{
+    return x;
+}
+
+inline float4 float4m(float x)
+{
+    return x;
+}
+
+//inline float saturate(float v) {
+//    return std::clamp(v, 0.0f, 1.0f);
+//}
+//inline double saturate(double v) {
+//    return std::clamp(v, 0.0, 1.0);
+//}
+inline float2 saturate(float2 v)
+{
+    return simd_clamp(v, 0.0f, 1.0f);
+}
+inline float3 saturate(float3 v)
+{
+    return simd_clamp(v, 0.0f, 1.0f);
+}
+inline float4 saturate(float4 v)
+{
+    return simd_clamp(v, 0.0f, 1.0f);
+}
+
+#if SIMD_HALF_FLOAT16
+using half = _Float16;
+#else
+// for lack of a better type
+using half = int16_t;
+#endif
+
+#define vec2to4(x) (x).xyyy
+#define vec3to4(x) (x).xyzz
+#define vec4to2(x) (x).xy
+#define vec4to3(x) (x).xyz
+
+// define half ops just for conversion
+half4 half4m(float4 __x);
+inline half2 half2m(float2 __x) { return vec4to2(half4m(vec2to4(__x))); }
+inline half3 half3m(float3 __x) { return vec4to3(half4m(vec3to4(__x))); }
+
+float4 float4m(half4 __x);
+inline float2 float2m(half2 __x) { return vec4to2(float4m(vec2to4(__x))); }
+inline float3 float3m(half3 __x) { return vec4to3(float4m(vec3to4(__x))); }
+
+} // namespace SIMD_NAMESPACE
+
+#endif
diff --git a/libkram/vectormath/half234.h b/libkram/vectormath/half234.h
new file mode 100644
index 00000000..c8ccdeea
--- /dev/null
+++ b/libkram/vectormath/half234.h
@@ -0,0 +1,126 @@
+// kram - Copyright 2020-2025 by Alec Miller. - MIT License
+// The license and copyright notice shall be included
+// in all copies or substantial portions of the Software.
+
+#pragma once
+
+// This is not yet standalone.
vectormath234.h includes it. +#if USE_SIMDLIB && SIMD_HALF + +// Android doesn't really have _Float16, so would need a u/int16_t mapped placeholder +// The not identifier means its a system type. +#if !__is_identifier(_Float16) +#define SIMD_HALF_FLOAT16 1 +#else +#define SIMD_HALF_FLOAT16 0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#if SIMD_HALF_FLOAT16 +typedef _Float16 half; +#else +// This won't work with the operators. Consider _fp16 storage type which does all math in fp32. +// But even Android doesn't support that with +fp16. +// TODO: use half struct here that can do math slowly (prob in fp32x4, then convert back) +typedef short half; +#endif // SIMD_HALF_FLOAT16 + +// This means math and conversions don't work, so have to use simd ops +#define SIMD_HALF4_ONLY !SIMD_HALF_FLOAT16 + +// clang-format off + +// Half isn't something that should have math ops yet. Just useful as packed type. +// This does math, but really needs _Float16 to work properly for the operators. +// That's not available on Android devices like it should be, but the Neon +// fp16x4 <-> fp32x4 conversions are. + +// define c vector types +macroVector2TypesAligned(half, half) +macroVector2TypesPacked(half, half) + +// No matrix type defined right now. + +// glue to Accelerate +#if SIMD_ACCELERATE_MATH_NAMES +macroVector8TypesStorageRenames(half, simd_half) +#endif // SIMD_ACCELERATE_MATH_NAMES + +#ifdef __cplusplus +} + +namespace SIMD_NAMESPACE { + +macroVector2TypesStorageRenames(half, half) + + // clang-format on + + SIMD_CALL half2 half2m(half x) +{ + return x; +} +SIMD_CALL half2 half2m(half x, half y) +{ + return {x, y}; +} + +SIMD_CALL half3 half3m(half x) +{ + return x; +} +SIMD_CALL half3 half3m(half x, half y, half z) +{ + return {x, y, z}; +} +SIMD_CALL half3 half3m(half2 v, half z) +{ + half3 r; + r.xy = v; + r.z = z; + return r; +} + +SIMD_CALL half4 half4m(half x) +{ + return x; +} +SIMD_CALL half4 half4m(half2 xy, half2 zw) +{ + half4 r; + r.xy = xy; + r.zw = zw; + return r; +} +SIMD_CALL half4 half4m(half x, half y, half z, half w = (half)1.0) +{ + return {x, y, z, w}; +} +SIMD_CALL half4 half4m(half3 v, float w = (half)1.0) +{ + half4 r; + r.xyz = v; + r.w = w; + return r; +} + +SIMD_CALL half4 zeroext(half2 x) +{ + half4 v; + v.xy = x; + v.zw = 0; + return v; +} +SIMD_CALL half4 zeroext(half3 x) +{ + half4 v; + v.xyz = x; + v.w = 0; + return v; +} + +} //namespace SIMD_NAMESPACE +#endif // __cplusplus +#endif // USE_SIMDLIB && SIMD_HALF diff --git a/libkram/vectormath/int234.h b/libkram/vectormath/int234.h new file mode 100644 index 00000000..767f3ec3 --- /dev/null +++ b/libkram/vectormath/int234.h @@ -0,0 +1,203 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +// This is not yet standalone. vectormath234.h includes it. 
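+
+// Note (illustrative): the int vectors below also serve as comparison masks.
+// A lane-wise compare like (a < b) yields an int vector whose true lanes have
+// the sign bit set (typically all bits), which is why any()/all() below test
+// only the sign bit, e.g.
+//   int4 mask = (a < b);            // lane mask produced by a compare
+//   int4 r = bitselect(x, y, mask); // take y where the mask is set, else x
+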
+#if USE_SIMDLIB && SIMD_INT + +// clang-format off + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// define c vector types +macroVector4TypesAligned(int, int) +macroVector4TypesPacked(int, int) + +#if SIMD_ACCELERATE_MATH_NAMES +macroVector4TypesStorageRenames(int, simd_int) +#endif // SIMD_ACCELERATE_MATH_NAMES + +#ifdef __cplusplus +} + +namespace SIMD_NAMESPACE { + +macroVector4TypesStorageRenames(int, int) + + // clang-format on + + SIMD_CALL int4 zeroext(int2 x) +{ + int4 v = 0; + v.xy = x; + return v; +} +SIMD_CALL int4 zeroext(int3 x) +{ + int4 v = 0; + v.xyz = x; + return v; +} + +//----------------------------------- +// imlementation - only code simd arch specific + +#if SIMD_NEON + +SIMD_CALL bool any(int2 x) +{ + return vmaxv_u32(x) & 0x80000000; +} +SIMD_CALL bool any(int4 x) +{ + return vmaxvq_u32(x) & 0x80000000; +} + +SIMD_CALL bool all(int2 x) +{ + return vminv_u32(x) & 0x80000000; +} +SIMD_CALL bool all(int4 x) +{ + return vminvq_u32(x) & 0x80000000; +} + +SIMD_CALL int reduce_add(int2 x) +{ + x = vpadd_s32(x, x); + return x.x; // repeat x to all values +} +SIMD_CALL int reduce_add(int4 x) +{ + // 4:1 reduction + x = vpaddq_s32(x, x); // xy = x+y,z+w + x = vpaddq_s32(x, x); // x = x+y + return x.x; // repeat x to all values +} +SIMD_CALL int reduce_add(int3 x) +{ + return reduce_add(zeroext(x)); +} + +#endif // SIMD_NEON + +// These take in int types, this is what comparison gens from a < b, etc. +#if SIMD_SSE + +SIMD_CALL bool any(int2 x) +{ + return _mm_movemask_ps(vec2to4(x)) & 0x3; +} +SIMD_CALL bool any(int4 x) +{ + return _mm_movemask_ps((__m128)x); +} + +SIMD_CALL bool all(int2 x) +{ + return (_mm_movemask_ps(vec2to4(x)) & 0x3) == 0x3; // 2 bits +} +SIMD_CALL bool all(int4 x) +{ + return _mm_movemask_ps((__m128)x) == 0xf; // 4 bits +} + +// TODO: need SSE ops for this, +SIMD_CALL int reduce_add(int4 x) +{ + int2 r = x.lo + x.hi; + return r.x + r.y; +} +SIMD_CALL int reduce_add(int2 x) +{ + return x.x + x.y; +} +SIMD_CALL int reduce_add(int3 x) +{ + return x.x + x.y + x.z; +} + +#endif // SIMD_SSE + +// any-all +SIMD_CALL bool any(int3 x) +{ + return any(vec3to4(x)); +} +SIMD_CALL bool all(int3 x) +{ + return all(vec3to4(x)); +} + +// end of implementation +//----------------------------------- + +// bitselect +SIMD_CALL int2 bitselect(int2 x, int2 y, int2 mask) +{ + return (x & ~mask) | (y & mask); +} +SIMD_CALL int3 bitselect(int3 x, int3 y, int3 mask) +{ + return (x & ~mask) | (y & mask); +} +SIMD_CALL int4 bitselect(int4 x, int4 y, int4 mask) +{ + return (x & ~mask) | (y & mask); +} + +SIMD_CALL int2 int2m(int x) +{ + return x; +} +SIMD_CALL int2 int2m(int x, int y) +{ + return {x, y}; +} + +SIMD_CALL int3 int3m(int x) +{ + return x; +} +SIMD_CALL int3 int3m(int x, int y, int z) +{ + return {x, y, z}; +} +SIMD_CALL int3 int3m(int2 v, int z) +{ + int3 r; + r.xy = v; + r.z = z; + return r; +} + +SIMD_CALL int4 int4m(int x) +{ + return x; +} +SIMD_CALL int4 int4m(int2 xy, int2 zw) +{ + int4 r; + r.xy = xy; + r.zw = zw; + return r; +} +SIMD_CALL int4 int4m(int x, int y, int z, int w) +{ + return {x, y, z, w}; +} +SIMD_CALL int4 int4m(int3 v, int w) +{ + int4 r; + r.xyz = v; + r.w = w; + return r; +} + +} //namespace SIMD_NAMESPACE +#endif // __cplusplus +#endif // USE_SIMDLIB && SIMD_INT diff --git a/libkram/vectormath/long234.h b/libkram/vectormath/long234.h new file mode 100644 index 00000000..77d82712 --- /dev/null +++ b/libkram/vectormath/long234.h @@ -0,0 +1,171 @@ +// kram - Copyright 2020-2025 by Alec Miller. 
- MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +// This is not yet standalone. vectormath234.h includes it. +#if USE_SIMDLIB && SIMD_LONG + +#ifdef _WIN32 +// Windows breaks portable code. +typedef long long long1; +#else +typedef long long1; +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// clang-format off + +// define c vector types +// Apple uses long type here (32-bit) instead of long32_t +macroVector8TypesAligned(long1, long) +macroVector8TypesPacked(long1, long) + +#if SIMD_ACCELERATE_MATH_NAMES +macroVector8TypesStorageRenames(long, simd_long) +#endif // SIMD_ACCELERATE_MATH_NAMES + +#ifdef __cplusplus +} + +namespace SIMD_NAMESPACE { + +macroVector8TypesStorageRenames(long, long) + +// clang-format on + +//----------------------------------- +// imlementation - only code simd arch specific + +#if SIMD_NEON + + SIMD_CALL bool any(long2 x) +{ + return (x.x | x.y) & 0x8000000000000000U; +} +SIMD_CALL bool any(long3 x) +{ + return (x.x | x.y | x.z) & 0x8000000000000000U; +} +SIMD_CALL bool any(long4 x) +{ + return any(x.lo | x.hi); +} + +SIMD_CALL bool all(long2 x) +{ + return (x.x & x.y) & 0x8000000000000000U; +} +SIMD_CALL bool all(long3 x) +{ + return (x.x & x.y & x.z) & 0x8000000000000000U; +} +SIMD_CALL bool all(long4 x) +{ + return all(x.lo & x.hi); +} + +#endif // SIMD_NEON + +// These take in long types, this is what comparison gens from a < b, etc. +#if SIMD_SSE + +SIMD_CALL bool any(long2 x) +{ + return _mm_movemask_pd(x) & 0x3; // 2 bits +} +SIMD_CALL bool any(long3 x) +{ + return (x.x | x.y) & 0x8000000000000000U; +} +SIMD_CALL bool any(long4 x) +{ + return any(x.lo | x.hi); +} + +SIMD_CALL bool all(long2 x) +{ + return (_mm_movemask_pd(x) & 0x3) == 0x3; // 2 bits +} +SIMD_CALL bool all(long3 x) +{ + return (x.x & x.y & x.z) & 0x8000000000000000U; +} +SIMD_CALL bool all(long4 x) +{ + return any(x.lo & x.hi); +} +#endif // SIMD_SSE + +// end of implementation +//----------------------------------- + +// bitselect +SIMD_CALL long2 bitselect(long2 x, long2 y, long2 mask) +{ + return (x & ~mask) | (y & mask); +} +SIMD_CALL long3 bitselect(long3 x, long3 y, long3 mask) +{ + return (x & ~mask) | (y & mask); +} +SIMD_CALL long4 bitselect(long4 x, long4 y, long4 mask) +{ + return (x & ~mask) | (y & mask); +} + +SIMD_CALL long2 long2m(long1 x) +{ + return x; +} +SIMD_CALL long2 long2m(long1 x, long1 y) +{ + return {x, y}; +} + +SIMD_CALL long3 long3m(long1 x) +{ + return x; +} +SIMD_CALL long3 long3m(long1 x, long1 y, long1 z) +{ + return {x, y, z}; +} +SIMD_CALL long3 long3m(long2 v, long1 z) +{ + long3 r; + r.xy = v; + r.z = z; + return r; +} + +SIMD_CALL long4 long4m(long1 x) +{ + return x; +} +SIMD_CALL long4 long4m(long2 xy, long2 zw) +{ + long4 r; + r.xy = xy; + r.zw = zw; + return r; +} +SIMD_CALL long4 long4m(long1 x, long1 y, long1 z, long1 w) +{ + return {x, y, z, w}; +} +SIMD_CALL long4 long4m(long3 v, long1 w) +{ + long4 r; + r.xyz = v; + r.w = w; + return r; +} + +} //namespace SIMD_NAMESPACE +#endif // __cplusplus +#endif // USE_SIMDLIB && SIMD_LONG diff --git a/libkram/vectormath/module.modulemap b/libkram/vectormath/module.modulemap new file mode 100644 index 00000000..8f7ad310 --- /dev/null +++ b/libkram/vectormath/module.modulemap @@ -0,0 +1,17 @@ +module vectormath234 { + // All headers are pulled in by this. + requires cplusplus20 + + // This defaults to namespace simdk + header "vectormath234.h" + + // These aren't yet independent includes. 
+ // header "int234.h" + // header "long234.h" + // header "float234.h" + // header "double234.h" + // header "bounds234.h" + + export * +} + diff --git a/libkram/vectormath/sse2neon-arm64.h b/libkram/vectormath/sse2neon-arm64.h new file mode 100644 index 00000000..79a6c38e --- /dev/null +++ b/libkram/vectormath/sse2neon-arm64.h @@ -0,0 +1,7969 @@ +#pragma once +// clang-format off + +#ifndef SSE2NEON_H +#define SSE2NEON_H + +//#include +//#if TARGET_OS_MACCATALYST +//#warning - this code won't compile for iOS MacCatalyst, switch target. +//#endif + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2024 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel +// Jonathan Hue +// Cuda Chen +// Aymen Qader +// Anthony Roberts + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +//#ifndef SSE2NEON_PRECISE_MINMAX +//#define SSE2NEON_PRECISE_MINMAX (0) +//#endif +///* _mm_rcp_ps */ +//#ifndef SSE2NEON_PRECISE_DIV +//#define SSE2NEON_PRECISE_DIV (0) +//#endif +///* _mm_sqrt_ps and _mm_rsqrt_ps */ +//#ifndef SSE2NEON_PRECISE_SQRT +//#define SSE2NEON_PRECISE_SQRT (0) +//#endif +///* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. 
+ */ +//#ifndef SSE2NEON_INCLUDE_WINDOWS_H +//#define SSE2NEON_INCLUDE_WINDOWS_H (0) +//#endif + +/* compiler specific definitions */ +//#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +//#elif defined(_MSC_VER) +//#if _MSVC_TRADITIONAL +//#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +//#endif +//#ifndef FORCE_INLINE +//#define FORCE_INLINE static inline +//#endif +//#ifndef ALIGN_STRUCT +//#define ALIGN_STRUCT(x) __declspec(align(x)) +//#endif +//#define _sse2neon_likely(x) (x) +//#define _sse2neon_unlikely(x) (x) +//#else +//#pragma message("Macro name collisions may happen with unsupported compilers.") +//#endif + + +//#if defined(__GNUC__) && !defined(__clang__) +//#pragma push_macro("FORCE_INLINE_OPTNONE") +//#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0"))) +//#elif defined(__clang__) +#pragma push_macro("FORCE_INLINE_OPTNONE") +#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone)) +//#else +//#define FORCE_INLINE_OPTNONE FORCE_INLINE +//#endif + +//#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +//#warning "GCC versions earlier than 10 are not supported." +//#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2neon_const static const +#else +#define _sse2neon_const const +#endif + +#include +#include +#include + +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} + +//#if defined(_WIN32) +///* Definitions for _mm_{malloc,free} are provided by +// * from both MinGW-w64 and MSVC. +// */ +//#define SSE2NEON_ALLOC_DEFINED +//#endif +// +///* If using MSVC */ +//#ifdef _MSC_VER +//#include +//#if SSE2NEON_INCLUDE_WINDOWS_H +//#include +//#include +//#endif + +#if !defined(__cplusplus) +#error SSE2NEON only supports C++ compilation with this compiler +#endif + +//#ifdef SSE2NEON_ALLOC_DEFINED +//#include +//#endif + +//#if (defined(_M_AMD64) || defined(__x86_64__)) || \ +// (defined(_M_ARM64) || defined(__arm64__)) +//#define SSE2NEON_HAS_BITSCAN64 +//#endif +//#endif + +//#if defined(__GNUC__) || defined(__clang__) +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +//#else +//#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +//#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +//#define _sse2neon_define2(type, a, b, body) \ +// [](type _a, type _b) { body }((a), (b)) +//#define _sse2neon_return(ret) return ret +//#endif + +#define _sse2neon_init(...) 
\ + { \ + __VA_ARGS__ \ + } + +/* Compiler barrier */ +//#if defined(_MSC_VER) && !defined(__clang__) +//#define SSE2NEON_BARRIER() _ReadWriteBarrier() +//#else +#define SSE2NEON_BARRIER() \ + do { \ + __asm__ __volatile__("" ::: "memory"); \ + (void) 0; \ + } while (0) +//#endif + +/* Memory barriers + * __atomic_thread_fence does not include a compiler barrier; instead, + * the barrier is part of __atomic_load/__atomic_store's "volatile-like" + * semantics. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#include +#endif + +FORCE_INLINE void _sse2neon_smp_mb(void) +{ + SSE2NEON_BARRIER(); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__GNUC__) || defined(__clang__) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else /* MSVC */ + __dmb(_ARM64_BARRIER_ISH); +#endif +} + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +/* + #if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#elif __ARM_ARCH == 8 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error \ + "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." +#endif +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#endif +#else +#error \ + "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \ +(you could try setting target explicitly with -march or -mcpu)" +#endif +#endif +*/ + +#include +//#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +//#if defined __has_include && __has_include() +//#include +//#endif +//#endif + +/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD + * and other Arm microarchitectures use. + * From sysctl -a on Apple M1: + * hw.cachelinesize: 128 + */ +#if defined(__APPLE__) +#define SSE2NEON_CACHELINE_SIZE 128 +#else +#define SSE2NEON_CACHELINE_SIZE 64 +#endif + +/* Rounding functions require either Aarch64 instructions or libm fallback */ +//#if !defined(__aarch64__) && !defined(_M_ARM64) +//#include +//#endif + +/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only + * or even not accessible in user mode. + * To write or access to these registers in user mode, + * we have to perform syscall instead. + */ +//#if (!defined(__aarch64__) && !defined(_M_ARM64)) +//#include +//#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. 
+ */ +//#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +///* Compatibility with gcc <= 9 */ +//#if defined(__GNUC__) && (__GNUC__ <= 9) +//#define __has_builtin(x) HAS##x +//#define HAS__builtin_popcount 1 +//#define HAS__builtin_popcountll 1 +// +//// __builtin_shuffle introduced in GCC 4.7.0 +//#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) +//#define HAS__builtin_shuffle 1 +//#else +//#define HAS__builtin_shuffle 0 +//#endif +// +//#define HAS__builtin_shufflevector 0 +//#define HAS__builtin_nontemporal_store 0 +//#else +//#define __has_builtin(x) 0 +//#endif +//#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +//#if __has_builtin(__builtin_shufflevector) +#define _sse2neon_shuffle(type, a, b, ...) \ + __builtin_shufflevector(a, b, __VA_ARGS__) +//#elif __has_builtin(__builtin_shuffle) +//#define _sse2neon_shuffle(type, a, b, ...) \ +// __extension__({ \ +// type tmp = {__VA_ARGS__}; \ +// __builtin_shuffle(a, b, tmp); \ +// }) +//#endif + +#ifdef _sse2neon_shuffle +#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__) +#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__) +#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__) +#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__) +#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__) +#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__) +#endif + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. 
An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +//#if defined(__aarch64__) || defined(_M_ARM64) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +//#else +//typedef float32x4_t __m128d; +//#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// Some intrinsics operate on unaligned data types. +typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) 
vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) + + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. 
+ uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(void); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + + +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} + +// Wraps vaddv_u8 +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + return vaddv_u8(v8); +} + +// emulate vaddvq u8 variant +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8_t res = 0; + for (int i = 0; i < 8; ++i) + res += tmp[i]; + return res; +} + +// Wraps vaddvq_u16 +FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) +{ + return vaddvq_u16(a); +} + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors contain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. 
The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ +}; + +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; + uint32_t res3; +} fpcr_bitfield; + +/* +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits 
of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); // TODO: use vzip ? + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} +*/ + +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. 
If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if (defined(_M_ARM64) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +//#if defined(_MSC_VER) && !defined(__clang__) +// __n64 a1 = {a}, b1 = {b}; +// return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +//#else + return vreinterpretq_u64_p128(vmull_p64(a, b)); +//#endif +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. +// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. 
+ uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. 
+// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +#define _mm_shuffle_ps_default(a, b, imm) \ + vreinterpretq_m128_f32(vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 +#define _mm_shufflelo_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 +#define _mm_shufflehi_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +/* MMX */ + +//_mm_empty is a no-op on arm +FORCE_INLINE void _mm_empty(void) {} + +/* SSE */ + +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of . 
+ return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. 
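+// Unlike _mm_cmplt_ps, the "not greater-than-or-equal" comparison is also
+// true when either input is NaN, which is why it is implemented as the
+// bitwise NOT of vcgeq_f32 rather than as vcltq_f32. Illustrative: for a
+// lane pair (NaN, 1.0f), cmplt yields 0 but cmpnge yields all-ones.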
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. 
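+// A result lane is all-ones only when both inputs in that lane are ordered
+// (non-NaN). Illustrative example (values chosen for this note): with lanes
+// a = {1.0f, NAN, 2.0f, 3.0f} and b = {4.0f, 5.0f, NAN, 6.0f}, the result
+// lanes are {~0, 0, 0, ~0}.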
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). 
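+// Only lane 0 of each input participates; the upper lanes are ignored.
+// Illustrative example (values chosen for this note): lower elements
+// (3.0f, 2.0f) return 1, while (2.0f, 3.0f) or a NaN in either lower
+// element returns 0.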
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); + +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. 
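+// Illustrative example (values chosen for this note, not from the upstream
+// header): an __m64 holding the 16-bit integers {1, -2, 3, -4} converts to
+// the float lanes {1.0f, -2.0f, 3.0f, -4.0f}.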
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + return vreinterpret_m64_s8(vqmovn_s16( + vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
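+// Truncation rounds toward zero, unlike _mm_cvtps_pi32 which follows the
+// current rounding mode (round-to-nearest-even by default). Illustrative
+// example (values chosen for this note): 1.7f and -1.7f truncate to 1 and
+// -1, whereas _mm_cvtps_pi32 would give 2 and -2.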
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Divide the lower single-precision (32-bit) floating-point element in a by the +// lower single-precision (32-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper 3 packed elements from a to +// the upper elements of dst. +// Warning: ARMv7-A does not produce the same result compared to Intel and not +// IEEE-compliant. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Free aligned memory that was allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +//#if !defined(SSE2NEON_ALLOC_DEFINED) +//FORCE_INLINE void _mm_free(void *addr) +//{ +// free(addr); +//} +//#endif + +FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) +{ + uint64_t value; +//#if defined(_MSC_VER) && !defined(__clang__) +// value = _ReadStatusReg(ARM64_FPCR); +//#else + __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ +//#endif + return value; +} + +FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) +{ +//#if defined(_MSC_VER) && !defined(__clang__) +// _WriteStatusReg(ARM64_FPCR, value); +//#else + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ +//#endif +} + +// Macro: Get the flush zero bits from the MXCSR control and status register. 
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; +} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. +// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + if (r.field.bit22) { + return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; + } else { + return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + } +} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))) + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. 
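+// Illustrative example (values chosen for this note, not from the upstream
+// header): with a = {a0, a1, a2, a3} and p pointing at {p0, p1}, the result
+// is {a0, a1, p0, p1}.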
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); +} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +//#if !defined(SSE2NEON_ALLOC_DEFINED) +//FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +//{ +// void *ptr; +// if (align == 1) +// return malloc(size); +// if (align == 2 || (sizeof(void *) == 8 && align == 4)) +// align = sizeof(void *); +// if (!posix_memalign(&ptr, align, size)) +// return ptr; +// return NULL; +//} +//#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. 
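+// Only the bytes of a whose corresponding mask byte has bit 7 set overwrite
+// memory; the remaining destination bytes keep their previous values.
+// Illustrative example (values chosen for this note): with mask bytes
+// {0x80, 0, 0x80, 0, 0, 0, 0, 0}, only bytes 0 and 2 of mem_addr are
+// rewritten with bytes 0 and 2 of a.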
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. 
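+// Note: vminnmq_f32 returns the non-NaN operand when exactly one input lane
+// is NaN, while x86 minps returns the second operand (b) whenever either
+// input is NaN, so results may differ from Intel hardware when b is NaN.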
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the upper 2 single-precision (32-bit) floating-point elements from b to +// the lower 2 elements of dst, and copy the upper 2 elements from a to the +// upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps +FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u64( + vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a))); +} + +// Move the lower 2 single-precision (32-bit) floating-point elements from b to +// the upper 2 elements of dst, and copy the lower 2 elements from a to the +// lower 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) +{ + uint8x8_t input = vreinterpret_u8_m64(a); + static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, vld1_s8(shift))); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in a. 
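+// The implementation shifts each lane's sign bit down to bit 0, shifts lane i
+// left by i, and horizontally adds the lanes, producing a 4-bit mask.
+//
+// C equivalent (illustrative sketch, not part of the upstream header):
+//   int movemask_ps(const float a[4]) {
+//       int mask = 0;
+//       for (int i = 0; i < 4; i++)
+//           mask |= (signbit(a[i]) ? 1 : 0) << i;  // bit i = sign of lane i
+//       return mask;
+//   }
+//
+// Illustrative example (values chosen for this note): lanes
+// {4.0f, -3.0f, 2.0f, -1.0f} (lane 0 first) produce 0b1010 = 10.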
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); + static const int32_t shift[4] = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift))); +} + +// Multiply packed single-precision (32-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Fetch the line of data from memory that contains address p to a location in +// the cache hierarchy specified by the locality hint i. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch +FORCE_INLINE void _mm_prefetch(char const *p, int i) +{ + (void) i; +//#if defined(_MSC_VER) && !defined(__clang__) +// switch (i) { +// case _MM_HINT_NTA: +// __prefetch2(p, 1); +// break; +// case _MM_HINT_T0: +// __prefetch2(p, 0); +// break; +// case _MM_HINT_T1: +// __prefetch2(p, 2); +// break; +// case _MM_HINT_T2: +// __prefetch2(p, 4); +// break; +// } +//#else + switch (i) { + case _MM_HINT_NTA: + __builtin_prefetch(p, 0, 0); + break; + case _MM_HINT_T0: + __builtin_prefetch(p, 0, 3); + break; + case _MM_HINT_T1: + __builtin_prefetch(p, 0, 2); + break; + case _MM_HINT_T2: + __builtin_prefetch(p, 0, 1); + break; + } +//#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw +#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. 
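+// In this port the reciprocal is computed with a full-precision divide
+// (1.0f / x) rather than the x86 approximation instruction, so the result is
+// correctly rounded and well within the stated error bound.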
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + return _mm_div_ps(_mm_set1_ps(1.0f), in); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +} + +// Compute the square root of the lower single-precision (32-bit) floating-point +// element in a, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + return _mm_rcp_ps(_mm_sqrt_ps(in)); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint64x1_t t = vpaddl_u32(vpaddl_u16( + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); + return vreinterpret_m64_u16( + vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); +} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. 
The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE +FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; + + _sse2neon_set_fpcr(r.value); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + + _sse2neon_set_fpcr(r.value); +} + +// Set the MXCSR control and status register with the value in unsigned 32-bit +// integer a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +FORCE_INLINE unsigned int _mm_getcsr(void) +{ + return _MM_GET_ROUNDING_MODE(); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Return vector of type __m128 with all elements set to zero. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +//#ifdef _sse2neon_shuffle +#define _mm_shuffle_pi16(a, imm) \ + vreinterpret_m64_s16(vshuffle_s16( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +//#else +//#define _mm_shuffle_pi16(a, imm) \ +// _sse2neon_define1( \ +// __m64, a, int16x4_t ret; \ +// ret = vmov_n_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ +// ret = vset_lane_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ +// 1); \ +// ret = vset_lane_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ +// 2); \ +// ret = vset_lane_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ +// 3); \ +// _sse2neon_return(vreinterpret_m64_s16(ret));) +//#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +FORCE_INLINE void _mm_sfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +FORCE_INLINE void _mm_mfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
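+// Note: _mm_sfence, _mm_mfence, and _mm_lfence all map to the same
+// _sse2neon_smp_mb barrier here, which is at least as strong as the x86
+// load-only fence and therefore preserves the required ordering.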
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence +FORCE_INLINE void _mm_lfence(void) +{ + _sse2neon_smp_mb(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +//#ifdef _sse2neon_shuffle +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = \ + vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +//#else // generic +//#define _mm_shuffle_ps(a, b, imm) \ +// _sse2neon_define2( \ +// __m128, a, b, __m128 ret; switch (imm) { \ +// case _MM_SHUFFLE(1, 0, 3, 2): \ +// ret = _mm_shuffle_ps_1032(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 3, 0, 1): \ +// ret = _mm_shuffle_ps_2301(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 3, 2, 1): \ +// ret = _mm_shuffle_ps_0321(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 1, 0, 3): \ +// ret = _mm_shuffle_ps_2103(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 1, 0): \ +// ret = _mm_movelh_ps(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 0, 1): \ +// ret = _mm_shuffle_ps_1001(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 1, 0, 1): \ +// ret = _mm_shuffle_ps_0101(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(3, 2, 1, 0): \ +// ret = _mm_shuffle_ps_3210(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 0, 1, 1): \ +// ret = _mm_shuffle_ps_0011(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 0, 2, 2): \ +// ret = _mm_shuffle_ps_0022(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 2, 0, 0): \ +// ret = _mm_shuffle_ps_2200(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(3, 2, 0, 2): \ +// ret = _mm_shuffle_ps_3202(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(3, 2, 3, 2): \ +// ret = _mm_movehl_ps(_b, _a); \ +// break; \ +// case _MM_SHUFFLE(1, 1, 3, 3): \ +// ret = _mm_shuffle_ps_1133(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 0, 1, 0): \ +// ret = _mm_shuffle_ps_2010(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 0, 0, 1): \ +// ret = _mm_shuffle_ps_2001(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 0, 3, 2): \ +// ret = _mm_shuffle_ps_2032(_a, _b); \ +// break; \ +// default: \ +// ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ +// break; \ +// } _sse2neon_return(ret);) +//#endif + + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store the upper 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Store the lower 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores 16-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 +FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) +{ + vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); +} + +// Stores 64-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 +FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) +{ + vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Subtract packed single-precision (32-bit) floating-point elements in b from +// packed single-precision (32-bit) floating-point elements in a, and store the +// results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* don't need these calls + +// Return vector of type __m128i with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} +*/ + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. 
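Editor's aside: a usage sketch for the transpose macro above (not part of the patch):

#include "sse2neon.h" // assumed include path

static inline void transpose_demo(void)
{
    __m128 r0 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
    __m128 r1 = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
    __m128 r2 = _mm_setr_ps(8.0f, 9.0f, 10.0f, 11.0f);
    __m128 r3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);

    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    // Now each register holds a column of the original matrix:
    // r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13}, r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}
    (void) r0; (void) r1; (void) r2; (void) r3;
}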
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_add_pd(a, b)); +} + +// Add 64-bit integers a and b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. 
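Editor's aside: a sketch of wrapping vs. saturating addition (not part of the patch):

#include "sse2neon.h" // assumed include path

static inline void saturating_add_demo(void)
{
    __m128i a = _mm_set1_epi16(30000);
    __m128i b = _mm_set1_epi16(10000);

    __m128i wrapped   = _mm_add_epi16(a, b);  // 40000 wraps around to -25536 per lane
    __m128i saturated = _mm_adds_epi16(a, b); // clamped to INT16_MAX (32767) per lane
    (void) wrapped;
    (void) saturated;
}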
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+}
+
+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+ */ +#if defined(__APPLE__) + sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE); +#elif defined(__GNUC__) || defined(__clang__) + uintptr_t ptr = (uintptr_t) p; + __builtin___clear_cache((char *) ptr, + (char *) ptr + SSE2NEON_CACHELINE_SIZE); +#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H + FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE); +#endif +} + +// Compare packed 16-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 8-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd +FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd +FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd +FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmple_pd(a, b)); +} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd +FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. 
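Editor's aside: "ordered" above means neither operand is NaN, "unordered" means at least one is. A sketch (not part of the patch):

#include <math.h>
#include "sse2neon.h" // assumed include path

static inline void ordered_demo(void)
{
    __m128d x = _mm_set_pd(1.0, NAN); // lane 0 = NaN, lane 1 = 1.0
    __m128d y = _mm_set_pd(2.0, 3.0); // lane 0 = 3.0, lane 1 = 2.0

    __m128d ord   = _mm_cmpord_pd(x, y);   // lane 0 = 0 (NaN present), lane 1 = all-ones
    __m128d unord = _mm_cmpunord_pd(x, y); // lane 0 = all-ones, lane 1 = 0
    (void) ord;
    (void) unord;
}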
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) +{ +// vrnd32xq_f64 not supported on clang +#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) + float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); + int64x2_t integers = vcvtq_s64_f64(rounded); + return vreinterpretq_m128i_s32( + vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__ARM_FEATURE_FRINT) + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); +#else + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower 32-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. 
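Editor's aside: a sketch of how the current rounding mode affects _mm_cvtps_epi32, versus the truncating _mm_cvttps_epi32 defined further below (not part of the patch; assumes _MM_SET_ROUNDING_MODE updates the FP rounding state as elsewhere in this header):

#include "sse2neon.h" // assumed include path

static inline void rounding_demo(void)
{
    __m128 v = _mm_setr_ps(1.5f, 2.5f, -1.5f, 1.9f);

    __m128i nearest = _mm_cvtps_epi32(v);  // {2, 2, -2, 2} (round-to-nearest-even default)
    __m128i trunc   = _mm_cvttps_epi32(v); // {1, 2, -1, 1} (always toward zero)

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    __m128i floored = _mm_cvtps_epi32(v);  // {1, 2, -2, 1}
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    (void) nearest; (void) trunc; (void) floored;
}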
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd +FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) +{ + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Copy 32-bit integer a to the lower elements of dst, and zero the upper +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd +FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) +{ + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128 +#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd +#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ + return vreinterpretq_m128d_f64(vld1q_f64(p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +} + +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +} + +// Load 64-bit integer from memory into the first element of dst. 
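Editor's aside: a sketch for the lane-access macros above; the lane index must be a compile-time constant (not part of the patch):

#include "sse2neon.h" // assumed include path

static inline void lane_demo(void)
{
    __m128i v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);

    int lane3 = _mm_extract_epi16(v, 3);    // 13, zero-extended to int
    __m128i w = _mm_insert_epi16(v, -1, 7); // lane 7 replaced with 0xFFFF
    (void) lane3;
    (void) w;
}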
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +} + +// Loads two double-precision from unaligned memory, floating-point values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Load 128-bits of integer data from memory into dst. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Horizontally add adjacent pairs of intermediate +// 32-bit integers, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16 +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t high = + vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); + + return vreinterpretq_m128i_s32(vpaddq_s32(low, high)); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. mem_addr does not need to be aligned +// on any particular boundary. 
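Editor's aside: _mm_madd_epi16 is the usual building block for 16-bit dot products. A sketch (not part of the patch):

#include <stdint.h>
#include "sse2neon.h" // assumed include path

// Dot product of eight int16 pairs.
static inline int32_t dot8_i16(const int16_t *a, const int16_t *b)
{
    __m128i va    = _mm_loadu_si128((const __m128i *) a);
    __m128i vb    = _mm_loadu_si128((const __m128i *) b);
    __m128i pairs = _mm_madd_epi16(va, vb); // {a0*b0+a1*b1, a2*b2+a3*b3, a4*b4+a5*b5, a6*b6+a7*b7}

    // Horizontal sum of the four 32-bit lanes.
    __m128i sum2 = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));
    __m128i sum1 = _mm_add_epi32(sum2, _mm_srli_si128(sum2, 4));
    return _mm_cvtsi128_si32(sum1);
}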
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128 +FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) +{ + int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x16_t masked = + vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), + vreinterpretq_s8_m128(b)); + vst1q_s8((int8_t *) mem_addr, masked); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_max_pd(a, b)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. 
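Editor's aside: min/max compose into a branchless clamp. A sketch (not part of the patch):

#include "sse2neon.h" // assumed include path

// Clamp each signed 16-bit lane of v into [lo, hi] (assumes lo <= hi per lane).
static inline __m128i clamp_epi16(__m128i v, __m128i lo, __m128i hi)
{
    return _mm_min_epi16(_mm_max_epi16(v, lo), hi);
}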
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_min_pd(a, b)); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. 
\| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) +{ + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return (int) (vgetq_lane_u64(high_bits, 0) | + (vgetq_lane_u64(high_bits, 1) << 1)); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. 
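+// Illustrative example (operand values are assumptions picked for clarity):
+//   0x4000 * 0x4000 = 0x10000000, so the lane stored in dst is 0x1000;
+//   (-1) * 1 = 0xFFFFFFFF as a 32-bit product, so the stored lane is 0xFFFF.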
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. 
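+// Illustrative example (lane values are assumptions picked for clarity):
+//   70000 saturates to 32767 (0x7FFF) and -70000 saturates to -32768
+//   (0x8000); values already in 16-bit range pass through unchanged. The
+//   lanes of a fill the low half of dst and the lanes of b the high half.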
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit because it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause +FORCE_INLINE void _mm_pause(void) +{ +//#if defined(_MSC_VER) && !defined(__clang__) +// __isb(_ARM64_BARRIER_SY); +//#else + __asm__ __volatile__("isb\n"); +//#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); +} + +// Set packed 16-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16 +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Set packed 32-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32 +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0)); +} + +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Set packed 8-bit integers in dst with the supplied values. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ + return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); +} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. 
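+// Illustrative example (lane values are assumptions picked for clarity):
+//   _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7) puts 0 in the lowest lane and 7
+//   in the highest, i.e. it matches _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0).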
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. 
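+// Illustrative example (control values are assumptions picked for clarity,
+// assuming the usual _MM_SHUFFLE(z, y, x, w) encoding):
+//   _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 2, 1, 0)) returns a unchanged, and
+//   _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0)) broadcasts lane 0; each
+//   2-bit field of imm8 selects the source lane for one destination lane,
+//   starting with the lowest.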
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +//#if defined(_sse2neon_shuffle) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = \ + vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +//#else // generic +//#define _mm_shuffle_epi32(a, imm) \ +// _sse2neon_define1( \ +// __m128i, a, __m128i ret; switch (imm) { \ +// case _MM_SHUFFLE(1, 0, 3, 2): \ +// ret = _mm_shuffle_epi_1032(_a); \ +// break; \ +// case _MM_SHUFFLE(2, 3, 0, 1): \ +// ret = _mm_shuffle_epi_2301(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 3, 2, 1): \ +// ret = _mm_shuffle_epi_0321(_a); \ +// break; \ +// case _MM_SHUFFLE(2, 1, 0, 3): \ +// ret = _mm_shuffle_epi_2103(_a); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 1, 0): \ +// ret = _mm_shuffle_epi_1010(_a); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 0, 1): \ +// ret = _mm_shuffle_epi_1001(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 1, 0, 1): \ +// ret = _mm_shuffle_epi_0101(_a); \ +// break; \ +// case _MM_SHUFFLE(2, 2, 1, 1): \ +// ret = _mm_shuffle_epi_2211(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 1, 2, 2): \ +// ret = _mm_shuffle_epi_0122(_a); \ +// break; \ +// case _MM_SHUFFLE(3, 3, 3, 2): \ +// ret = _mm_shuffle_epi_3332(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 0, 0, 0): \ +// ret = _mm_shuffle_epi32_splat(_a, 0); \ +// break; \ +// case _MM_SHUFFLE(1, 1, 1, 1): \ +// ret = _mm_shuffle_epi32_splat(_a, 1); \ +// break; \ +// case _MM_SHUFFLE(2, 2, 2, 2): \ +// ret = _mm_shuffle_epi32_splat(_a, 2); \ +// break; \ +// case _MM_SHUFFLE(3, 3, 3, 3): \ +// ret = _mm_shuffle_epi32_splat(_a, 3); \ +// break; \ +// default: \ +// ret = _mm_shuffle_epi32_default(_a, (imm)); \ +// break; \ +// } _sse2neon_return(ret);) +//#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. 
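+// Illustrative example (control values are assumptions picked for clarity):
+//   bit 0 of imm8 selects the a lane for the low half of dst and bit 1
+//   selects the b lane for the high half, so imm8 == 0 gives
+//   {lo=a[0], hi=b[0]} and imm8 == 3 gives {lo=a[1], hi=b[1]}.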
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +//#ifdef _sse2neon_shuffle +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64( \ + vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +//#else +//#define _mm_shuffle_pd(a, b, imm8) \ +// _mm_castsi128_pd(_mm_set_epi64x( \ +// vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ +// vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +//#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +//#if defined(_sse2neon_shuffle) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = \ + vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +//#else // generic +//#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +//#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +//#if defined(_sse2neon_shuffle) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = vshuffleq_s16( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +//#else // generic +//#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +//#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sqrt_pd(b)); +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16( + vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. 
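+// Illustrative example (lane values are assumptions picked for clarity):
+//   a lane holding -8 shifted right by 1 becomes -4 because copies of the
+//   sign bit are shifted in; a count above 31 leaves each lane as 0 or -1
+//   depending on its sign, which the comparison against zero below handles.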
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32( + vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
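+// Illustrative example (lane values are assumptions picked for clarity):
+//   a lane holding 0x8000 shifted right by 1 becomes 0x4000 because zeros
+//   are shifted in, unlike the sign-extending _mm_srai_* forms above; any
+//   imm8 above 15 zeroes every lane.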
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + (imm > 15 ? 0 : imm)); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (__m128d *) p); +#else + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#endif +} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +/* don't need this call +// Return vector of type __m128d with undefined elements. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) && !defined(__clang__) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} +*/ + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), + vreinterpretq_f32_m128(mask), + vreinterpretq_f32_m128(b))); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. 
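+// Illustrative example (lane values are assumptions picked for clarity):
+//   _mm_hadd_ps({a0,a1,a2,a3}, {b0,b1,b2,b3}) returns
+//   {a0+a1, a2+a3, b0+b1, b2+b3}, which is exactly the pairwise add that
+//   vpaddq_f32 performs below.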
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) +{ + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64( + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32( + vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ + return vreinterpretq_m128_f32( + vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ + return vreinterpretq_m128_f32( + vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + uint8x16_t _a = vreinterpretq_u8_m128i(a); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) \ + ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ + else \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \ + ret; \ + }) + +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) ret = \ + _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ + else ret = \ + vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ + _sse2neon_return(ret);) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. 
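+// Illustrative example (imm8 value is an assumption picked for clarity):
+//   with imm8 = 3 the 16-byte concatenation a:b is shifted right by 3
+//   bytes, so dst = {b[3..7], a[0..2]}; imm8 of 8 or more pulls bytes from
+//   a alone (zero-filled at the top), and imm8 of 16 or more gives zeros.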
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low; \ + uint8x8_t tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(_a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(_b); \ + tmp_high = vreinterpret_u8_m64(_a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } _sse2neon_return(ret);) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_m128i_s16( + vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. 
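// Editor's note: a small check of the lane order produced by the vuzp1/vuzp2
// based _mm_hsub_epi16 emulation above (helper name hypothetical).
static inline __m128i example_hsub_lane_order(void)
{
    __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);         // a0..a7 = 1..8
    __m128i b = _mm_set_epi16(80, 70, 60, 50, 40, 30, 20, 10); // b0..b7 = 10..80
    // dst = {a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7}
    //     = {-1, -1, -1, -1, -10, -10, -10, -10}
    return _mm_hsub_epi16(a, b);
}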
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. 
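// Editor's note: _mm_mulhrs_epi16 above is the usual Q15 fixed-point multiply;
// a hypothetical sketch scaling samples by 0.5 (0x4000 in Q15) with rounding.
static inline __m128i example_scale_half_q15(__m128i samples)
{
    const __m128i half = _mm_set1_epi16(0x4000); // 0.5 in Q15
    // result = (samples * 0x4000 + 0x4000) >> 15 == round(samples / 2)
    return _mm_mulhrs_epi16(samples, half);
}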
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 +FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) +{ + const int8x8_t controlMask = + vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); + int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); + return vreinterpret_m64_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 
0xFF : 0 + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); + + // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); + + // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); + + // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); + + // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. 
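// Editor's note: one common use of the _mm_sign_* family above is conditional
// negation; with b == a it also yields an absolute value, matching
// _mm_abs_epi16 including the INT16_MIN wraparound (helper name hypothetical).
static inline __m128i example_abs_via_sign(__m128i a)
{
    // a < 0 -> -a, a == 0 -> 0, a > 0 -> a
    return _mm_sign_epi16(a, a);
}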
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, \ + const uint16_t _mask[8] = \ + _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ + uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + _sse2neon_define2( \ + __m128d, a, b, \ + const uint64_t _mask[2] = \ + _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t __a = vreinterpretq_u64_m128d(_a); \ + uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
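// Editor's note: illustrative branchless select with the _mm_blendv_epi8
// emulation above (helper name hypothetical); selection is driven purely by
// the sign bit of each mask byte.
static inline __m128i example_zero_negative_bytes(__m128i v)
{
    // v doubles as the mask: bytes with the sign bit set take the value from
    // _mm_setzero_si128(), all other bytes keep their original value.
    return _mm_blendv_epi8(v, _mm_setzero_si128(), v);
}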
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_ceil_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_ceil_ps(b)); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +} + +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. 
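// Editor's note: a hypothetical sketch using _mm_cmpeq_epi64 above to test
// whether both 64-bit lanes of two keys match.
static inline int example_keys_equal(__m128i a, __m128i b)
{
    // Each equal 64-bit lane becomes all-ones, so a full match sets all
    // sixteen bits of the byte mask.
    return _mm_movemask_epi8(_mm_cmpeq_epi64(a, b)) == 0xFFFF;
}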
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. 
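// Editor's note: the sign-extending conversions above only widen the low half
// of the source register; a hypothetical sketch widening all sixteen int8
// lanes to int16.
static inline void example_widen_s8x16(__m128i v, __m128i *lo, __m128i *hi)
{
    *lo = _mm_cvtepi8_epi16(v);                    // lanes 0..7
    *hi = _mm_cvtepi8_epi16(_mm_srli_si128(v, 8)); // lanes 8..15
}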
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd +FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) +{ + // Generate mask value from constant immediate bit value + const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; + const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; +#if !SSE2NEON_PRECISE_DP + const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; + const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; +#endif + // Conditional multiplication +#if !SSE2NEON_PRECISE_DP + __m128d mul = _mm_mul_pd(a, b); + const __m128d mulMask = + _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); + __m128d tmp = _mm_and_pd(mul, mulMask); +#else + double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) + : 0; + double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) + : 0; + __m128d tmp = _mm_set_pd(d1, d0); +#endif + // Sum the products + double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); + + // Conditionally store the sum + const __m128d sumMask = + _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); + __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); + return res; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. 
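// Editor's note: hypothetical 2-D dot product using _mm_dp_pd above; imm8
// 0x31 multiplies both lanes (bits 4-5 set) and stores the sum in lane 0 only
// (bit 0 set).
static inline double example_dot2(__m128d a, __m128d b)
{
    return _mm_cvtsd_f64(_mm_dp_pd(a, b, 0x31));
}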
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ + float32x4_t elementwise_prod = _mm_mul_ps(a, b); + + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } + + if ((imm & 0x0F) == 0x0F) { + if (!(imm & (1 << 4))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0); + if (!(imm & (1 << 5))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1); + if (!(imm & (1 << 6))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2); + if (!(imm & (1 << 7))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3); + + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } + + float s = 0.0f; + + if (imm & (1 << 4)) + s += vgetq_lane_f32(elementwise_prod, 0); + if (imm & (1 << 5)) + s += vgetq_lane_f32(elementwise_prod, 1); + if (imm & (1 << 6)) + s += vgetq_lane_f32(elementwise_prod, 2); + if (imm & (1 << 7)) + s += vgetq_lane_f32(elementwise_prod, 3); + + const float32_t res[4] = { + (imm & 0x1) ? s : 0.0f, + (imm & 0x2) ? s : 0.0f, + (imm & 0x4) ? s : 0.0f, + (imm & 0x8) ? s : 0.0f, + }; + return vreinterpretq_m128_f32(vld1q_f32(res)); +} + +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, +// __constrange(0,16) int imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) +{ + return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. 
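// Editor's note: _mm_extract_ps above returns the raw IEEE-754 bit pattern of
// the selected lane as an int, not a converted value; a hypothetical sketch
// recovering the float via type punning.
static inline float example_lane2_value(__m128 v)
{
    union { int32_t i; float f; } u;
    u.i = _mm_extract_ps(v, 2); // bits of lane 2, not (int) of lane 2
    return u.f;
}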
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0); \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + _sse2neon_return(vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) + +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
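// Editor's note: the packed min/max above compose into the usual branchless
// clamp (hypothetical helper; assumes lo <= hi per lane).
static inline __m128i example_clamp_epi32(__m128i v, __m128i lo, __m128i hi)
{
    return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
}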
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value + min = vminvq_u16(vreinterpretq_u16_m128i(a)); + + // Get the index of the minimum value + static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16x8_t minv = vdupq_n_u16(min); + uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a)); + idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq)); + + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +//if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +//#elif defined(_MSC_VER) +// __assume(0); +//#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +//#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +//#elif defined(_MSC_VER) +// __assume(0); +//#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); + uint8x16_t _a_1 = vextq_u8(_a, _a, 1); + c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); + uint8x16_t _a_2 = vextq_u8(_a, _a, 2); + c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); + uint8x16_t _a_3 = vextq_u8(_a, _a, 3); + c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); + + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. 
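// Editor's note: hypothetical sketch unpacking the result layout of
// _mm_minpos_epu16 above -- the minimum lands in lane 0, its index in lane 1,
// and the remaining lanes are zeroed.
static inline void example_min_and_index(__m128i v, uint16_t *min_out, uint16_t *idx_out)
{
    __m128i r = _mm_minpos_epu16(v);
    *min_out = (uint16_t) _mm_extract_epi16(r, 0);
    *idx_out = (uint16_t) _mm_extract_epi16(r, 1);
}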
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding) +{ + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } + +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. +// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding) +{ + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
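// Editor's note: hypothetical wrapper showing the rounding-control constants
// accepted by _mm_round_ps above; the remaining modes reduce to the
// floor/ceil/truncate paths in the switch.
static inline __m128 example_round_nearest(__m128 a)
{
    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}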
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. 
+ // For comparison purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; + +/* specify the source data format */ +#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ +#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ +#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ +#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ + +/* specify the comparison operation */ +#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ +#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ +#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ +#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ + +/* specify the polarity */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ +#define _SIDD_MASKED_NEGATIVE_POLARITY \ + 0x30 /* negate results only before end of string */ + +/* specify the output selection in _mm_cmpXstri */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* specify the output selection in _mm_cmpXstrm */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Pattern Matching for C macros. 
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ +#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) + +#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) +/* run the 2nd parameter */ +#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define SSE2NEON_IIF_1(t, ...) t + +#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) +#define SSE2NEON_COMPL_0 1 +#define SSE2NEON_COMPL_1 0 + +#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) +#define SSE2NEON_DEC_1 0 +#define SSE2NEON_DEC_2 1 +#define SSE2NEON_DEC_3 2 +#define SSE2NEON_DEC_4 3 +#define SSE2NEON_DEC_5 4 +#define SSE2NEON_DEC_6 5 +#define SSE2NEON_DEC_7 6 +#define SSE2NEON_DEC_8 7 +#define SSE2NEON_DEC_9 8 +#define SSE2NEON_DEC_10 9 +#define SSE2NEON_DEC_11 10 +#define SSE2NEON_DEC_12 11 +#define SSE2NEON_DEC_13 12 +#define SSE2NEON_DEC_14 13 +#define SSE2NEON_DEC_15 14 +#define SSE2NEON_DEC_16 15 + +/* detection */ +#define SSE2NEON_CHECK_N(x, n, ...) n +#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) +#define SSE2NEON_PROBE(x) x, 1, + +#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) +#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) + +#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) +#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) + +#define SSE2NEON_EAT(...) +#define SSE2NEON_EXPAND(...) __VA_ARGS__ +#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) + +/* recursion */ +/* deferred expression */ +#define SSE2NEON_EMPTY() +#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() +#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() +#define SSE2NEON_EXPAND(...) __VA_ARGS__ + +#define SSE2NEON_EVAL(...) \ + SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) +#define SSE2NEON_EVAL1(...) \ + SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) +#define SSE2NEON_EVAL2(...) \ + SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) +#define SSE2NEON_EVAL3(...) __VA_ARGS__ + +#define SSE2NEON_REPEAT(count, macro, ...) 
\ + SSE2NEON_WHEN(count) \ + (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ + SSE2NEON_DEC(count), macro, \ + __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ + __VA_ARGS__)) +#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT + +#define SSE2NEON_SIZE_OF_byte 8 +#define SSE2NEON_NUMBER_OF_LANES_byte 16 +#define SSE2NEON_SIZE_OF_word 16 +#define SSE2NEON_NUMBER_OF_LANES_word 8 + +#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ + mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ + vreinterpretq_##type##_m128i(a))); + +#define SSE2NEON_FILL_LANE(i, type) \ + vec_b[i] = \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); + +#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ + number_of_lanes, byte_or_word) \ + do { \ + SSE2NEON_CAT( \ + data_type_prefix, \ + SSE2NEON_CAT(size, \ + SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ + vec_b[number_of_lanes]; \ + __m128i mask = SSE2NEON_IIF(byte_or_word)( \ + vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ + vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ + SSE2NEON_CAT(type_prefix, size))) \ + for (int i = 0; i < number_of_lanes; i++) { \ + mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ + size)(SSE2NEON_CAT(vbslq_u, size)( \ + SSE2NEON_CAT(vreinterpretq_u, \ + SSE2NEON_CAT(size, _m128i))(mask), \ + SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))), \ + SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))))); \ + } \ + } while (0) + +#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ + do { \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ + SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ + SSE2NEON_CAT(u, size))) \ + } while (0) + +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_ordered_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, \ + 
SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ + } + +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) + +static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + __m128i tmp = vreinterpretq_m128i_u32( + vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); + uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), + vreinterpretq_u32_m128i(tmp)); + int t = vaddvq_u32(vec_res) ? 1 : 0; + res |= (t << j); + } + return res; +} + +static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + __m128i tmp = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); + uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), + vreinterpretq_u16_m128i(tmp)); + int t = _sse2neon_vaddvq_u16(vec_res) ? 
1 : 0; + res |= (t << j); + } + return res; +} + +#define SSE2NEON_CMP_RANGES_IS_BYTE 1 +#define SSE2NEON_CMP_RANGES_IS_WORD 0 + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \ + prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ + prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ + prefix##IMPL(word, uint, u, prefix##IS_WORD) \ + prefix##IMPL(word, int, s, prefix##IS_WORD) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) + +#undef SSE2NEON_CMP_RANGES_IS_BYTE +#undef SSE2NEON_CMP_RANGES_IS_WORD + +static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint8x16_t mtx = + vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + + res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vec_mask); + res_hi = vand_u8(res_hi, vec_mask); + + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; +} + +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = + vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); + mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vec1, tmp, mtx); + mtx = vandq_u16(mtx, vec_mask); + return _sse2neon_vaddvq_u16(mtx); +} + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ + { \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ + uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ + vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ + vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ + uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ + uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ + uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ + for (int j = 0; j < lb; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ + vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ + } \ + for (int j = lb; j < bound; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size( \ + vbslq_u##size(vec1, vec_minusone, vec_zero)); \ + } \ + unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ + (unsigned SSE2NEON_IIF(data_type)(char, short) *)mtx; \ + for (int i = 0; i < bound; i++) { \ + int val = 1; \ + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ + val &= ptr[k * bound + j]; \ + res += val << i; \ + } \ + return res; \ + } + +/* clang-format off */ +#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ + prefix##IMPL(8, 16, prefix##IS_UBYTE) \ + prefix##IMPL(16, 8, prefix##IS_UWORD) +/* clang-format on */ + +SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) + +#define SSE2NEON_CMPESTR_LIST \ + _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ + _(CMP_UWORD_RANGES, cmp_uword_ranges) \ + _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ + _(CMP_SWORD_RANGES, cmp_sword_ranges) \ + _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ + _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) + +enum { +#define _(name, func_suffix) name, + SSE2NEON_CMPESTR_LIST +#undef _ +}; +typedef int (*cmpestr_func_t)(__m128i a, int 
la, __m128i b, int lb); +static cmpestr_func_t _sse2neon_cmpfunc_table[] = { +#define _(name, func_suffix) _sse2neon_##func_suffix, + SSE2NEON_CMPESTR_LIST +#undef _ +}; + +FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) +{ + switch (imm8 & 0x30) { + case _SIDD_NEGATIVE_POLARITY: + res ^= 0xffffffff; + break; + case _SIDD_MASKED_NEGATIVE_POLARITY: + res ^= (1 << lb) - 1; + break; + default: + break; + } + + return res & ((bound == 8) ? 0xFF : 0xFFFF); +} + +FORCE_INLINE int _sse2neon_clz(unsigned int x) +{ + //#if defined(_MSC_VER) && !defined(__clang__) + // unsigned long cnt = 0; + // if (_BitScanReverse(&cnt, x)) + // return 31 - cnt; + // return 32; + //#else + return x != 0 ? __builtin_clz(x) : 32; + //#endif +} + +FORCE_INLINE int _sse2neon_ctz(unsigned int x) +{ + //#if defined(_MSC_VER) && !defined(__clang__) + // unsigned long cnt = 0; + // if (_BitScanForward(&cnt, x)) + // return cnt; + // return 32; + //#else + return x != 0 ? __builtin_ctz(x) : 32; + //#endif +} + +FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) +{ + //#ifdef _MSC_VER + // unsigned long cnt; + //#if defined(SSE2NEON_HAS_BITSCAN64) + // if (_BitScanForward64(&cnt, x)) + // return (int) (cnt); + //#else + // if (_BitScanForward(&cnt, (unsigned long) (x))) + // return (int) cnt; + // if (_BitScanForward(&cnt, (unsigned long) (x >> 32))) + // return (int) (cnt + 32); + //#endif /* SSE2NEON_HAS_BITSCAN64 */ + // return 64; + //#else /* assume GNU compatible compilers */ + return x != 0 ? __builtin_ctzll(x) : 64; + //#endif +} + +#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) + +#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ + const int var = (imm & 0x01) ? 8 : 16 + +#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ + int tmp1 = la ^ (la >> 31); \ + la = tmp1 - (la >> 31); \ + int tmp2 = lb ^ (lb >> 31); \ + lb = tmp2 - (lb >> 31); \ + la = SSE2NEON_MIN(la, bound); \ + lb = SSE2NEON_MIN(lb, bound) + +// Compare all pairs of character in string a and b, +// then aggregate the result. +// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the +// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of +// string a and b. +#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ + r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) + +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? 
(31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) + +#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ + __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + if (imm8 & 0x40) { \ + if (bound == 8) { \ + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ + vld1q_u16(_sse2neon_cmpestr_mask16b)); \ + dst = vreinterpretq_m128i_u16(vbslq_u16( \ + tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + } \ + else { \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t tmp = \ + vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ + dst = vreinterpretq_m128i_u8( \ + vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + } \ + } \ + else { \ + if (bound == 16) { \ + dst = vreinterpretq_m128i_u16( \ + vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ + } \ + else { \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + } \ + } \ + return dst + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +FORCE_INLINE int _mm_cmpestra(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + int lb_cpy = lb; + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return !r2 & (lb_cpy > bound); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +FORCE_INLINE int _mm_cmpestrc(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 != 0; +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +FORCE_INLINE int _mm_cmpestri(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +FORCE_INLINE __m128i +_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +FORCE_INLINE int _mm_cmpestro(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 & 1; +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. 
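+// Illustrative example (not from the upstream sse2neon sources): with byte
+// elements the register holds 16 lanes, so an explicit length below 16 means
+// the string "ends" inside the register:
+//   _mm_cmpestrs(a, 5, b, 16, _SIDD_UBYTE_OPS);  // -> 1, since 5 <= 15
+//   _mm_cmpestrs(a, 16, b, 16, _SIDD_UBYTE_OPS); // -> 0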
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +FORCE_INLINE int _mm_cmpestrs(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void)a; + (void)b; + (void)lb; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return la <= (bound - 1); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +FORCE_INLINE int _mm_cmpestrz(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void)a; + (void)b; + (void)la; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return lb <= (bound - 1); +} + +#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ + do { \ + if (imm8 & 0x01) { \ + uint16x8_t equal_mask_##str = \ + vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 3; \ + } \ + else { \ + uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ + vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 2; \ + } \ + } while (0) + +#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ + int la, lb; \ + do { \ + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ + } while (0) + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return !r2 & (lb >= bound); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 != 0; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. 
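+// Usage sketch (illustrative, not upstream code): the implicit-length
+// variants derive each operand's length from its first zero element, so
+// scanning a 16-byte chunk of a NUL-terminated string for any vowel could
+// look like this ("haystack16" is a hypothetical pointer to 16 readable bytes):
+//   const char needle[16] = "aeiou"; // zero-padded set of characters
+//   int idx = _mm_cmpistri(_mm_loadu_si128((const __m128i *)needle),
+//                          _mm_loadu_si128((const __m128i *)haystack16),
+//                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
+//   // idx = offset of the first vowel in haystack16, or 16 if there is none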
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 & 1; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) +{ + (void)b; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int la; + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); + return la <= (bound - 1); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) +{ + (void)a; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int lb; + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); + return lb <= (bound - 1); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32ch(crc, v); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cw(crc, v); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v, and stores the result in dst. 
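+// Usage sketch (illustrative, not upstream code): CRC-32C of a buffer,
+// folding 8 bytes per step. "p" and "n" are hypothetical names and n is
+// assumed to be a multiple of 8; the init and final xor follow the usual
+// CRC-32C convention, which the instruction itself does not apply.
+//   uint64_t crc = 0xFFFFFFFFu;
+//   for (size_t i = 0; i < n; i += 8) {
+//       uint64_t chunk;
+//       memcpy(&chunk, p + i, sizeof chunk); // unaligned-safe load
+//       crc = _mm_crc32_u64(crc, chunk);
+//   }
+//   uint32_t crc32c = (uint32_t)crc ^ 0xFFFFFFFFu;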
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64 +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cd((uint32_t)crc, v); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cb(crc, v); +#else + crc ^= v; +#if defined(__ARM_FEATURE_CRYPTO) + // Adapted from: https://mary.rs/lab/crc32/ + // Barrent reduction + uint64x2_t orig = + vcombine_u64(vcreate_u64((uint64_t)(crc) << 24), vcreate_u64(0x0)); + uint64x2_t tmp = orig; + + // Polynomial P(x) of CRC32C + uint64_t p = 0x105EC76F1; + // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor + // 2^{64} / P(x) \rfloor = 0x11f91caf6 + uint64_t mu = 0x1dea713f1; + + // Multiply by mu_{64} + tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu)); + // Divide by 2^{64} (mask away the unnecessary bits) + tmp = + vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0))); + // Multiply by P(x) (shifted left by 1 for alignment reasons) + tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p)); + // Subtract original from result + tmp = veorq_u64(tmp, orig); + + // Extract the 'lower' (in bit-reflected sense) 32 bits + crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1); +#else // Fall back to the generic table lookup approach + // Adapted from: https://create.stephan-brumme.com/crc32/ + // Apply half-byte comparison algorithm for the best ratio between + // performance and lookup table. + + // The lookup table just needs to store every 16th entry + // of the standard look-up table. + static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, + 0x105ec76f, + 0x20bd8ede, + 0x30e349b1, + 0x417b1dbc, + 0x5125dad3, + 0x61c69362, + 0x7198540d, + 0x82f63b78, + 0x92a8fc17, + 0xa24bb5a6, + 0xb21572c9, + 0xc38d26c4, + 0xd3d3e1ab, + 0xe330a81a, + 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; +#endif +#endif + return crc; +} + +/* AES */ + +// Some aes emulation for non arm64. 
Can remove all this +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* clang-format off */ +#define SSE2NEON_AES_SBOX(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +#define SSE2NEON_AES_RSBOX(w) \ + { \ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ + w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ + w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ + w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ + w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ + w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ + w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ + w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \ + w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ + w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ + w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ + w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ + w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ + w(0x57), w(0xa7), 
w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ + w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ + w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ + w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ + w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ + w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ + w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ + w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ + w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ + w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ + w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ + w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ + w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ + w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ + w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ + w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ + w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ + w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ + w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ + w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ + w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ + w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ + w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ + w(0x55), w(0x21), w(0x0c), w(0x7d) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0); +static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +/* x_time function and matrix multiply function */ +//#if !defined(__aarch64__) && !defined(_M_ARM64) +//#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) +//#define SSE2NEON_MULTIPLY(x, y) \ +// (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ +// ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \ +// ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \ +// ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))))) +//#endif + +// In the absence of crypto extensions, implement aesenc using regular NEON +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// for more information. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t shift_rows[] = { + 0x0, + 0x5, + 0xa, + 0xf, + 0x4, + 0x9, + 0xe, + 0x3, + 0x8, + 0xd, + 0x2, + 0x7, + 0xc, + 0x1, + 0x6, + 0xb, + }; + static const uint8_t ror32by8[] = { + 0x1, + 0x2, + 0x3, + 0x0, + 0x5, + 0x6, + 0x7, + 0x4, + 0x9, + 0xa, + 0xb, + 0x8, + 0xd, + 0xe, + 0xf, + 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + /* shift rows */ + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + /* sub bytes */ + // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and + // look up each of the table. After each lookup, we load the next table + // which locates at the next 64-bytes. In the meantime, the index in the + // table would be smaller than it was, so the index parameters of + // `vqtbx4q_u8()` need to be added the same constant as the loaded tables. 
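+    // (Note: vqtbx4q_u8 leaves a destination lane untouched when its index is
+    // outside 0..63, so after subtracting 0x40/0x80/0xc0 only the bytes that
+    // fall into that 64-byte quarter of the S-box are overwritten; the other
+    // lanes keep the value produced by the earlier lookups.)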
+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))' + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + /* mix columns */ + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + /* add round key */ + return vreinterpretq_m128i_u8(w) ^ RoundKey; +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t inv_shift_rows[] = { + 0x0, + 0xd, + 0xa, + 0x7, + 0x4, + 0x1, + 0xe, + 0xb, + 0x8, + 0x5, + 0x2, + 0xf, + 0xc, + 0x9, + 0x6, + 0x3, + }; + static const uint8_t ror32by8[] = { + 0x1, + 0x2, + 0x3, + 0x0, + 0x5, + 0x6, + 0x7, + 0x4, + 0x9, + 0xa, + 0xb, + 0x8, + 0xd, + 0xe, + 0xf, + 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // inverse mix columns + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t)(((int8x16_t)w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t)vrev32q_u16((uint16x8_t)w); + + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & + 0x1b); // multiplying 'v' by 2 in GF(2^8) + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t shift_rows[] = { + 0x0, + 0x5, + 0xa, + 0xf, + 0x4, + 0x9, + 0xe, + 0x3, + 0x8, + 0xd, + 0x2, + 0x7, + 0xc, + 0x1, + 0x6, + 0xb, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
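+// Usage sketch (illustrative, not upstream code): decrypting one AES-128
+// block with the *encryption* round keys rk[0..10] (a hypothetical array,
+// expanded elsewhere) walks the keys in reverse, passes the middle keys
+// through _mm_aesimc_si128, and finishes with this last-round intrinsic:
+//   __m128i s = _mm_xor_si128(cipher, rk[10]);
+//   for (int i = 9; i > 0; --i)
+//       s = _mm_aesdec_si128(s, _mm_aesimc_si128(rk[i]));
+//   s = _mm_aesdeclast_si128(s, rk[0]);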
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t inv_shift_rows[] = { + 0x0, + 0xd, + 0xa, + 0x7, + 0x4, + 0x1, + 0xe, + 0xb, + 0x8, + 0x5, + 0x2, + 0xf, + 0xc, + 0x9, + 0x6, + 0x3, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ + static const uint8_t ror32by8[] = { + 0x1, + 0x2, + 0x3, + 0x0, + 0x5, + 0x6, + 0x7, + 0x4, + 0x9, + 0xa, + 0xb, + 0x8, + 0xd, + 0xe, + 0xf, + 0xc, + }; + uint8x16_t v = vreinterpretq_u8_m128i(a); + uint8x16_t w; + + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t)(((int8x16_t)w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t)vrev32q_u16((uint16x8_t)w); + + // multiplying 'v' by 2 in GF(2^8) + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + return vreinterpretq_m128i_u8(w); +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t v_u32 = vreinterpretq_u32_u8(v); + uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); + uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); +} +#undef SSE2NEON_AES_SBOX +#undef SSE2NEON_AES_RSBOX + +//#if defined(__aarch64__) +#undef SSE2NEON_XT +#undef SSE2NEON_MULTIPLY +//#endif + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. 
See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8( + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ + return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst." +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + //#if !defined(_MSC_VER) || defined(__clang__) + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned)rcon, 0, (unsigned)rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); + //#else + // // We have to do this hack because MSVC is strictly adhering to the CPP + // // standard, in particular C++03 8.5.1 sub-section 15, which states that + // // unions must be initialized by their first member type. 
+ // + // // As per the Windows ARM64 ABI, it is always little endian, so this works + // __n128 dest{ + // ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | + // ((uint64_t) u8.n128_u8[0xE] << 16) | + // ((uint64_t) u8.n128_u8[0xB] << 24) | + // ((uint64_t) u8.n128_u8[0x1] << 32) | + // ((uint64_t) u8.n128_u8[0xE] << 40) | + // ((uint64_t) u8.n128_u8[0xB] << 48) | + // ((uint64_t) u8.n128_u8[0x4] << 56), + // ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | + // ((uint64_t) u8.n128_u8[0x6] << 16) | + // ((uint64_t) u8.n128_u8[0x3] << 24) | + // ((uint64_t) u8.n128_u8[0x9] << 32) | + // ((uint64_t) u8.n128_u8[0x6] << 40) | + // ((uint64_t) u8.n128_u8[0x3] << 48) | + // ((uint64_t) u8.n128_u8[0xC] << 56)}; + // + // dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; + // dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; + // + // return dest; + //#endif +} +#endif + +/* Others */ + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +//#elif defined(_MSC_VER) +// return _CountOneBits(a); +#else + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +//#elif defined(_MSC_VER) +// return _CountOneBits64(a); +#else + return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +} + +FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode( + unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; + + _sse2neon_set_fpcr(r.value); +} + +// Return the current 64-bit value of the processor's time-stamp counter. 
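+// Usage sketch (illustrative, not upstream code): on Arm this reads the
+// virtual counter (cntvct_el0), which ticks at the generic-timer frequency
+// rather than the CPU clock, but deltas are still useful for coarse timing
+// ("do_work" is a hypothetical workload):
+//   uint64_t t0 = _rdtsc();
+//   do_work();
+//   uint64_t ticks = _rdtsc() - t0;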
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc +FORCE_INLINE uint64_t _rdtsc(void) +{ + uint64_t val; + + /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the + * system counter is at least 56 bits wide; from Armv8.6, the counter + * must be 64 bits wide. So the system counter could be less than 64 + * bits wide and it is attributed with the flag 'cap_user_time_short' + * is true. + */ + //#if defined(_MSC_VER) && !defined(__clang__) + // val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); + //#else + __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); + //#endif + + return val; +} + +//#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#pragma pop_macro("FORCE_INLINE_OPTNONE") +//#endif + +//#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +//#endif + +#endif diff --git a/libkram/vectormath/sse2neon.h b/libkram/vectormath/sse2neon.h new file mode 100644 index 00000000..79a6c38e --- /dev/null +++ b/libkram/vectormath/sse2neon.h @@ -0,0 +1,7969 @@ +#pragma once +// clang-format off + +#ifndef SSE2NEON_H +#define SSE2NEON_H + +//#include +//#if TARGET_OS_MACCATALYST +//#warning - this code won't compile for iOS MacCatalyst, switch target. +//#endif + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2024 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel +// Jonathan Hue +// Cuda Chen +// Aymen Qader +// Anthony Roberts + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. 
would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +//#ifndef SSE2NEON_PRECISE_MINMAX +//#define SSE2NEON_PRECISE_MINMAX (0) +//#endif +///* _mm_rcp_ps */ +//#ifndef SSE2NEON_PRECISE_DIV +//#define SSE2NEON_PRECISE_DIV (0) +//#endif +///* _mm_sqrt_ps and _mm_rsqrt_ps */ +//#ifndef SSE2NEON_PRECISE_SQRT +//#define SSE2NEON_PRECISE_SQRT (0) +//#endif +///* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. + */ +//#ifndef SSE2NEON_INCLUDE_WINDOWS_H +//#define SSE2NEON_INCLUDE_WINDOWS_H (0) +//#endif + +/* compiler specific definitions */ +//#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +//#elif defined(_MSC_VER) +//#if _MSVC_TRADITIONAL +//#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +//#endif +//#ifndef FORCE_INLINE +//#define FORCE_INLINE static inline +//#endif +//#ifndef ALIGN_STRUCT +//#define ALIGN_STRUCT(x) __declspec(align(x)) +//#endif +//#define _sse2neon_likely(x) (x) +//#define _sse2neon_unlikely(x) (x) +//#else +//#pragma message("Macro name collisions may happen with unsupported compilers.") +//#endif + + +//#if defined(__GNUC__) && !defined(__clang__) +//#pragma push_macro("FORCE_INLINE_OPTNONE") +//#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0"))) +//#elif defined(__clang__) +#pragma push_macro("FORCE_INLINE_OPTNONE") +#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone)) +//#else +//#define FORCE_INLINE_OPTNONE FORCE_INLINE +//#endif + +//#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +//#warning "GCC versions earlier than 10 are not supported." +//#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2neon_const static const +#else +#define _sse2neon_const const +#endif + +#include +#include +#include + +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} + +//#if defined(_WIN32) +///* Definitions for _mm_{malloc,free} are provided by +// * from both MinGW-w64 and MSVC. 
+// */ +//#define SSE2NEON_ALLOC_DEFINED +//#endif +// +///* If using MSVC */ +//#ifdef _MSC_VER +//#include +//#if SSE2NEON_INCLUDE_WINDOWS_H +//#include +//#include +//#endif + +#if !defined(__cplusplus) +#error SSE2NEON only supports C++ compilation with this compiler +#endif + +//#ifdef SSE2NEON_ALLOC_DEFINED +//#include +//#endif + +//#if (defined(_M_AMD64) || defined(__x86_64__)) || \ +// (defined(_M_ARM64) || defined(__arm64__)) +//#define SSE2NEON_HAS_BITSCAN64 +//#endif +//#endif + +//#if defined(__GNUC__) || defined(__clang__) +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +//#else +//#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +//#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +//#define _sse2neon_define2(type, a, b, body) \ +// [](type _a, type _b) { body }((a), (b)) +//#define _sse2neon_return(ret) return ret +//#endif + +#define _sse2neon_init(...) \ + { \ + __VA_ARGS__ \ + } + +/* Compiler barrier */ +//#if defined(_MSC_VER) && !defined(__clang__) +//#define SSE2NEON_BARRIER() _ReadWriteBarrier() +//#else +#define SSE2NEON_BARRIER() \ + do { \ + __asm__ __volatile__("" ::: "memory"); \ + (void) 0; \ + } while (0) +//#endif + +/* Memory barriers + * __atomic_thread_fence does not include a compiler barrier; instead, + * the barrier is part of __atomic_load/__atomic_store's "volatile-like" + * semantics. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#include +#endif + +FORCE_INLINE void _sse2neon_smp_mb(void) +{ + SSE2NEON_BARRIER(); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__GNUC__) || defined(__clang__) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else /* MSVC */ + __dmb(_ARM64_BARRIER_ISH); +#endif +} + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +/* + #if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#elif __ARM_ARCH == 8 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error \ + "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." +#endif +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#endif +#else +#error \ + "Unsupported target. 
Must be either ARMv7-A+NEON or ARMv8-A \ +(you could try setting target explicitly with -march or -mcpu)" +#endif +#endif +*/ + +#include +//#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +//#if defined __has_include && __has_include() +//#include +//#endif +//#endif + +/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD + * and other Arm microarchitectures use. + * From sysctl -a on Apple M1: + * hw.cachelinesize: 128 + */ +#if defined(__APPLE__) +#define SSE2NEON_CACHELINE_SIZE 128 +#else +#define SSE2NEON_CACHELINE_SIZE 64 +#endif + +/* Rounding functions require either Aarch64 instructions or libm fallback */ +//#if !defined(__aarch64__) && !defined(_M_ARM64) +//#include +//#endif + +/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only + * or even not accessible in user mode. + * To write or access to these registers in user mode, + * we have to perform syscall instead. + */ +//#if (!defined(__aarch64__) && !defined(_M_ARM64)) +//#include +//#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +//#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +///* Compatibility with gcc <= 9 */ +//#if defined(__GNUC__) && (__GNUC__ <= 9) +//#define __has_builtin(x) HAS##x +//#define HAS__builtin_popcount 1 +//#define HAS__builtin_popcountll 1 +// +//// __builtin_shuffle introduced in GCC 4.7.0 +//#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) +//#define HAS__builtin_shuffle 1 +//#else +//#define HAS__builtin_shuffle 0 +//#endif +// +//#define HAS__builtin_shufflevector 0 +//#define HAS__builtin_nontemporal_store 0 +//#else +//#define __has_builtin(x) 0 +//#endif +//#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +//#if __has_builtin(__builtin_shufflevector) +#define _sse2neon_shuffle(type, a, b, ...) \ + __builtin_shufflevector(a, b, __VA_ARGS__) +//#elif __has_builtin(__builtin_shuffle) +//#define _sse2neon_shuffle(type, a, b, ...) \ +// __extension__({ \ +// type tmp = {__VA_ARGS__}; \ +// __builtin_shuffle(a, b, tmp); \ +// }) +//#endif + +#ifdef _sse2neon_shuffle +#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__) +#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__) +#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__) +#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__) +#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__) +#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__) +#endif + +/* Rounding mode macros. 
*/ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +//#if defined(__aarch64__) || defined(_M_ARM64) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +//#else +//typedef float32x4_t __m128d; +//#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// Some intrinsics operate on unaligned data types. 
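+// (These are likely what the unaligned scalar load/store helpers such as
+// _mm_loadu_si16/_mm_loadu_si32/_mm_loadu_si64 cast through, so the compiler
+// does not assume natural alignment for those accesses.)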
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) 
vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) + + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128(void);
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+
+// emulate vaddvq u8 variant
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
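+ *
+ * As another example (illustrative only), _mm_add_epi32: add names the
+ * operation and epi32 says the arguments are vectors of packed signed 32-bit
+ * integers, so
+ *   __m128i sum = _mm_add_epi32(_mm_set1_epi32(1), _mm_set1_epi32(2));
+ * produces four 32-bit lanes, each holding 3.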
+ * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ +}; + +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; + uint32_t res3; +} fpcr_bitfield; + +/* +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = 
vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); // TODO: use vzip ? + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} +*/ + +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. 
If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if (defined(_M_ARM64) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +//#if defined(_MSC_VER) && !defined(__clang__) +// __n64 a1 = {a}, b1 = {b}; +// return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +//#else + return vreinterpretq_u64_p128(vmull_p64(a, b)); +//#endif +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. +// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. 
+ uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
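+// As a rough C-style sketch of the lane movement (illustrative only):
+//   dst[0] = a[1]; dst[1] = a[0]; dst[2] = a[3]; dst[3] = a[2];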
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. 
+// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +#define _mm_shuffle_ps_default(a, b, imm) \ + vreinterpretq_m128_f32(vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 +#define _mm_shufflelo_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 +#define _mm_shufflehi_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +/* MMX */ + +//_mm_empty is a no-op on arm +FORCE_INLINE void _mm_empty(void) {} + +/* SSE */ + +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of . 
+ return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
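+// For example (illustrative, using the usual SSE all-ones/all-zeros mask
+// convention): comparing {1.0f, 5.0f, 3.0f, 0.0f} with {2.0f, 2.0f, 2.0f, 2.0f}
+// yields the 32-bit lanes {0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}.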
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. 
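+// For instance (illustrative): with a = {1.0f, NAN, 2.0f, NAN} and
+// b = {1.0f, 1.0f, NAN, NAN}, only lane 0 has two non-NaN inputs, so only
+// lane 0 of the result mask is all ones.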
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). 
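+// For example (illustrative): _mm_comigt_ss(_mm_set_ps1(3.0f), _mm_set_ps1(1.0f))
+// compares only the lowest lanes (3.0f > 1.0f) and returns 1.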
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); + +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. 
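+// For example (illustrative): the four signed 16-bit values {1, -2, 3, -4}
+// become the floats {1.0f, -2.0f, 3.0f, -4.0f}.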
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + return vreinterpret_m64_s8(vqmovn_s16( + vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
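+// For example (illustrative): with truncation 1.7f becomes 1 and -1.7f becomes
+// -1, whereas the non-truncating _mm_cvtps_pi32 would round them to 2 and -2
+// under the default round-to-nearest mode.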
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Divide the lower single-precision (32-bit) floating-point element in a by the +// lower single-precision (32-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper 3 packed elements from a to +// the upper elements of dst. +// Warning: ARMv7-A does not produce the same result compared to Intel and not +// IEEE-compliant. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Free aligned memory that was allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +//#if !defined(SSE2NEON_ALLOC_DEFINED) +//FORCE_INLINE void _mm_free(void *addr) +//{ +// free(addr); +//} +//#endif + +FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) +{ + uint64_t value; +//#if defined(_MSC_VER) && !defined(__clang__) +// value = _ReadStatusReg(ARM64_FPCR); +//#else + __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ +//#endif + return value; +} + +FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) +{ +//#if defined(_MSC_VER) && !defined(__clang__) +// _WriteStatusReg(ARM64_FPCR, value); +//#else + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ +//#endif +} + +// Macro: Get the flush zero bits from the MXCSR control and status register. 
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; +} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. +// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + if (r.field.bit22) { + return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; + } else { + return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + } +} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))) + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. 
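+// For example (illustrative): if a = {a0, a1, a2, a3} and p points at {x, y},
+// the result is {a0, a1, x, y}.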
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); +} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +//#if !defined(SSE2NEON_ALLOC_DEFINED) +//FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +//{ +// void *ptr; +// if (align == 1) +// return malloc(size); +// if (align == 2 || (sizeof(void *) == 8 && align == 4)) +// align = sizeof(void *); +// if (!posix_memalign(&ptr, align, size)) +// return ptr; +// return NULL; +//} +//#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. 
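+// For example (illustrative): with a mask whose byte 0 is 0x80 and whose other
+// bytes are 0x00, only byte 0 of a is written to mem_addr; the remaining seven
+// bytes in memory keep their previous values.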
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the upper 2 single-precision (32-bit) floating-point elements from b to +// the lower 2 elements of dst, and copy the upper 2 elements from a to the +// upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps +FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u64( + vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a))); +} + +// Move the lower 2 single-precision (32-bit) floating-point elements from b to +// the upper 2 elements of dst, and copy the lower 2 elements from a to the +// lower 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) +{ + uint8x8_t input = vreinterpret_u8_m64(a); + static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, vld1_s8(shift))); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in a. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); + static const int32_t shift[4] = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift))); +} + +// Multiply packed single-precision (32-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. 
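+
+// A minimal clamp sketch built from the min/max intrinsics above, assuming
+// `x`, `lo`, and `hi` are __m128 values; as noted in the comments, NaN and
+// signed-zero handling differs from strict SSE/IEEE 754 behavior:
+//
+//   __m128 clamped = _mm_min_ps(_mm_max_ps(x, lo), hi);  // per-lane clamp to [lo, hi]
+//   int signs = _mm_movemask_ps(clamped);                // bit i = sign bit of lane i
+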
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Fetch the line of data from memory that contains address p to a location in +// the cache hierarchy specified by the locality hint i. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch +FORCE_INLINE void _mm_prefetch(char const *p, int i) +{ + (void) i; +//#if defined(_MSC_VER) && !defined(__clang__) +// switch (i) { +// case _MM_HINT_NTA: +// __prefetch2(p, 1); +// break; +// case _MM_HINT_T0: +// __prefetch2(p, 0); +// break; +// case _MM_HINT_T1: +// __prefetch2(p, 2); +// break; +// case _MM_HINT_T2: +// __prefetch2(p, 4); +// break; +// } +//#else + switch (i) { + case _MM_HINT_NTA: + __builtin_prefetch(p, 0, 0); + break; + case _MM_HINT_T0: + __builtin_prefetch(p, 0, 3); + break; + case _MM_HINT_T1: + __builtin_prefetch(p, 0, 2); + break; + case _MM_HINT_T2: + __builtin_prefetch(p, 0, 1); + break; + } +//#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw +#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + return _mm_div_ps(_mm_set1_ps(1.0f), in); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +} + +// Compute the square root of the lower single-precision (32-bit) floating-point +// element in a, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + return _mm_rcp_ps(_mm_sqrt_ps(in)); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint64x1_t t = vpaddl_u32(vpaddl_u16( + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); + return vreinterpret_m64_u16( + vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); +} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. 
The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE +FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; + + _sse2neon_set_fpcr(r.value); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + + _sse2neon_set_fpcr(r.value); +} + +// Set the MXCSR control and status register with the value in unsigned 32-bit +// integer a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +FORCE_INLINE unsigned int _mm_getcsr(void) +{ + return _MM_GET_ROUNDING_MODE(); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Return vector of type __m128 with all elements set to zero. 
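+
+// A minimal sketch of saving, changing, and restoring the rounding mode with
+// the macros above (only the rounding bits of MXCSR are emulated here):
+//
+//   unsigned int saved = _MM_GET_ROUNDING_MODE();
+//   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+//   // ... conversions such as _mm_cvtps_epi32 now truncate ...
+//   _MM_SET_ROUNDING_MODE(saved);
+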
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +//#ifdef _sse2neon_shuffle +#define _mm_shuffle_pi16(a, imm) \ + vreinterpret_m64_s16(vshuffle_s16( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +//#else +//#define _mm_shuffle_pi16(a, imm) \ +// _sse2neon_define1( \ +// __m64, a, int16x4_t ret; \ +// ret = vmov_n_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ +// ret = vset_lane_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ +// 1); \ +// ret = vset_lane_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ +// 2); \ +// ret = vset_lane_s16( \ +// vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ +// 3); \ +// _sse2neon_return(vreinterpret_m64_s16(ret));) +//#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +FORCE_INLINE void _mm_sfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +FORCE_INLINE void _mm_mfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence +FORCE_INLINE void _mm_lfence(void) +{ + _sse2neon_smp_mb(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +//#ifdef _sse2neon_shuffle +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = \ + vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +//#else // generic +//#define _mm_shuffle_ps(a, b, imm) \ +// _sse2neon_define2( \ +// __m128, a, b, __m128 ret; switch (imm) { \ +// case _MM_SHUFFLE(1, 0, 3, 2): \ +// ret = _mm_shuffle_ps_1032(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 3, 0, 1): \ +// ret = _mm_shuffle_ps_2301(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 3, 2, 1): \ +// ret = _mm_shuffle_ps_0321(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 1, 0, 3): \ +// ret = _mm_shuffle_ps_2103(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 1, 0): \ +// ret = _mm_movelh_ps(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 0, 1): \ +// ret = _mm_shuffle_ps_1001(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 1, 0, 1): \ +// ret = _mm_shuffle_ps_0101(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(3, 2, 1, 0): \ +// ret = _mm_shuffle_ps_3210(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 0, 1, 1): \ +// ret = _mm_shuffle_ps_0011(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(0, 0, 2, 2): \ +// ret = _mm_shuffle_ps_0022(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 2, 0, 0): \ +// ret = _mm_shuffle_ps_2200(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(3, 2, 0, 2): \ +// ret = _mm_shuffle_ps_3202(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(3, 2, 3, 2): \ +// ret = _mm_movehl_ps(_b, _a); \ +// break; \ +// case _MM_SHUFFLE(1, 1, 3, 3): \ +// ret = _mm_shuffle_ps_1133(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 0, 1, 0): \ +// ret = _mm_shuffle_ps_2010(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 0, 0, 1): \ +// ret = _mm_shuffle_ps_2001(_a, _b); \ +// break; \ +// case _MM_SHUFFLE(2, 0, 3, 2): \ +// ret = _mm_shuffle_ps_2032(_a, _b); \ +// break; \ +// default: \ +// ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ +// break; \ +// } _sse2neon_return(ret);) +//#endif + + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. 
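+
+// A minimal shuffle sketch, assuming `a` and `b` are __m128 values and using
+// the standard _MM_SHUFFLE(z, y, x, w) selector macro defined earlier in this
+// header:
+//
+//   __m128 splat = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));  // a0 in all lanes
+//   __m128 mix   = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));  // {a0, a1, b2, b3}
+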
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store the upper 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Store the lower 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores 16-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 +FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) +{ + vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); +} + +// Stores 64-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 +FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) +{ + vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Subtract packed single-precision (32-bit) floating-point elements in b from +// packed single-precision (32-bit) floating-point elements in a, and store the +// results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +/* don't need these calls + +// Return vector of type __m128i with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} +*/ + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. 
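+
+// A minimal sketch of transposing a row-major 4x4 float matrix in place with
+// the macro above, assuming `m` points to 16 floats:
+//
+//   __m128 r0 = _mm_loadu_ps(m + 0);
+//   __m128 r1 = _mm_loadu_ps(m + 4);
+//   __m128 r2 = _mm_loadu_ps(m + 8);
+//   __m128 r3 = _mm_loadu_ps(m + 12);
+//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0..r3 now hold the columns
+//   _mm_storeu_ps(m + 0, r0);
+//   _mm_storeu_ps(m + 4, r1);
+//   _mm_storeu_ps(m + 8, r2);
+//   _mm_storeu_ps(m + 12, r3);
+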
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_add_pd(a, b)); +} + +// Add 64-bit integers a and b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. 
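+
+// A minimal sketch of the saturating adds above, e.g. brightening packed 8-bit
+// pixels without wrap-around, assuming `px` and `delta` hold unsigned 8-bit
+// data and `a16`, `b16` hold signed 16-bit data:
+//
+//   __m128i brightened = _mm_adds_epu8(px, delta);  // each byte clamps at 255
+//   __m128i sum16      = _mm_adds_epi16(a16, b16);  // clamps to [-32768, 32767]
+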
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+}
+
+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+ */ +#if defined(__APPLE__) + sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE); +#elif defined(__GNUC__) || defined(__clang__) + uintptr_t ptr = (uintptr_t) p; + __builtin___clear_cache((char *) ptr, + (char *) ptr + SSE2NEON_CACHELINE_SIZE); +#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H + FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE); +#endif +} + +// Compare packed 16-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 8-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd +FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd +FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd +FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmple_pd(a, b)); +} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd +FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
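+
+// A minimal sketch using the ordered compare above to zero out NaN lanes,
+// assuming `x` is a __m128d (a lane is "ordered" only when neither input is
+// NaN, so comparing x with itself flags its own NaNs):
+//
+//   __m128d not_nan = _mm_cmpord_pd(x, x);   // all-ones mask where x is not NaN
+//   __m128d cleaned = _mm_and_pd(x, not_nan);
+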
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) +{ +// vrnd32xq_f64 not supported on clang +#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) + float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); + int64x2_t integers = vcvtq_s64_f64(rounded); + return vreinterpretq_m128i_s32( + vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__ARM_FEATURE_FRINT) + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); +#else + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower 32-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd +FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) +{ + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Copy 32-bit integer a to the lower elements of dst, and zero the upper +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd +FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) +{ + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128 +#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd +#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ + return vreinterpretq_m128d_f64(vld1q_f64(p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +} + +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +} + +// Load 64-bit integer from memory into the first element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+}
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128 +FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) +{ + int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x16_t masked = + vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), + vreinterpretq_s8_m128(b)); + vst1q_s8((int8_t *) mem_addr, masked); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_max_pd(a, b)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_min_pd(a, b)); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. 
\| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) +{ + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return (int) (vgetq_lane_u64(high_bits, 0) | + (vgetq_lane_u64(high_bits, 1) << 1)); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Pause the processor. This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown an 'isb' is a
+// reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause(void)
+{
+//#if defined(_MSC_VER) && !defined(__clang__)
+//    __isb(_ARM64_BARRIER_SY);
+//#else
+    __asm__ __volatile__("isb\n");
+//#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ + return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); +} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +//#if defined(_sse2neon_shuffle) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = \ + vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +//#else // generic +//#define _mm_shuffle_epi32(a, imm) \ +// _sse2neon_define1( \ +// __m128i, a, __m128i ret; switch (imm) { \ +// case _MM_SHUFFLE(1, 0, 3, 2): \ +// ret = _mm_shuffle_epi_1032(_a); \ +// break; \ +// case _MM_SHUFFLE(2, 3, 0, 1): \ +// ret = _mm_shuffle_epi_2301(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 3, 2, 1): \ +// ret = _mm_shuffle_epi_0321(_a); \ +// break; \ +// case _MM_SHUFFLE(2, 1, 0, 3): \ +// ret = _mm_shuffle_epi_2103(_a); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 1, 0): \ +// ret = _mm_shuffle_epi_1010(_a); \ +// break; \ +// case _MM_SHUFFLE(1, 0, 0, 1): \ +// ret = _mm_shuffle_epi_1001(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 1, 0, 1): \ +// ret = _mm_shuffle_epi_0101(_a); \ +// break; \ +// case _MM_SHUFFLE(2, 2, 1, 1): \ +// ret = _mm_shuffle_epi_2211(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 1, 2, 2): \ +// ret = _mm_shuffle_epi_0122(_a); \ +// break; \ +// case _MM_SHUFFLE(3, 3, 3, 2): \ +// ret = _mm_shuffle_epi_3332(_a); \ +// break; \ +// case _MM_SHUFFLE(0, 0, 0, 0): \ +// ret = _mm_shuffle_epi32_splat(_a, 0); \ +// break; \ +// case _MM_SHUFFLE(1, 1, 1, 1): \ +// ret = _mm_shuffle_epi32_splat(_a, 1); \ +// break; \ +// case _MM_SHUFFLE(2, 2, 2, 2): \ +// ret = _mm_shuffle_epi32_splat(_a, 2); \ +// break; \ +// case _MM_SHUFFLE(3, 3, 3, 3): \ +// ret = _mm_shuffle_epi32_splat(_a, 3); \ +// break; \ +// default: \ +// ret = _mm_shuffle_epi32_default(_a, (imm)); \ +// break; \ +// } _sse2neon_return(ret);) +//#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +//#ifdef _sse2neon_shuffle +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64( \ + vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +//#else +//#define _mm_shuffle_pd(a, b, imm8) \ +// _mm_castsi128_pd(_mm_set_epi64x( \ +// vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ +// vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +//#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +//#if defined(_sse2neon_shuffle) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = \ + vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +//#else // generic +//#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +//#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +//#if defined(_sse2neon_shuffle) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = vshuffleq_s16( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +//#else // generic +//#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +//#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sqrt_pd(b)); +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16( + vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32( + vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + (imm > 15 ? 0 : imm)); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. 
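+// Note: when __builtin_nontemporal_store is unavailable, the fallback below is
+// a plain vst1q_f64, so the non-temporal hint is best-effort only.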
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (__m128d *) p); +#else + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#endif +} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +/* don't need this call +// Return vector of type __m128d with undefined elements. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) && !defined(__clang__) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} +*/ + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. 
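+// e.g. with a = {a0, a1} and b = {b0, b1} (low lane listed first),
+// dst = {a0, b0}.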
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), + vreinterpretq_f32_m128(mask), + vreinterpretq_f32_m128(b))); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +} + +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. 
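+// e.g. dst = {a0+a1, a2+a3, b0+b1, b2+b3}, with lane 0 listed first.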
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) +{ + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64( + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32( + vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ + return vreinterpretq_m128_f32( + vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ + return vreinterpretq_m128_f32( + vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. 
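+// Note: as with x86 PABSD, INT32_MIN maps to itself (0x80000000); the
+// non-saturating vabsq_s32 below matches that behavior.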
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + uint8x16_t _a = vreinterpretq_u8_m128i(a); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) \ + ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ + else \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \ + ret; \ + }) + +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) ret = \ + _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ + else ret = \ + vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ + _sse2neon_return(ret);) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. 
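+// e.g. _mm_alignr_pi8(a, b, 3) returns bytes {b3..b7, a0..a2}, i.e. the
+// concatenation a:b shifted right by 3 bytes.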
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low; \ + uint8x8_t tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(_a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(_b); \ + tmp_high = vreinterpret_u8_m64(_a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } _sse2neon_return(ret);) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. 
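+// e.g. dst = {a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7}.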
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_m128i_s16( + vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. 
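+// e.g. for byte pair a = {2, 255} (unsigned) and b = {3, -1} (signed), the
+// corresponding 16-bit output lane is 2*3 + 255*(-1) = -249.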
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. 
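+// A control byte with its high bit set zeroes that output lane; otherwise the
+// low 4 bits index into a. The 0x8F mask below preserves exactly those bits,
+// and vqtbl1q_s8 returns 0 for out-of-range indices, matching PSHUFB.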
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 +FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) +{ + const int8x8_t controlMask = + vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); + int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); + return vreinterpret_m64_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 
0xFF : 0 + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); + + // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); + + // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); + + // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); + + // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. 
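+// Bit i of imm8 selects 16-bit lane i from b (otherwise from a); e.g.
+// imm8 = 0x0F takes the low four lanes from b and the high four from a.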
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, \ + const uint16_t _mask[8] = \ + _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ + uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + _sse2neon_define2( \ + __m128d, a, b, \ + const uint64_t _mask[2] = \ + _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t __a = vreinterpretq_u64_m128d(_a); \ + uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
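+// Only the sign bit of each mask lane matters, which is why the code below
+// arithmetic-shifts by 31 to build an all-ones/all-zeros lane mask for vbslq.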
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_ceil_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_ceil_ps(b)); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +} + +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd +FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) +{ + // Generate mask value from constant immediate bit value + const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; + const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; +#if !SSE2NEON_PRECISE_DP + const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; + const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; +#endif + // Conditional multiplication +#if !SSE2NEON_PRECISE_DP + __m128d mul = _mm_mul_pd(a, b); + const __m128d mulMask = + _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); + __m128d tmp = _mm_and_pd(mul, mulMask); +#else + double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) + : 0; + double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) + : 0; + __m128d tmp = _mm_set_pd(d1, d0); +#endif + // Sum the products + double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); + + // Conditionally store the sum + const __m128d sumMask = + _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); + __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); + return res; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. 
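+// e.g. imm = 0xFF multiplies all four lane pairs, sums the products, and
+// broadcasts the sum to every lane; imm = 0x71 sums the products of lanes 0-2
+// and writes the result to lane 0 only, zeroing the rest.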
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ + float32x4_t elementwise_prod = _mm_mul_ps(a, b); + + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } + + if ((imm & 0x0F) == 0x0F) { + if (!(imm & (1 << 4))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0); + if (!(imm & (1 << 5))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1); + if (!(imm & (1 << 6))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2); + if (!(imm & (1 << 7))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3); + + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } + + float s = 0.0f; + + if (imm & (1 << 4)) + s += vgetq_lane_f32(elementwise_prod, 0); + if (imm & (1 << 5)) + s += vgetq_lane_f32(elementwise_prod, 1); + if (imm & (1 << 6)) + s += vgetq_lane_f32(elementwise_prod, 2); + if (imm & (1 << 7)) + s += vgetq_lane_f32(elementwise_prod, 3); + + const float32_t res[4] = { + (imm & 0x1) ? s : 0.0f, + (imm & 0x2) ? s : 0.0f, + (imm & 0x4) ? s : 0.0f, + (imm & 0x8) ? s : 0.0f, + }; + return vreinterpretq_m128_f32(vld1q_f32(res)); +} + +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, +// __constrange(0,16) int imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) +{ + return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0); \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + _sse2neon_return(vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) + +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
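+// For example, with a = {7, 3, 9, 3, 8, 6, 5, 4} the result holds the minimum 3 in
+// lane 0 and the index 1 of its first occurrence in lane 1; the remaining lanes are zero.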
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value + min = vminvq_u16(vreinterpretq_u16_m128i(a)); + + // Get the index of the minimum value + static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16x8_t minv = vdupq_n_u16(min); + uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a)); + idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq)); + + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +//if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +//#elif defined(_MSC_VER) +// __assume(0); +//#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +//#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +//#elif defined(_MSC_VER) +// __assume(0); +//#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); + uint8x16_t _a_1 = vextq_u8(_a, _a, 1); + c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); + uint8x16_t _a_2 = vextq_u8(_a, _a, 2); + c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); + uint8x16_t _a_3 = vextq_u8(_a, _a, 3); + c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); + + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. 
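+// That is, only 32-bit lanes 0 and 2 of each operand participate:
+// dst = { (int64_t)a0 * b0, (int64_t)a2 * b2 }.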
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding) +{ + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } + +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. +// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding) +{ + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
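+// For example, a lower element of -1.5 rounded with
+// (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) becomes -2.0 (ties round to even).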
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. 
+ // For comparison purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; + +/* specify the source data format */ +#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ +#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ +#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ +#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ + +/* specify the comparison operation */ +#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ +#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ +#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ +#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ + +/* specify the polarity */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ +#define _SIDD_MASKED_NEGATIVE_POLARITY \ + 0x30 /* negate results only before end of string */ + +/* specify the output selection in _mm_cmpXstri */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* specify the output selection in _mm_cmpXstrm */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Pattern Matching for C macros. 
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ +#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) + +#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) +/* run the 2nd parameter */ +#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define SSE2NEON_IIF_1(t, ...) t + +#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) +#define SSE2NEON_COMPL_0 1 +#define SSE2NEON_COMPL_1 0 + +#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) +#define SSE2NEON_DEC_1 0 +#define SSE2NEON_DEC_2 1 +#define SSE2NEON_DEC_3 2 +#define SSE2NEON_DEC_4 3 +#define SSE2NEON_DEC_5 4 +#define SSE2NEON_DEC_6 5 +#define SSE2NEON_DEC_7 6 +#define SSE2NEON_DEC_8 7 +#define SSE2NEON_DEC_9 8 +#define SSE2NEON_DEC_10 9 +#define SSE2NEON_DEC_11 10 +#define SSE2NEON_DEC_12 11 +#define SSE2NEON_DEC_13 12 +#define SSE2NEON_DEC_14 13 +#define SSE2NEON_DEC_15 14 +#define SSE2NEON_DEC_16 15 + +/* detection */ +#define SSE2NEON_CHECK_N(x, n, ...) n +#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) +#define SSE2NEON_PROBE(x) x, 1, + +#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) +#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) + +#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) +#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) + +#define SSE2NEON_EAT(...) +#define SSE2NEON_EXPAND(...) __VA_ARGS__ +#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) + +/* recursion */ +/* deferred expression */ +#define SSE2NEON_EMPTY() +#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() +#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() +#define SSE2NEON_EXPAND(...) __VA_ARGS__ + +#define SSE2NEON_EVAL(...) \ + SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) +#define SSE2NEON_EVAL1(...) \ + SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) +#define SSE2NEON_EVAL2(...) \ + SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) +#define SSE2NEON_EVAL3(...) __VA_ARGS__ + +#define SSE2NEON_REPEAT(count, macro, ...) 
\ + SSE2NEON_WHEN(count) \ + (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ + SSE2NEON_DEC(count), macro, \ + __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ + __VA_ARGS__)) +#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT + +#define SSE2NEON_SIZE_OF_byte 8 +#define SSE2NEON_NUMBER_OF_LANES_byte 16 +#define SSE2NEON_SIZE_OF_word 16 +#define SSE2NEON_NUMBER_OF_LANES_word 8 + +#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ + mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ + vreinterpretq_##type##_m128i(a))); + +#define SSE2NEON_FILL_LANE(i, type) \ + vec_b[i] = \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); + +#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ + number_of_lanes, byte_or_word) \ + do { \ + SSE2NEON_CAT( \ + data_type_prefix, \ + SSE2NEON_CAT(size, \ + SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ + vec_b[number_of_lanes]; \ + __m128i mask = SSE2NEON_IIF(byte_or_word)( \ + vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ + vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ + SSE2NEON_CAT(type_prefix, size))) \ + for (int i = 0; i < number_of_lanes; i++) { \ + mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ + size)(SSE2NEON_CAT(vbslq_u, size)( \ + SSE2NEON_CAT(vreinterpretq_u, \ + SSE2NEON_CAT(size, _m128i))(mask), \ + SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))), \ + SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))))); \ + } \ + } while (0) + +#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ + do { \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ + SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ + SSE2NEON_CAT(u, size))) \ + } while (0) + +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_ordered_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, \ + 
SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ + } + +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) + +static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + __m128i tmp = vreinterpretq_m128i_u32( + vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); + uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), + vreinterpretq_u32_m128i(tmp)); + int t = vaddvq_u32(vec_res) ? 1 : 0; + res |= (t << j); + } + return res; +} + +static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + __m128i tmp = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); + uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), + vreinterpretq_u16_m128i(tmp)); + int t = _sse2neon_vaddvq_u16(vec_res) ? 
1 : 0; + res |= (t << j); + } + return res; +} + +#define SSE2NEON_CMP_RANGES_IS_BYTE 1 +#define SSE2NEON_CMP_RANGES_IS_WORD 0 + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \ + prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ + prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ + prefix##IMPL(word, uint, u, prefix##IS_WORD) \ + prefix##IMPL(word, int, s, prefix##IS_WORD) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) + +#undef SSE2NEON_CMP_RANGES_IS_BYTE +#undef SSE2NEON_CMP_RANGES_IS_WORD + +static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint8x16_t mtx = + vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + + res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vec_mask); + res_hi = vand_u8(res_hi, vec_mask); + + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; +} + +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = + vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); + mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vec1, tmp, mtx); + mtx = vandq_u16(mtx, vec_mask); + return _sse2neon_vaddvq_u16(mtx); +} + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ + { \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ + uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ + vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ + vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ + uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ + uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ + uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ + for (int j = 0; j < lb; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ + vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ + } \ + for (int j = lb; j < bound; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size( \ + vbslq_u##size(vec1, vec_minusone, vec_zero)); \ + } \ + unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ + (unsigned SSE2NEON_IIF(data_type)(char, short) *)mtx; \ + for (int i = 0; i < bound; i++) { \ + int val = 1; \ + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ + val &= ptr[k * bound + j]; \ + res += val << i; \ + } \ + return res; \ + } + +/* clang-format off */ +#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ + prefix##IMPL(8, 16, prefix##IS_UBYTE) \ + prefix##IMPL(16, 8, prefix##IS_UWORD) +/* clang-format on */ + +SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) + +#define SSE2NEON_CMPESTR_LIST \ + _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ + _(CMP_UWORD_RANGES, cmp_uword_ranges) \ + _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ + _(CMP_SWORD_RANGES, cmp_sword_ranges) \ + _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ + _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) + +enum { +#define _(name, func_suffix) name, + SSE2NEON_CMPESTR_LIST +#undef _ +}; +typedef int (*cmpestr_func_t)(__m128i a, int 
la, __m128i b, int lb); +static cmpestr_func_t _sse2neon_cmpfunc_table[] = { +#define _(name, func_suffix) _sse2neon_##func_suffix, + SSE2NEON_CMPESTR_LIST +#undef _ +}; + +FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) +{ + switch (imm8 & 0x30) { + case _SIDD_NEGATIVE_POLARITY: + res ^= 0xffffffff; + break; + case _SIDD_MASKED_NEGATIVE_POLARITY: + res ^= (1 << lb) - 1; + break; + default: + break; + } + + return res & ((bound == 8) ? 0xFF : 0xFFFF); +} + +FORCE_INLINE int _sse2neon_clz(unsigned int x) +{ + //#if defined(_MSC_VER) && !defined(__clang__) + // unsigned long cnt = 0; + // if (_BitScanReverse(&cnt, x)) + // return 31 - cnt; + // return 32; + //#else + return x != 0 ? __builtin_clz(x) : 32; + //#endif +} + +FORCE_INLINE int _sse2neon_ctz(unsigned int x) +{ + //#if defined(_MSC_VER) && !defined(__clang__) + // unsigned long cnt = 0; + // if (_BitScanForward(&cnt, x)) + // return cnt; + // return 32; + //#else + return x != 0 ? __builtin_ctz(x) : 32; + //#endif +} + +FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) +{ + //#ifdef _MSC_VER + // unsigned long cnt; + //#if defined(SSE2NEON_HAS_BITSCAN64) + // if (_BitScanForward64(&cnt, x)) + // return (int) (cnt); + //#else + // if (_BitScanForward(&cnt, (unsigned long) (x))) + // return (int) cnt; + // if (_BitScanForward(&cnt, (unsigned long) (x >> 32))) + // return (int) (cnt + 32); + //#endif /* SSE2NEON_HAS_BITSCAN64 */ + // return 64; + //#else /* assume GNU compatible compilers */ + return x != 0 ? __builtin_ctzll(x) : 64; + //#endif +} + +#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) + +#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ + const int var = (imm & 0x01) ? 8 : 16 + +#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ + int tmp1 = la ^ (la >> 31); \ + la = tmp1 - (la >> 31); \ + int tmp2 = lb ^ (lb >> 31); \ + lb = tmp2 - (lb >> 31); \ + la = SSE2NEON_MIN(la, bound); \ + lb = SSE2NEON_MIN(lb, bound) + +// Compare all pairs of character in string a and b, +// then aggregate the result. +// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the +// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of +// string a and b. +#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ + r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) + +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? 
(31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) + +#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ + __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + if (imm8 & 0x40) { \ + if (bound == 8) { \ + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ + vld1q_u16(_sse2neon_cmpestr_mask16b)); \ + dst = vreinterpretq_m128i_u16(vbslq_u16( \ + tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + } \ + else { \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t tmp = \ + vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ + dst = vreinterpretq_m128i_u8( \ + vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + } \ + } \ + else { \ + if (bound == 16) { \ + dst = vreinterpretq_m128i_u16( \ + vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ + } \ + else { \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + } \ + } \ + return dst + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +FORCE_INLINE int _mm_cmpestra(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + int lb_cpy = lb; + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return !r2 & (lb_cpy > bound); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +FORCE_INLINE int _mm_cmpestrc(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 != 0; +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +FORCE_INLINE int _mm_cmpestri(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +FORCE_INLINE __m128i +_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +FORCE_INLINE int _mm_cmpestro(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 & 1; +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. 
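+// In other words, it reports whether the explicit length la is shorter than the
+// element count implied by imm8 (16 for byte data, 8 for word data).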
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +FORCE_INLINE int _mm_cmpestrs(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void)a; + (void)b; + (void)lb; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return la <= (bound - 1); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +FORCE_INLINE int _mm_cmpestrz(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void)a; + (void)b; + (void)la; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return lb <= (bound - 1); +} + +#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ + do { \ + if (imm8 & 0x01) { \ + uint16x8_t equal_mask_##str = \ + vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 3; \ + } \ + else { \ + uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ + vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 2; \ + } \ + } while (0) + +#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ + int la, lb; \ + do { \ + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ + } while (0) + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return !r2 & (lb >= bound); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 != 0; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 & 1; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) +{ + (void)b; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int la; + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); + return la <= (bound - 1); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) +{ + (void)a; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int lb; + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); + return lb <= (bound - 1); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32ch(crc, v); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cw(crc, v); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v, and stores the result in dst. 
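+// A typical use (sketch, not part of the upstream header) is to fold a buffer
+// eight bytes at a time:
+//   for (size_t i = 0; i < n; i++) crc = _mm_crc32_u64(crc, words[i]);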
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64 +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cd((uint32_t)crc, v); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cb(crc, v); +#else + crc ^= v; +#if defined(__ARM_FEATURE_CRYPTO) + // Adapted from: https://mary.rs/lab/crc32/ + // Barrent reduction + uint64x2_t orig = + vcombine_u64(vcreate_u64((uint64_t)(crc) << 24), vcreate_u64(0x0)); + uint64x2_t tmp = orig; + + // Polynomial P(x) of CRC32C + uint64_t p = 0x105EC76F1; + // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor + // 2^{64} / P(x) \rfloor = 0x11f91caf6 + uint64_t mu = 0x1dea713f1; + + // Multiply by mu_{64} + tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu)); + // Divide by 2^{64} (mask away the unnecessary bits) + tmp = + vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0))); + // Multiply by P(x) (shifted left by 1 for alignment reasons) + tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p)); + // Subtract original from result + tmp = veorq_u64(tmp, orig); + + // Extract the 'lower' (in bit-reflected sense) 32 bits + crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1); +#else // Fall back to the generic table lookup approach + // Adapted from: https://create.stephan-brumme.com/crc32/ + // Apply half-byte comparison algorithm for the best ratio between + // performance and lookup table. + + // The lookup table just needs to store every 16th entry + // of the standard look-up table. + static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, + 0x105ec76f, + 0x20bd8ede, + 0x30e349b1, + 0x417b1dbc, + 0x5125dad3, + 0x61c69362, + 0x7198540d, + 0x82f63b78, + 0x92a8fc17, + 0xa24bb5a6, + 0xb21572c9, + 0xc38d26c4, + 0xd3d3e1ab, + 0xe330a81a, + 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; +#endif +#endif + return crc; +} + +/* AES */ + +// Some aes emulation for non arm64. 
Can remove all this +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* clang-format off */ +#define SSE2NEON_AES_SBOX(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +#define SSE2NEON_AES_RSBOX(w) \ + { \ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ + w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ + w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ + w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ + w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ + w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ + w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ + w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \ + w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ + w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ + w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ + w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ + w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ + w(0x57), w(0xa7), 
w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ + w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ + w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ + w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ + w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ + w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ + w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ + w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ + w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ + w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ + w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ + w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ + w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ + w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ + w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ + w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ + w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ + w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ + w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ + w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ + w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ + w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ + w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ + w(0x55), w(0x21), w(0x0c), w(0x7d) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0); +static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +/* x_time function and matrix multiply function */ +//#if !defined(__aarch64__) && !defined(_M_ARM64) +//#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) +//#define SSE2NEON_MULTIPLY(x, y) \ +// (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ +// ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \ +// ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \ +// ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))))) +//#endif + +// In the absence of crypto extensions, implement aesenc using regular NEON +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// for more information. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t shift_rows[] = { + 0x0, + 0x5, + 0xa, + 0xf, + 0x4, + 0x9, + 0xe, + 0x3, + 0x8, + 0xd, + 0x2, + 0x7, + 0xc, + 0x1, + 0x6, + 0xb, + }; + static const uint8_t ror32by8[] = { + 0x1, + 0x2, + 0x3, + 0x0, + 0x5, + 0x6, + 0x7, + 0x4, + 0x9, + 0xa, + 0xb, + 0x8, + 0xd, + 0xe, + 0xf, + 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + /* shift rows */ + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + /* sub bytes */ + // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and + // look up each of the table. After each lookup, we load the next table + // which locates at the next 64-bytes. In the meantime, the index in the + // table would be smaller than it was, so the index parameters of + // `vqtbx4q_u8()` need to be added the same constant as the loaded tables. 
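+    // For instance, an index byte of 0x8f is out of range for the first two
+    // lookups (vqtbl4q_u8 returns 0 and vqtbx4q_u8 leaves the lane unchanged for
+    // indices >= 64), and after subtracting 0x80 it selects entry 0x0f of the
+    // third 64-byte table, i.e. _sse2neon_sbox[0x8f].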
+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))' + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + /* mix columns */ + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + /* add round key */ + return vreinterpretq_m128i_u8(w) ^ RoundKey; +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t inv_shift_rows[] = { + 0x0, + 0xd, + 0xa, + 0x7, + 0x4, + 0x1, + 0xe, + 0xb, + 0x8, + 0x5, + 0x2, + 0xf, + 0xc, + 0x9, + 0x6, + 0x3, + }; + static const uint8_t ror32by8[] = { + 0x1, + 0x2, + 0x3, + 0x0, + 0x5, + 0x6, + 0x7, + 0x4, + 0x9, + 0xa, + 0xb, + 0x8, + 0xd, + 0xe, + 0xf, + 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // inverse mix columns + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t)(((int8x16_t)w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t)vrev32q_u16((uint16x8_t)w); + + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & + 0x1b); // multiplying 'v' by 2 in GF(2^8) + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t shift_rows[] = { + 0x0, + 0x5, + 0xa, + 0xf, + 0x4, + 0x9, + 0xe, + 0x3, + 0x8, + 0xd, + 0x2, + 0x7, + 0xc, + 0x1, + 0x6, + 0xb, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
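+// As in the AES specification, the final decryption round omits InvMixColumns,
+// so only the inverse ShiftRows/SubBytes steps and the round-key xor are applied.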
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + static const uint8_t inv_shift_rows[] = { + 0x0, + 0xd, + 0xa, + 0x7, + 0x4, + 0x1, + 0xe, + 0xb, + 0x8, + 0x5, + 0x2, + 0xf, + 0xc, + 0x9, + 0x6, + 0x3, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ + static const uint8_t ror32by8[] = { + 0x1, + 0x2, + 0x3, + 0x0, + 0x5, + 0x6, + 0x7, + 0x4, + 0x9, + 0xa, + 0xb, + 0x8, + 0xd, + 0xe, + 0xf, + 0xc, + }; + uint8x16_t v = vreinterpretq_u8_m128i(a); + uint8x16_t w; + + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t)(((int8x16_t)w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t)vrev32q_u16((uint16x8_t)w); + + // multiplying 'v' by 2 in GF(2^8) + w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b); + w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + return vreinterpretq_m128i_u8(w); +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t v_u32 = vreinterpretq_u32_u8(v); + uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); + uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); +} +#undef SSE2NEON_AES_SBOX +#undef SSE2NEON_AES_RSBOX + +//#if defined(__aarch64__) +#undef SSE2NEON_XT +#undef SSE2NEON_MULTIPLY +//#endif + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. 
See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8( + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ + return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst." +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + //#if !defined(_MSC_VER) || defined(__clang__) + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned)rcon, 0, (unsigned)rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); + //#else + // // We have to do this hack because MSVC is strictly adhering to the CPP + // // standard, in particular C++03 8.5.1 sub-section 15, which states that + // // unions must be initialized by their first member type. 
+ // + // // As per the Windows ARM64 ABI, it is always little endian, so this works + // __n128 dest{ + // ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | + // ((uint64_t) u8.n128_u8[0xE] << 16) | + // ((uint64_t) u8.n128_u8[0xB] << 24) | + // ((uint64_t) u8.n128_u8[0x1] << 32) | + // ((uint64_t) u8.n128_u8[0xE] << 40) | + // ((uint64_t) u8.n128_u8[0xB] << 48) | + // ((uint64_t) u8.n128_u8[0x4] << 56), + // ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | + // ((uint64_t) u8.n128_u8[0x6] << 16) | + // ((uint64_t) u8.n128_u8[0x3] << 24) | + // ((uint64_t) u8.n128_u8[0x9] << 32) | + // ((uint64_t) u8.n128_u8[0x6] << 40) | + // ((uint64_t) u8.n128_u8[0x3] << 48) | + // ((uint64_t) u8.n128_u8[0xC] << 56)}; + // + // dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; + // dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; + // + // return dest; + //#endif +} +#endif + +/* Others */ + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) +{ + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +//#elif defined(_MSC_VER) +// return _CountOneBits(a); +#else + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a))); +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +//#elif defined(_MSC_VER) +// return _CountOneBits64(a); +#else + return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +} + +FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode( + unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; + uint64_t value; + } r; + + r.value = _sse2neon_get_fpcr(); + + r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; + + _sse2neon_set_fpcr(r.value); +} + +// Return the current 64-bit value of the processor's time-stamp counter. 
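Each of the four imm cases above feeds one 64-bit half of a and one 64-bit half of b into a single carry-less multiply. A scalar sketch of that 64x64 -> 128-bit operation is shown below; the u128 and clmul64 names are illustrative only and not part of the library.

#include <cstdint>

struct u128 { uint64_t lo, hi; }; // illustrative 128-bit result holder

// Carry-less (polynomial) multiply: like schoolbook multiplication, but
// partial products are combined with XOR instead of ADD.
static u128 clmul64(uint64_t a, uint64_t b)
{
    u128 r = {0, 0};
    for (int i = 0; i < 64; ++i) {
        if ((b >> i) & 1) {
            r.lo ^= a << i;
            if (i != 0)
                r.hi ^= a >> (64 - i); // bits that spill past 64
        }
    }
    return r;
}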
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc +FORCE_INLINE uint64_t _rdtsc(void) +{ + uint64_t val; + + /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the + * system counter is at least 56 bits wide; from Armv8.6, the counter + * must be 64 bits wide. So the system counter could be less than 64 + * bits wide and it is attributed with the flag 'cap_user_time_short' + * is true. + */ + //#if defined(_MSC_VER) && !defined(__clang__) + // val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); + //#else + __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); + //#endif + + return val; +} + +//#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#pragma pop_macro("FORCE_INLINE_OPTNONE") +//#endif + +//#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +//#endif + +#endif diff --git a/libkram/vectormath/sse_mathfun.h b/libkram/vectormath/sse_mathfun.h new file mode 100644 index 00000000..708c2146 --- /dev/null +++ b/libkram/vectormath/sse_mathfun.h @@ -0,0 +1,417 @@ +/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library + + The default is to use the SSE1 version. If you define USE_SSE2 the + the SSE2 intrinsics will be used in place of the MMX intrinsics. Do + not expect any significant performance improvement with SSE2. +*/ + +/* Copyright (C) 2007 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +// Mods to this; +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +// TODO: may want to rename to sse_mathfun.cpp +// TODO: use math ops and simd ops here and let compiler gen intrinsics? +// TODO: combine the constants into fewer registers, reference .x,.. 
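The cephes-style routines in this file all follow the same pattern: reduce the argument to a small range, evaluate a short polynomial there, then undo the reduction. Below is a scalar sketch of the range reduction used by log(), written with the standard library rather than intrinsics, as a reference for what the SSE code computes.

#include <cmath>
#include <cstdio>

// log(x) = log(m) + e * ln2, where x = m * 2^e and m is in [0.5, 1).
// The SIMD version extracts e by shifting the float bits right by 23 and
// rebuilds m by masking the exponent field; frexp does the same job here.
double log_via_range_reduction(double x)
{
    int e = 0;
    double m = std::frexp(x, &e); // x = m * 2^e, m in [0.5, 1)
    return std::log(m) + e * 0.6931471805599453; // ln(2)
}

int main()
{
    std::printf("%f %f\n", log_via_range_reduction(10.0), std::log(10.0));
    return 0;
}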
+ +#pragma once +// clang-format off + +#include + +namespace SIMD_NAMESPACE { + +//--------------------------- +// Start of mathfun below + +#if SIMD_FLOAT + +#define _PS_CONST(Name, Val) \ + static const float4 _ps_##Name = Val +#define _PI_CONST(Name, Val) \ + static const int4 _pi_##Name = Val + +/* the smallest non denormalized float number */ +_PS_CONST(min_norm_pos, (float)0x00800000); +_PI_CONST(mant_mask, 0x7f800000); +_PI_CONST(inv_mant_mask, ~0x7f800000); + +_PI_CONST(sign_mask, (int32_t)0x80000000); +_PI_CONST(inv_sign_mask, ~0x80000000); + +_PI_CONST(1, 1); +_PI_CONST(inv1, ~1); +_PI_CONST(2, 2); +_PI_CONST(4, 4); +_PI_CONST(0x7f, 0x7f); + +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); + +_PS_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS_CONST(cephes_log_p0, 7.0376836292E-2); +_PS_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS_CONST(cephes_log_p2, 1.1676998740E-1); +_PS_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS_CONST(cephes_log_q1, -2.12194440e-4); +_PS_CONST(cephes_log_q2, 0.693359375); + +// not exposing yet due to calling convention and no math equiv +static void sincos(float4 x, float4& s, float4& c); + +// This is just extra function overhead. May just want to rename +float4 sin(float4 x) { + float4 s, c; + sincos(x, s, c); + return s; +} +float4 cos(float4 x) { + float4 s, c; + sincos(x, s, c); + return c; +} + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +float4 log(float4 x) { + int4 emm0; + float4 one = _ps_1; + + float4 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + + x = _mm_max_ps(x, _ps_min_norm_pos); /* cut off denormalized stuff */ + + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); + /* keep only the fractional part */ + x = _mm_and_ps(x, _pi_inv_mant_mask); + x = _mm_or_ps(x, _ps_0p5); + + emm0 = _mm_sub_epi32(emm0, _pi_0x7f); + float4 e = _mm_cvtepi32_ps(emm0); + + e = _mm_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + float4 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF); + float4 tmp = _mm_and_ps(x, mask); + x = _mm_sub_ps(x, one); + e = _mm_sub_ps(e, _mm_and_ps(one, mask)); + x = _mm_add_ps(x, tmp); + + + float4 z = _mm_mul_ps(x,x); + + float4 y = _ps_cephes_log_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p5); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p6); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p7); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_log_p8); + y = _mm_mul_ps(y, x); + + y = _mm_mul_ps(y, z); + + + tmp = _mm_mul_ps(e, _ps_cephes_log_q1); + y = _mm_add_ps(y, tmp); + + + tmp = _mm_mul_ps(z, _ps_0p5); + y = _mm_sub_ps(y, tmp); + + tmp = _mm_mul_ps(e, _ps_cephes_log_q2); + x = _mm_add_ps(x, y); + x = _mm_add_ps(x, tmp); + x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS_CONST(exp_hi, 88.3762626647949f); +_PS_CONST(exp_lo, -88.3762626647949f); + +_PS_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS_CONST(cephes_exp_C1, 0.693359375); +_PS_CONST(cephes_exp_C2, 
-2.12194440e-4); + +_PS_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1); + +float4 exp(float4 x) { + float4 tmp = _mm_setzero_ps(), fx; + int4 emm0; + float4 one = _ps_1; + +#if 0 + x = clamp(x, _ps_exp_lo, _ps_exp_hi); + fx = x * _ps_cephes_LOG2EF + _ps_0p5; + + fx = floor(fx); + + x -= fx * (_ps_cephes_exp_C1 + _ps_cephes_exp_C2); + float4 z = x * x; // squared + + // polynomial + float4 y = (((((( + _ps_cephes_exp_p0 * x + _ps_cephes_exp_p1) * x) + + _ps_cephes_exp_p2 * x) + _ps_cephes_exp_p3 * x) + + _ps_cephes_exp_p4 * x) + _ps_cephes_exp_p5 * z) + x + one; + + // build 2^n + emm0 = int4(fx); // truncate to int + emm0 = (emm0 + _pi_0x7f) << 23; + float4 pow2n = _mm_castsi128_ps(emm0); // treat int as float + y *= pow2n; + +#else + x = _mm_min_ps(x, _ps_exp_hi); + x = _mm_max_ps(x, _ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm_mul_ps(x, _ps_cephes_LOG2EF); + fx = _mm_add_ps(fx, _ps_0p5); + + /* how to perform a floorf with SSE: just below */ + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + /* if greater, substract 1 */ + float4 mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, _ps_cephes_exp_C1); + float4 z = _mm_mul_ps(fx, _ps_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + + z = _mm_mul_ps(x,x); + + // mads to form a polynoial + float4 y = _ps_cephes_exp_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_exp_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_exp_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_exp_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_exp_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, _ps_cephes_exp_p5); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, x); + y = _mm_add_ps(y, one); + + /* build 2^n */ + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, _pi_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + float4 pow2n = _mm_castsi128_ps(emm0); + y = _mm_mul_ps(y, pow2n); + +#endif + + return y; +} + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); +_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + +/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so + it runs also on old athlons XPs and the pentium III of your grand + mother. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Performance is also surprisingly good, 1.33 times faster than the + macos vsinf SSE2 function, and 1.5 times faster than the + __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not + too bad for an SSE1 function (with no special tuning) ! 
+ However the latter libraries probably have a much better handling of NaN, + Inf, denormalized and other special arguments.. + + On my core 1 duo, the execution of this function takes approximately 95 cycles. + + From what I have observed on the experiments with Intel AMath lib, switching to an + SSE2 version would improve the perf by only 10%. + + Since it is based on SSE intrinsics, it has to be compiled at -O2 to + deliver full speed. +*/ + + +/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.. + it is almost as fast, and gives you a free cosine with your sine */ +static void sincos(float4 x, float4& s, float4& c) { + float4 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; + int4 emm0, emm2, emm4; + sign_bit_sin = x; + /* take the absolute value */ + x = _mm_and_ps(x, _pi_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(sign_bit_sin, _pi_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, _ps_cephes_FOPI); + + /* store the integer part of y in emm2 */ + emm2 = _mm_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _pi_1); + emm2 = _mm_and_si128(emm2, _pi_inv1); + y = _mm_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm_and_si128(emm2, _pi_4); + emm0 = _mm_slli_epi32(emm0, 29); + float4 swap_sign_bit_sin = _mm_castsi128_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm_and_si128(emm2, _pi_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + float4 poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _ps_minus_cephes_DP1; + xmm2 = _ps_minus_cephes_DP2; + xmm3 = _ps_minus_cephes_DP3; + xmm1 = _mm_mul_ps(y, xmm1); + xmm2 = _mm_mul_ps(y, xmm2); + xmm3 = _mm_mul_ps(y, xmm3); + x = _mm_add_ps(x, xmm1); + x = _mm_add_ps(x, xmm2); + x = _mm_add_ps(x, xmm3); + + emm4 = _mm_sub_epi32(emm4, _pi_2); + emm4 = _mm_andnot_si128(emm4, _pi_4); + emm4 = _mm_slli_epi32(emm4, 29); + float4 sign_bit_cos = _mm_castsi128_ps(emm4); + + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + float4 z = _mm_mul_ps(x,x); + y = _ps_coscof_p0; + + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p1); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p2); + y = _mm_mul_ps(y, z); + y = _mm_mul_ps(y, z); + float4 tmp = _mm_mul_ps(z, _ps_0p5); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, _ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + float4 y2 = _ps_sincof_p0; + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p1); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p2); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_mul_ps(y2, x); + y2 = _mm_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + float4 ysin2 = _mm_and_ps(xmm3, y2); + float4 ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2,ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1,ysin2); + xmm2 = _mm_add_ps(y,y2); + + /* update the sign */ + s = _mm_xor_ps(xmm1, sign_bit_sin); + c = _mm_xor_ps(xmm2, sign_bit_cos); +} + +// This has to forward 2/3 to the 4 element version. 
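A short usage sketch of the overloads defined above, assuming SIMD_FLOAT is enabled, the default simdk namespace, and that these declarations are visible where sse_mathfun.h is included.

using namespace simdk;

float4 demo_angles()
{
    float4 a = {0.0f, 0.5f, 1.0f, 2.0f};
    float4 s = sin(a); // both forward to the shared sincos() above
    float4 c = cos(a);
    return s * s + c * c; // approximately 1 in every lane
}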
+#define macroVectorRepeatFnImpl(type, cppfun, func) \ +type##2 cppfunc(type##2) { return vec4to2(func(zeroext(x))); } \ +type##3 cppfunc(type##3) { return vec4to3(func(zeroext(x))); } \ + +macroVectorRepeatFnImpl(float, log, log) +macroVectorRepeatFnImpl(float, exp, exp) + +macroVectorRepeatFnImpl(float, sin, sin) +macroVectorRepeatFnImpl(float, cos, cos) +macroVectorRepeatFnImpl(float, cos, tan) + +// TODO: pow takes in 2 args + +#endif // SIMD_FLOAT + +} // namespace SIMD_NAMESPACE diff --git a/libkram/vectormath/vectormath234.cpp b/libkram/vectormath/vectormath234.cpp new file mode 100644 index 00000000..9a6a142a --- /dev/null +++ b/libkram/vectormath/vectormath234.cpp @@ -0,0 +1,748 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. +#include "vectormath234.h" + +#if USE_SIMDLIB + +// Tests with godbolt are here to show code comparsions with optimizations. + +// --------------- +// Note: float4a.h has a rcp and rsqrt ops, but they are approximate. +// Have real div and sqrt ops now. +// --------------- +// The storage of affine data in a column matrix is no different than rows +// translation is in (r30 r31 r32) or in (c30, c31 c32) +// +// r0: r00 r01 r02 0 +// r1: r10 0 +// r2: r20 0 +// r3: px py pz 1 +// +// c0 c1 c2 c3 +// c00 c10 c20 px +// c01 c11 py +// c02 pz +// 0 0 0 1 +// +// col: TRS * TRS * v cameraToWorldTfm * worldToModelTfm * .. +// row: v * SRT * SRT modelToWorldTfm * worldToCameraTfm * ... +// +// --------------- +// So there are currently 6 version of the Accelerate lib. +// This library hides implementations of some of the calls. +// So need to rely on a version of the lib to get them, +// or supply some alternative. Many calls have fallbacks. +// +// 6: macOS 15.0, iOS 18.0 +// 5: macOS 13.0, iOS 16.0 +// 4: macOS 12.0, iOS 15.0 +// 0: header only +// +// use 5 for macOS +// SIMD_LIBRARY_VERSION >= 5 +// +// use 4 for iOS +// SIMD_LIBRARY_VERSION >= 4 +// +//----------------- +// +// DONE: rename in README, and name of .cpp/h +// DONE: split up files into types, float ops, double ops +// DONE: limit !SIMD_FLOAT_EXT to only 32B vector types? Have 64B vecs. +// DONE: ryg on 32B ops on AVX systems +// These often only have 16B simd units, so running 32B ops isn't efficient. +// This could apply say to PS4/AMD chips too. +// DONE: bring over fast inverses (RTS, RTU, etc) +// DONE: need translation, rotation, scale +// DONE: verify size/alignments are same across Win/macOS +// DONE: add optimized vec2 ops on Neon +// DONE: add AVX2 for double4 +// DONE: build an optimized Xcode library +// DONE: check if packed to aligned conversions are automatic + +//----------------- + +// TODO: ryg on fp16 <-> fp32 +// Not the right gist, you want the RTNE one (nm: that only matters for float->half, +// this was the half->float one. FWIW, other dir is https://gist.github.com/rygorous/eb3a019b99fdaa9c3064. +// These days I use a variant of the RTNE/RN version that also preserves NaN payload bits, +// which is slightly more ops but matches hardware conversions exactly for every input, including all NaNs. +// TODO: build Xcode library that is a clang module or framework +// TODO: build VisualStudio library with cmake, clang module too? +// TODO: need fast post-translation, post-rotation, post-scale +// TODO: need euler <-> matrix +// TODO: saturating conversions (esp. integer) would be useful too and prevent overflow +// bit select to clamp values. 
+// TODO: need natvis and lldb formatting of math classes. + +//----------------- +// Links + +// here's a decomp +// https://github.com/erich666/GraphicsGems/blob/master/gemsii/unmatrix.c +// + +// intrinsic tables +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html + +// older but good talk on simd +// https://people.inf.ethz.ch/markusp/teaching/263-2300-ETH-spring14/slides/11-simd.pdf + +// another article +// https://www.cs.uaf.edu/courses/cs441/notes/sse-avx/ + +// aarch64 +// https://en.wikipedia.org/wiki/AArch64 + +// make win happy for va_copy in format call +#include + +#if SIMD_ACCELERATE_MATH +// clang-format off +#include + +// NOTE: this reports 5 for macOS 13 minspec, but SIMD_LIBRARY_VERSION is set to 6. +// This is a problem, since some lib code only exists on macOS 15 and iOS 18 then. +// Can remove this once SIMD_LIBRARY_VERSION is correct. +// Also unclear what XR_OS_1_0 library support there is. It's not in the comparisons. +# if SIMD_COMPILER_HAS_REQUIRED_FEATURES +# if __has_include() && __has_include() +# include +# include +# if TARGET_OS_RTKIT +# define SIMD_LIBRARY_VERSION SIMD_CURRENT_LIBRARY_VERSION +# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_15_0 || \ + __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_18_0 || \ + __XR_OS_VERSION_MIN_REQUIRED >= __XROS_2_0 +# define SIMD_LIBRARY_VERSION_TEST 6 +# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_13_0 || \ + __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_16_0 +# define SIMD_LIBRARY_VERSION_TEST 5 +# elif __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_12_0 || \ + __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_15_0 +# define SIMD_LIBRARY_VERSION_TEST 4 +# endif +# endif +#endif + + +#if 0 +// SIMD_LIBRARY_VERSION is set to 6 regardless of the minspec +// iOS 15 = 4, and macOS 13 = 5 +#if TARGET_OS_OSX + #if SIMD_LIBRARY_VERSION_TEST != 5 + blarg1 + #endif + + #if SIMD_LIBRARY_VERSION != 5 + blarg2 // this fires + #endif +#else + #if SIMD_LIBRARY_VERSION_TEST != 4 + blarg1 + #endif + + #if SIMD_LIBRARY_VERSION != 4 + blarg2 // this fires + #endif +#endif +#endif + +// clang-format on +#endif // SIMD_ACCELERATE_MATH + +namespace SIMD_NAMESPACE { + +void TestCalls() +{ +#if SIMD_FLOAT + float4a va = 0; + float4p vp = (float)1; + + va = vp; + vp = va; +#endif +} + +// Check format arguments. +#ifndef __printflike +#define __printflike(fmtIndex, varargIndex) +#endif + +inline string format(const char* format, ...) __printflike(1, 2); + +// was using kram::format, but wanted to decouple this lib +inline string format(const char* format, ...) 
+{ + string str; + + va_list args; + va_start(args, format); + + // format once to get length (without NULL at end) + va_list argsCopy; + va_copy(argsCopy, args); + int32_t len = vsnprintf(NULL, 0, format, argsCopy); + va_end(argsCopy); + + // replace string + str.resize(len, 0); + vsnprintf((char*)str.c_str(), len + 1, format, args); + + va_end(args); + + return str; +} + +#if SIMD_DOUBLE + +string vecf::str(double2 v) const +{ + return format("(%f %f)", v.x, v.y); +} +string vecf::str(double3 v) const +{ + return format("(%f %f %f)", v.x, v.y, v.z); +} +string vecf::str(double4 v) const +{ + return format("(%f %f %f %f)", v.x, v.y, v.z, v.w); +} + +string vecf::str(const double2x2& m) const +{ + return format("%s\n%s\n", + str(m[0]).c_str(), str(m[1]).c_str()); +} +string vecf::str(const double3x3& m) const +{ + return format("%s\n%s\n%s\n", + str(m[0]).c_str(), str(m[1]).c_str(), str(m[2]).c_str()); +} +string vecf::str(const double4x4& m) const +{ + return format("%s\n%s\n%s\n%s\n", + str(m[0]).c_str(), str(m[1]).c_str(), + str(m[2]).c_str(), str(m[3]).c_str()); +} + +#endif + +//----------------------------- + +#if SIMD_FLOAT + +string vecf::str(float2 v) const +{ + return format("(%f %f)", v.x, v.y); +} +string vecf::str(float3 v) const +{ + return format("(%f %f %f)", v.x, v.y, v.z); +} +string vecf::str(float4 v) const +{ + return format("(%f %f %f %f)", v.x, v.y, v.z, v.w); +} + +string vecf::str(const float2x2& m) const +{ + return format("%s\n%s\n", + str(m[0]).c_str(), str(m[1]).c_str()); +} +string vecf::str(const float3x3& m) const +{ + return format("%s\n%s\n%s\n", + str(m[0]).c_str(), str(m[1]).c_str(), str(m[2]).c_str()); +} +string vecf::str(const float4x4& m) const +{ + return format("%s\n%s\n%s\n%s\n", + str(m[0]).c_str(), str(m[1]).c_str(), + str(m[2]).c_str(), str(m[3]).c_str()); +} + +#endif // SIMD_FLOAT + +#if SIMD_HALF + +#if SIMD_HALF_FLOAT16 + +string vecf::str(half2 v) const +{ + return format("(%f %f)", (double)v.x, (double)v.y); +} +string vecf::str(half3 v) const +{ + return format("(%f %f %f)", (double)v.x, (double)v.y, (double)v.z); +} +string vecf::str(half4 v) const +{ + return format("(%f %f %f %f)", (double)v.x, (double)v.y, (double)v.z, (double)v.w); +} + +#elif SIMD_HALF4_ONLY + +// this converts half4 to float, then just prints that +string vecf::str(half2 v) const +{ + float4 vv = float4m(zeroext(v)); + return format("(%f %f)", vv.x, vv.y); +} +string vecf::str(half3 v) const +{ + float4 vv = float4m(zeroext(v)); + return format("(%f %f %f)", vv.x, vv.y, vv.z); +} +string vecf::str(half4 v) const +{ + float4 vv = float4m(v); + return format("(%f %f %f %f)", vv.x, vv.y, vv.z, vv.w); +} + +#endif // SIMD_HALF_FLOAT16 + +#endif // SIMD_HALF + +#if SIMD_INT +string vecf::str(int2 v) const +{ + return format("(%d %d)", v.x, v.y); +} +string vecf::str(int3 v) const +{ + return format("(%d %d %d)", v.x, v.y, v.z); +} +string vecf::str(int4 v) const +{ + return format("(%d %d %d %d)", v.x, v.y, v.z, v.w); +} +#endif + +#if SIMD_LONG + +// This works across Win and macOS, so don't need to use PRId64. 
+#define long1cast long long + +string vecf::str(long2 v) const +{ + return format("(%lld %lld)", (long1cast)v.x, (long1cast)v.y); +} +string vecf::str(long3 v) const +{ + return format("(%lld %lld %lld)", (long1cast)v.x, (long1cast)v.y, (long1cast)v.z); +} +string vecf::str(long4 v) const +{ + return format("(%lld %lld %lld %lld)", (long1cast)v.x, (long1cast)v.y, (long1cast)v.z, (long1cast)v.w); +} +#endif + +//----------------------------- + +#define FMT_SEP() s += "-----------\n" + +string vecf::simd_configs() const +{ + string s; + +#define FMT_CONFIG(val) s += format("%s: %d\n", #val, val); + + FMT_CONFIG(SIMD_SSE); + FMT_CONFIG(SIMD_NEON); + +#if SIMD_SSE + bool hasSSE42 = false; + bool hasAVX = false; + bool hasAVX2 = false; + + bool hasF16C = false; + bool hasFMA = false; + +#if SIMD_SSE + hasSSE42 = true; +#endif +#ifdef __AVX__ + hasAVX = true; +#endif +#if SIMD_AVX2 + hasAVX2 = true; +#endif + + // TODO: AVX-512 flags (combine into one?) + // (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__) + +#ifdef __F16C__ + hasF16C = true; +#endif +#ifdef __FMA__ + hasFMA = true; +#endif + + if (hasAVX2) + s += format("%s: %d\n", "AVX2 ", hasAVX2); + else if (hasAVX) + s += format("%s: %d\n", "AVX ", hasAVX); + else if (hasSSE42) + s += format("%s: %d\n", "SSE42 ", hasSSE42); + + s += format("%s: %d\n", "F16C ", hasF16C); + s += format("%s: %d\n", "FMA ", hasFMA); + + // fp-contract, etc ? + // CRC (may not be worth it) + +#endif + +#if SIMD_NEON + // any neon setting, arm64 version + // __ARM_VFPV4__ + // CRC (may not be worth it) + +#endif + + FMT_CONFIG(SIMD_FLOAT_EXT); + FMT_CONFIG(SIMD_HALF_FLOAT16); +#if SIMD_HALF + FMT_CONFIG(SIMD_HALF4_ONLY); +#endif + + FMT_SEP(); + + FMT_CONFIG(SIMD_CMATH_MATH); + FMT_CONFIG(SIMD_ACCELERATE_MATH); +#if SIMD_ACCELERATE_MATH + // Dump the min version. This is supposed to control SIMD_LIBRARY_VERSION +#if __APPLE__ +#if TARGET_OS_OSX + FMT_CONFIG(__MAC_OS_X_VERSION_MIN_REQUIRED); +#elif TARGET_OS_VISION + FMT_CONFIG(__VISION_OS_VERSION_MIN_REQUIRED); +#else + FMT_CONFIG(__IPHONE_OS_VERSION_MIN_REQUIRED); +#endif +#endif + + FMT_CONFIG(SIMD_LIBRARY_VERSION); // lib based on min os target + FMT_CONFIG(SIMD_CURRENT_LIBRARY_VERSION); // max lib based on sdk + FMT_CONFIG(SIMD_LIBRARY_VERSION_TEST); + FMT_CONFIG(SIMD_ACCELERATE_MATH_NAMES); +#endif + + FMT_SEP(); + + FMT_CONFIG(SIMD_HALF); + FMT_CONFIG(SIMD_FLOAT); + FMT_CONFIG(SIMD_DOUBLE); + + FMT_CONFIG(SIMD_INT); + FMT_CONFIG(SIMD_LONG); + + // don't have these implemented yet + //FMT_CONFIG(SIMD_CHAR); + //FMT_CONFIG(SIMD_SHORT); + +#undef FMT_CONFIG + + return s; +} + +string vecf::simd_alignments() const +{ + string s; + +#define FMT_CONFIG(val) s += format("%s: %zu %zu\n", #val, sizeof(val), __alignof(val)); + + // TODO: add other types int, half? 
+ +#if SIMD_FLOAT + FMT_SEP(); + + FMT_CONFIG(float2); + FMT_CONFIG(float3); + FMT_CONFIG(float4); + FMT_CONFIG(float8); + //FMT_CONFIG(float16); + + FMT_CONFIG(float2x2); + FMT_CONFIG(float3x3); + FMT_CONFIG(float3x4); + FMT_CONFIG(float4x4); +#endif + +#if SIMD_DOUBLE + FMT_SEP(); + + FMT_CONFIG(double2); + FMT_CONFIG(double3); + FMT_CONFIG(double4); + // FMT_CONFIG(double8); + + FMT_CONFIG(double2x2); + FMT_CONFIG(double3x3); + FMT_CONFIG(double3x4); + FMT_CONFIG(double4x4); +#endif + +#if SIMD_INT + FMT_SEP(); + + FMT_CONFIG(int2); + FMT_CONFIG(int3); + FMT_CONFIG(int4); + FMT_CONFIG(int8); + //FMT_CONFIG(int16); +#endif + +#if SIMD_LONG + FMT_SEP(); + + FMT_CONFIG(long2); + FMT_CONFIG(long3); + FMT_CONFIG(long4); + //FMT_CONFIG(long8); +#endif + +#undef FMT_CONFIG + + return s; +} + +//--------------- + +#if SIMD_HALF4_ONLY + +#if SIMD_NEON + +float4 float4m(half4 vv) +{ + return float4(vcvt_f32_f16(*(const float16x4_t*)&vv)); +} +half4 half4m(float4 vv) +{ + return half4(vcvt_f16_f32(*(const float32x4_t*)&vv)); +} + +#endif // SIMD_NEON + +#if SIMD_SSE + +float4 float4m(half4 vv) +{ + // https://patchwork.ozlabs.org/project/gcc/patch/559BC75A.1080606@arm.com/ + // https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gcc/Half-Precision.html + // https://developer.arm.com/documentation/dui0491/i/Using-NEON-Support/Converting-vectors + __m128i reg16 = _mm_setzero_si128(); + + // TODO: switch to load low 64-bits, but don't know which one _mm_cvtsi32_si128(&vv.reg); ? + // want 0 extend here, sse overuses int32_t when really unsigned and zero extended value + reg16 = _mm_insert_epi16(reg16, vv[0], 0); + reg16 = _mm_insert_epi16(reg16, vv[1], 1); + reg16 = _mm_insert_epi16(reg16, vv[2], 2); + reg16 = _mm_insert_epi16(reg16, vv[3], 3); + + return simd::float4(_mm_cvtph_ps(reg16)); +} + +half4 half4m(float4 vv) +{ + __m128i reg16 = _mm_cvtps_ph(*(const __m128*)&vv, 0); // 4xfp32-> 4xfp16, round to nearest-even + + // TODO: switch to store/steam, but don't know which one _mm_storeu_epi16 ? + half4 val; // = 0; + + // 0 extended + val[0] = (half)_mm_extract_epi16(reg16, 0); + val[1] = (half)_mm_extract_epi16(reg16, 1); + val[2] = (half)_mm_extract_epi16(reg16, 2); + val[3] = (half)_mm_extract_epi16(reg16, 3); + return val; +} + +#endif // SIMD_SSE +#endif // SIMD_HALF4_ONLY + + +// Adapted from this code for uint32_t. +// https://github.com/lemire/FastDifferentialCoding/blob/master/src/fastdelta.c +// Don't have uchar, ushort, uint, ulong support, but can use signed. + +#if SIMD_INT +void deltaEncodeU32(uint32_t * buffer, size_t length, uint32_t starting_point) { + constexpr uint32_t elementCount = 4; + constexpr uint32_t elementSize = sizeof(uint32_t); + + // Could do 2 unaligned loads (second shifted by 1 element) instead of alignr. + // but that would lower simd count if multiple of elementCount + int4p* b = (int4p*)buffer; + int4 prev = starting_point; + size_t i = 0; + for(; i < length/elementCount; i++) { + int4 curr = b[i]; + int4 delta = curr - _mm_alignr_epi8(curr, prev, (elementCount-1)*elementSize); + b[i] = delta; + prev = curr; + } + + // 1 to (elementCount-1) indices don't fit above + uint32_t lastprev = prev[elementCount-1]; + for(i = elementCount * i; i < length; ++i) { + uint32_t curr = buffer[i]; + buffer[i] = curr - lastprev; + lastprev = curr; + } +} + +// write to buffer the successive differences of buffer (buffer[0]-starting_point, buffer[1]-buffer[2], ...) 
+// there are "length" values in buffer +void deltaDecodeU32(uint32_t * buffer, size_t length, uint32_t starting_point) { + constexpr uint32_t elementCount = 4; + constexpr uint32_t elementSize = sizeof(uint32_t); + + int4p* b = (int4p*)buffer; + int4 prev = starting_point; + size_t i = 0; + for(; i < length/elementCount; i++) { + int4 curr = b[i]; + + // this is prefix sum + int4 _tmp1 = _mm_slli_si128(curr, 2*elementSize) + curr; + int4 _tmp2 = _mm_slli_si128(_tmp1, 1*elementSize) + _tmp1; + prev = _tmp2 + _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 3, 3, 3)); + + b[i] = prev; + } + uint32_t lastprev = prev[elementCount-1]; + for(i = elementCount * i ; i < length; ++i) { + lastprev += buffer[i]; + buffer[i] = lastprev; + } +} +#endif + +#if SIMD_SHORT +void deltaEncodeU16(uint16_t * buffer, size_t length, uint16_t starting_point) { + constexpr uint32_t elementCount = 8; + constexpr uint32_t elementSize = sizeof(uint16_t); + + // Could do 2 unaligned loads (second shifted by 1 element) instead of alignr. + // but that would lower simd count if multiple of elementCount + short8p* b = (short8p*)buffer; + short8 prev = starting_point; + size_t i = 0; + for(; i < length/elementCount; i++) { + short8 curr = b[i]; + short8 delta = curr - _mm_alignr_epi8(curr, prev, (elementCount-1)*elementSize); + b[i] = delta; + prev = curr; + } + + // 1 to (elementCount-1) indices don't fit above + uint16_t lastprev = prev[elementCount-1]; + for(i = elementCount * i; i < length; ++i) { + uint16_t curr = buffer[i]; + buffer[i] = curr - lastprev; + lastprev = curr; + } +} + +// write to buffer the successive differences of buffer (buffer[0]-starting_point, buffer[1]-buffer[2], ...) +// there are "length" values in buffer +void deltaDecodeU16(uint16_t * buffer, size_t length, uint16_t starting_point) { + /* decode may be faster as scalar, lots of simd ops to prefix 8 values + constexpr uint32_t elementCount = 8; + constexpr uint32_t elementSize = sizeof(uint16_t); + + short8p* b = (short8p*)buffer; + short8 prev = starting_point; + size_t i = 0; + for(; i < length/elementCount; i++) { + short8 curr = b[i]; + + // this is prefix sum + // TODO: way more values to add (8 total, this is for 4) + short8 _tmp1 = _mm_slli_si128(curr, 2*elementSize) + curr; + short8 _tmp2 = _mm_slli_si128(_tmp1, 1*elementSize) + _tmp1; + prev = _tmp2 + _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 3, 3, 3)); + + b[i] = prev; + } + uint16_t lastprev = prev[elementCount-1]; + for(i = 4 * i ; i < length; ++i) { + lastprev = lastprev + buffer[i]; + buffer[i] = lastprev; + } + */ + + uint16_t lastprev = starting_point; + for(size_t i = 0; i < length; ++i) { + lastprev += buffer[i]; + buffer[i] = lastprev; + } +} +#endif + +#if SIMD_CHAR +void deltaEncodeU8(uint8_t * buffer, size_t length, uint8_t starting_point) { + constexpr uint32_t elementCount = 16; + constexpr uint32_t elementSize = sizeof(uint8_t); + + char16p* b = (char16p*)buffer; + char16 prev = starting_point; + size_t i = 0; + for(; i < length/elementCount; i++) { + char16 curr = b[i]; + char16 delta = curr - _mm_alignr_epi8(curr, prev, (elementCount-1)*elementSize); + b[i] = delta; + prev = curr; + } + + // 1 to (elementCount-1) indices don't fit above + uint8_t lastprev = prev[elementCount-1]; + for(i = elementCount * i; i < length; ++i) { + uint8_t curr = buffer[i]; + buffer[i] = curr - lastprev; + lastprev = curr; + } +} + +// write to buffer the successive differences of buffer (buffer[0]-starting_point, buffer[1]-buffer[2], ...) 
+// there are "length" values in buffer +void deltaDecodeU8(uint8_t * buffer, size_t length, uint8_t starting_point) { + /* decode may be faster as scalar, lots of simd ops to prefix 16 values + constexpr uint32_t elementCount = 16; + constexpr uint32_t elementSize = sizeof(uint8_t); + + // This is a confusing name. Really char8x16 + char16p* b = (char16p*)buffer; + char16 prev = starting_point; + size_t i = 0; + for(; i < length/elementCount; i++) { + char16 curr = b[i]; + + // this is prefix sum + // TODO: way more values to add (16 total, this is for 4) + char16 _tmp1 = _mm_slli_si128(curr, 2*elementSize) + curr; + char16 _tmp2 = _mm_slli_si128(_tmp1, 1*elementSize) + _tmp1; + prev = _tmp2 + _mm_shuffle_epi32(prev, _MM_SHUFFLE(3, 3, 3, 3)); + + b[i] = prev; + } + uint8_t lastprev = prev[elementCount-1]; + for(i = elementCount * i ; i < length; ++i) { + lastprev = lastprev + buffer[i]; + buffer[i] = lastprev; + } + */ + uint8_t lastprev = starting_point; + for(size_t i = 0; i < length; ++i) { + lastprev += buffer[i]; + buffer[i] = lastprev; + } +} +#endif + + +} // namespace SIMD_NAMESPACE +#endif // USE_SIMDLIB diff --git a/libkram/vectormath/vectormath234.h b/libkram/vectormath/vectormath234.h new file mode 100644 index 00000000..2af8fc31 --- /dev/null +++ b/libkram/vectormath/vectormath234.h @@ -0,0 +1,710 @@ +// kram - Copyright 2020-2025 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. + +#pragma once + +#if USE_SIMDLIB + +// This requires __clang__ or gcc. +// Only really targeting __clang__ for development. +// Targeting x64 running AVX2 and Neon. +// +// gcc/clang vector extension: +// .lo and .hi are the first second halves of a vector, +// .even and .odd are the even- and odd-indexed elements of a vector. +// __builtin_shufflevector and __builtin_convertvector. +// also have .rgba built into the types +// So can emulate double4 on Neon using 2 registers. +// These extensions only work on a C typedef and auto convert to +// _m128, _m128i, _m256, .. on Intel, and to float32x4_t on Neon. +// +// Apple simd lib has versioning, so the Accelerate lib provides optimized +// simd calls. And if not present, then those calls use 4 math.h function calls +// or fail to build. So v0 is basic functionaliy, and it's up to v6 on iOS 18. +// +// x64: Compile this with -march x86_64h (haswell) or -mavx2 -mf16c -fma +// arm64: Compile this with Neon enabled (should have fma) +// +// Intel SSE core has 3 or 4 32B units internally. +// One core is shared by 2 Hyperthreads, but HT is being removed by Intel. +// e-cores can only run AVX2, and don't have HT support. +// AVX-512 can drop to 1 unit rounding down. +// Chip frequencies drop using AVX2 and AVX-512. +// Trying to run AVX2 (32B) can even run slow on older Intel/AMD +// with double-pumped 16B internal cores. +// New AMD runs double-pumped 32B instructions for AVX-512 on all cores. +// AMD has e-cores, but no instruction limits like Intel. +// +// Intel SSE scalar ops used to run 2:1 for 1:4 ops of work +// but now it's 1:1, so these only keep value in sse register. +// +// This passes vector by value, and matrix by const reference to the ops. +// May want to reconsider const ref for long uint/float/double vectors. +// This assumes x64 calling conventions for passing registers. +// There are differing function calling conventions for passing first 4 values. +// +// Neon 32x 32B 128-bt +// SVE2 ? 
+// +fp16 _Float16 support +// +// SSE 16x 16B 128-bit +// AVX/2 16x 32B 256-bit +// AVX-512 16x 64B 512-bit (disabled on p-cores and dropped from e-cores on i9), 4 variants +// AVX10 32x 32B 256-bit (emulates 512-bit), 3 variants +// +// FMA fp multiply add (clang v14) +// F16C 2 ops fp16 <-> fp32 +// CRC32 instructions to enable fast crc ops (not using yet, but is in sse2neon.h) +// +// max vec size per register +// 16B 32B +// char16 char16? +// short8 short16 +// uint4 uint8 +// float4 float8 +// double2 double4 +// +// Metal Shading Language (MSL) +// supports up to half4 +// supports up to float4 +// no support for double. cpu only +// +// HLSL and DX12/Vulkan support double on desktop, but not mobile? +// and likely not on arm64 gpus. +// +//------------ +// x64 -> arm64 emulators +// Prism supports SSE4.2, no fma, no f16c +// Rosetta supports SSE4.2, no fma, no f16c +// Rosetta supports AVX2 (macOS 15.0) +// +//------------ +// Types for 32B max vector size (2 Neon reg, 1 AVX/AVX2 reg) +// char2,3,4,8,16,32 +// int2,3,4,8 +// +// half2,3,4,8,16 +// float2,3,4,8 +// double2,3,4 +// +//------------ + +// Intel chips +// 1 Nehalem, +// 2 Sandy Bridge, +// 3 Ivy Bridge, +// 4 Haswell, AVX2, FMA3 (not Pentiums/Celerons) +// 5 Broadwell, +// 6 Sky Lake, +// 7 Kaby Lake, +// 8 Coffee Lake, +// 9 Coffee Lake Refresh +// 10 Comet Lake, +// 11 Rocket Lake, +// 12 Alder Lake, +// 13 Raptor Lake +// APX? AVX10? +// +// AMD chips +// Jaguar AVX +// Piledriver AVX2? FMA3 +// Ryzen +// Zen +// +// Apple Silicon +// iPhone 5S has arm64 arm64-v? +// +//----------------- +// clang version matters to codegen. +// These two version seem to be significant changes in output. +// +// v14 fma +// v16 better fma? +// v18 Intel APX support (still no chip) +// +// -Og can't unroll small loops for some reason. -O2 and -O3 do. +// https://godbolt.org/z/KMPa8bchb +// +// As optimal as it gets +// https://godbolt.org/z/YxzobGM17 +// +// optimized quake rcp, rsqrt, sqrt +// https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +// +// transpose examples +// https://godbolt.org/z/TYcvrP7Y3 + +//----------------------------------- +// config + +// Can override the namespace to your own. This avoids collision with Apple simd. +#ifndef SIMD_NAMESPACE +#define SIMD_NAMESPACE simdk +#endif // SIMD_NAMESPACE + +// Can use std or eastl +#ifndef STL_NAMESPACE +#define STL_NAMESPACE std +#endif // SIMD_NAMESPACE + +// only support avx2 and Neon, no avx-512 at first +#if defined __ARM_NEON__ +#define SIMD_SSE 0 +#define SIMD_AVX2 0 +#define SIMD_NEON 1 +#elif defined __AVX2__ // x64 AVX2 or higher, can lower to AVX +#define SIMD_SSE 1 +#define SIMD_AVX2 1 +#define SIMD_NEON 0 +#elif defined __SSE4_1__ // SSE 4.1+ +#define SIMD_SSE 1 +#define SIMD_AVX2 0 +#define SIMD_NEON 0 +#else +#warning unuspported simd arch +#endif + +// a define to override setings from prefix file +#ifndef SIMD_CONFIG + +// fp comparisons gen a corresponding signed integer type +// apple is signed-char, so make sure to set on -fsigned-char on other platforms +#define SIMD_CHAR 1 +#define SIMD_SHORT 1 +#define SIMD_INT 1 +#define SIMD_LONG 1 + +// don't need these yet, but easy to add with macros +//#define SIMD_UCHAR 0 +//#define SIMD_USHORT 0 +//#define SIMD_UINT 0 +//#define SIMD_ULONG 0 + +// Vector and matrix types. Currently only matrix types for SIMD_FLOAT, SIMD_DOUBLE. +// SIMD_INT must be kept on for conditional tests. +// SIMD_HALF for bitselect would need SIMD_SHORT or SIMD_INT? 
+// #define SIMD_HALF (1 && SIMD_SHORT) +#define SIMD_HALF (1) +#define SIMD_FLOAT (1 && SIMD_INT) +#define SIMD_DOUBLE (1 && SIMD_LONG) + +// Whether to support > 4 length vecs with some ops +#define SIMD_FLOAT_EXT 0 + +// controls over acclerate vs. func calls +#ifdef __APPLE__ +#define SIMD_ACCELERATE_MATH 1 +#define SIMD_CMATH_MATH 0 +#else +#define SIMD_ACCELERATE_MATH 0 +#define SIMD_CMATH_MATH 1 +#endif + +// This means simd_float4 will come from this file instead of simd.h +#define SIMD_ACCELERATE_MATH_NAMES 0 + +#endif // SIMD_CONFIG + +//----------------------------------- + +// simplify calls +// const means it doesn't pull from global changing state (what about constants) +// and inline is needed or get unused static calls, always_inline forces inline +// of these mostly wrapper calls. +#define SIMD_CALL static inline __attribute__((__always_inline__, __const__, __nodebug__)) + +// op *=, +=, -=, /= mods the calling object, so can't be const +#define SIMD_CALL_OP static inline __attribute__((__always_inline__, __nodebug__)) + +//------------ + +// aligned +#define macroVector1TypesAligned(type, name) \ + typedef type name##1a; \ + typedef __attribute__((__ext_vector_type__(2))) type name##2a; \ + typedef __attribute__((__ext_vector_type__(3))) type name##3a; \ + typedef __attribute__((__ext_vector_type__(4))) type name##4a; \ + typedef __attribute__((__ext_vector_type__(8))) type name##8a; \ + typedef __attribute__((__ext_vector_type__(16))) type name##16a; \ + typedef __attribute__((__ext_vector_type__(32), __aligned__(16))) type name##32a; + +// packed +#define macroVector1TypesPacked(type, name) \ + typedef type name##1p; \ + typedef __attribute__((__ext_vector_type__(2), __aligned__(1))) type name##2p; \ + typedef __attribute__((__ext_vector_type__(3), __aligned__(1))) type name##3p; \ + typedef __attribute__((__ext_vector_type__(4), __aligned__(1))) type name##4p; \ + typedef __attribute__((__ext_vector_type__(8), __aligned__(1))) type name##8p; \ + typedef __attribute__((__ext_vector_type__(16), __aligned__(1))) type name##16p; \ + typedef __attribute__((__ext_vector_type__(32), __aligned__(1))) type name##32p; + +// cpp rename for u/char +#define macroVector1TypesStorageRenames(cname, cppname) \ + typedef ::cname##1a cppname##1; \ + typedef ::cname##2a cppname##2; \ + typedef ::cname##3a cppname##3; \ + typedef ::cname##4a cppname##4; \ + typedef ::cname##8a cppname##8; \ + typedef ::cname##16a cppname##16; \ + typedef ::cname##32a cppname##32; + +//------------ + +// aligned +#define macroVector2TypesAligned(type, name) \ + typedef type name##1a; \ + typedef __attribute__((__ext_vector_type__(2))) type name##2a; \ + typedef __attribute__((__ext_vector_type__(3))) type name##3a; \ + typedef __attribute__((__ext_vector_type__(4))) type name##4a; \ + typedef __attribute__((__ext_vector_type__(8))) type name##8a; \ + typedef __attribute__((__ext_vector_type__(16), __aligned__(16))) type name##16a; + +// packed +#define macroVector2TypesPacked(type, name) \ + typedef type name##1p; \ + typedef __attribute__((__ext_vector_type__(2), __aligned__(2))) type name##2p; \ + typedef __attribute__((__ext_vector_type__(3), __aligned__(2))) type name##3p; \ + typedef __attribute__((__ext_vector_type__(4), __aligned__(2))) type name##4p; \ + typedef __attribute__((__ext_vector_type__(8), __aligned__(2))) type name##8p; \ + typedef __attribute__((__ext_vector_type__(16), __aligned__(2))) type name##16p; + +// cpp rename for half, u/short +#define macroVector2TypesStorageRenames(cname, 
cppname) \ + typedef ::cname##1a cppname##1; \ + typedef ::cname##2a cppname##2; \ + typedef ::cname##3a cppname##3; \ + typedef ::cname##4a cppname##4; \ + typedef ::cname##8a cppname##8; \ + typedef ::cname##16a cppname##16; + +//------------ + +// aligned +#define macroVector4TypesAligned(type, name) \ + typedef type name##1a; \ + typedef __attribute__((__ext_vector_type__(2))) type name##2a; \ + typedef __attribute__((__ext_vector_type__(3))) type name##3a; \ + typedef __attribute__((__ext_vector_type__(4))) type name##4a; \ + typedef __attribute__((__ext_vector_type__(8), __aligned__(16))) type name##8a; + +// typedef __attribute__((__ext_vector_type__(16),__aligned__(16))) type name##16s; + +// packed +#define macroVector4TypesPacked(type, name) \ + typedef type name##1p; \ + typedef __attribute__((__ext_vector_type__(2), __aligned__(4))) type name##2p; \ + typedef __attribute__((__ext_vector_type__(3), __aligned__(4))) type name##3p; \ + typedef __attribute__((__ext_vector_type__(4), __aligned__(4))) type name##4p; \ + typedef __attribute__((__ext_vector_type__(8), __aligned__(4))) type name##8p; + +// typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) type name##16p; \ + +// cpp rename for float, u/int +#define macroVector4TypesStorageRenames(cname, cppname) \ + typedef ::cname##1a cppname##1; \ + typedef ::cname##2a cppname##2; \ + typedef ::cname##3a cppname##3; \ + typedef ::cname##4a cppname##4; \ + typedef ::cname##8a cppname##8; + +// typedef ::cname##16s cppname##16; \ + +//------------ + +// aligned +#define macroVector8TypesAligned(type, name) \ + typedef type name##1a; \ + typedef __attribute__((__ext_vector_type__(2))) type name##2a; \ + typedef __attribute__((__ext_vector_type__(3), __aligned__(16))) type name##3a; \ + typedef __attribute__((__ext_vector_type__(4), __aligned__(16))) type name##4a; + +// typedef __attribute__((__ext_vector_type__(8),__aligned__(16))) type name##8s; + +// packed +#define macroVector8TypesPacked(type, name) \ + typedef type name##1p; \ + typedef __attribute__((__ext_vector_type__(2), __aligned__(8))) type name##2p; \ + typedef __attribute__((__ext_vector_type__(3), __aligned__(8))) type name##3p; \ + typedef __attribute__((__ext_vector_type__(4), __aligned__(8))) type name##4p; + +//typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) type name##8p; + +// cpp rename for double, u/long +#define macroVector8TypesStorageRenames(cname, cppname) \ + typedef ::cname##1a cppname##1; \ + typedef ::cname##2a cppname##2; \ + typedef ::cname##3a cppname##3; \ + typedef ::cname##4a cppname##4; + +// typedef ::cname##8s cppname##8; + +//----------------------------------- + +#define macroMatrixOps(type) \ + SIMD_CALL_OP type& operator*=(type& x, const type& y) \ + { \ + x = mul(x, y); \ + return x; \ + } \ + SIMD_CALL_OP type& operator+=(type& x, const type& y) \ + { \ + x = add(x, y); \ + return x; \ + } \ + SIMD_CALL_OP type& operator-=(type& x, const type& y) \ + { \ + x = sub(x, y); \ + return x; \ + } \ + SIMD_CALL bool operator==(const type& x, const type& y) { return equal(x, y); } \ + SIMD_CALL bool operator!=(const type& x, const type& y) { return !(x == y); } \ + \ + SIMD_CALL type operator-(const type& x, const type& y) { return sub(x, y); } \ + SIMD_CALL type operator+(const type& x, const type& y) { return add(x, y); } \ + SIMD_CALL type operator*(const type& x, const type& y) { return mul(x, y); } \ + SIMD_CALL type::column_t operator*(const type::column_t& v, const type& y) { return mul(v, y); } \ + SIMD_CALL 
type::column_t operator*(const type& x, const type::column_t& v) { return mul(x, v); } + +//----------------------------------- + +// TODO: type1 simdk::log(float1) +// type##1 cppfunc(type##1 x); + +// define functions that don't map to typical simd ops +#define macroVectorRepeatFnDecl(type, cppfunc) \ + type##2 cppfunc(type##2 x); \ + type##3 cppfunc(type##3 x); \ + type##4 cppfunc(type##4 x); + +#define macroVectorRepeatFn2Decl(type, cppfunc) \ + type##2 cppfunc(type##2 x, type##2 y); \ + type##3 cppfunc(type##3 x, type##3 y); \ + type##4 cppfunc(type##4 x, type##4 y); + +//------------ + +#if SIMD_ACCELERATE_MATH + +// remap simdk to simd namespace +#define macroVectorRepeatFnImpl(type, cppfunc) \ + type##2 cppfunc(type##2 a) { return simd::cppfunc(a); } \ + type##3 cppfunc(type##3 a) { return simd::cppfunc(a); } \ + type##4 cppfunc(type##4 a) { return simd::cppfunc(a); } + +#define macroVectorRepeatFn2Impl(type, cppfunc) \ + type##2 cppfunc(type##2 a, type##2 b) { return simd::cppfunc(a, b); } \ + type##3 cppfunc(type##3 a, type##3 b) { return simd::cppfunc(a, b); } \ + type##4 cppfunc(type##4 a, type##4 b) { return simd::cppfunc(a, b); } + +#endif // SIMD_ACCELERATE_MATH + +// These are large functions that can be buried and optimized in the .cpp +// Has alternate cmath form it uses for now. Look into ISPC calls to +// replace some of this. + +//----------------------------------- + +#if SIMD_CMATH_MATH + +// TODO: add this back +// type##1 cppfunc(type##1 x) { return func(x); } \ + +// This calls function repeatedly, then returns as vector. +// These don't call to the 4 version since it's so much more work. +#define macroVectorRepeatFnImpl(type, cppfunc, func) \ + type##2 cppfunc(type##2 a) { return {func(a.x), func(a.y)}; } \ + type##3 cppfunc(type##3 a) { return {func(a.x), func(a.y), func(a.z)}; } \ + type##4 cppfunc(type##4 a) { return {func(a.x), func(a.y), func(a.z), func(a.w)}; } + +#define macroVectorRepeatFn2Impl(type, cppfunc, func) \ + type##2 cppfunc(type##2 a, type##2 b) { return {func(a.x, b.x), func(a.y, b.y)}; } \ + type##3 cppfunc(type##3 a, type##3 b) { return {func(a.x, b.x), func(a.y, b.y), func(a.z, b.z)}; } \ + type##4 cppfunc(type##4 a, type##4 b) { return {func(a.x, b.x), func(a.y, b.y), func(a.z, b.z), func(a.w, z.w)}; } + +#endif // SIMD_CMATH_MATH + +//----------------------------------- + +// clang-format off + +#include // for u/long + +#include // for formatter (only works using std::string, not eastl) + +#include // for sqrt, sqrtf +#if SIMD_FLOAT +#include // for FLT_MAX +#endif + +#if SIMD_NEON +// neon types and intrinsics, 16B +#include + +// This converts sse to neon intrinsics. +// Only for transpose_affine (_mm_shuffle_ps) +// Good reference for SSE <-> Neon mapping . +#include "sse2neon-arm64.h" + +#else +// SSE intrinsics up to AVX-512, but depends on -march avx2 -mf16c -fma +#include +#endif // SIMD_NEON + +// using macros here cuts the ifdefs a lot, leaked out of this into .cpp files +#define vec2to4(x) (x).xyyy +#define vec3to4(x) (x).xyzz +#define vec4to2(x) (x).xy +#define vec4to3(x) (x).xyz + +// moved vec/matrix ops into secondary headers +#include "int234.h" +#include "long234.h" + +#include "half234.h" +#include "float234.h" +#include "double234.h" + +// This may not belong in here. But just want to use the lib to build +// some helpers. 
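The typedef macros above and the per-type headers they feed all build on clang's ext_vector_type extension described at the top of this file. The standalone sketch below shows the behavior they rely on; the f4 name is hypothetical and clang is assumed.

// Element-wise math, swizzles, and lane shuffles come from the compiler,
// and the type maps onto __m128 / float32x4_t automatically.
typedef __attribute__((__ext_vector_type__(4))) float f4;

f4 demo(f4 a, f4 b)
{
    f4 c = a + b;                                      // per-lane add
    c.xy = b.zw;                                       // built-in swizzle access
    return __builtin_shufflevector(c, a, 0, 1, 4, 5);  // lanes from c and a
}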
+#include "bounds234.h" + +//--------------------------- + +#if SIMD_CHAR + +#ifdef __cplusplus +extern "C" { +#endif + +// define c vector types +macroVector1TypesAligned(char, char) +macroVector1TypesPacked(char, char) + +#if SIMD_ACCELERATE_MATH_NAMES +macroVector1TypesStorageRenames(char, simd_char) +#endif // SIMD_ACCELERATE_MATH_NAMES + +#ifdef __cplusplus +} + +namespace SIMD_NAMESPACE { +macroVector1TypesStorageRenames(char, char) +} +#endif // __cplusplus +#endif // SIMD_CHAR + +//------------ +#if SIMD_SHORT + +#ifdef __cplusplus +extern "C" { +#endif + +// define c vector types +macroVector2TypesAligned(short, short) +macroVector2TypesPacked(short, short) + +#if SIMD_ACCELERATE_MATH_NAMES +macroVector2TypesStorageRenames(short, simd_short) +#endif // SIMD_ACCELERATE_MATH_NAMES + +#ifdef __cplusplus +} + +namespace SIMD_NAMESPACE { +macroVector2TypesStorageRenames(short, short) +} +#endif // __cplusplus +#endif // SIMD_SHORT + +// clang-format on + +//------------------- +#ifdef __cplusplus + +namespace SIMD_NAMESPACE { + +// conversions +// keeping these here due to ordering issues of header includes + +#if SIMD_FLOAT + +#if SIMD_INT // && SIMD_FLOAT +SIMD_CALL float2 float2m(int2 x) { return __builtin_convertvector(x, float2); } +SIMD_CALL float3 float3m(int3 x) { return __builtin_convertvector(x, float3); } +SIMD_CALL float4 float4m(int4 x) { return __builtin_convertvector(x, float4); } + +SIMD_CALL int2 float2m(float2 x) { return __builtin_convertvector(x, int2); } +SIMD_CALL int3 float3m(float3 x) { return __builtin_convertvector(x, int3); } +SIMD_CALL int4 float4m(float4 x) { return __builtin_convertvector(x, int4); } + +#endif // SIMD_INT + +#if SIMD_HALF // && SIMD_FLOAT + +#if SIMD_HALF4_ONLY + +// half type is short, so builtin convert doesn't work +half4 half4m(float4 x); +SIMD_CALL half2 half2m(float2 x) { return vec4to2(half4m(vec2to4(x))); } +SIMD_CALL half3 half3m(float3 x) { return vec4to3(half4m(vec3to4(x))); } + +float4 float4m(half4 x); +SIMD_CALL float2 float2m(half2 x) { return vec4to2(float4m(vec2to4(x))); } +SIMD_CALL float3 float3m(half3 x) { return vec4to3(float4m(vec3to4(x))); } + +#else + +SIMD_CALL float2 float2m(half2 x) { return __builtin_convertvector(x, float2); } +SIMD_CALL float3 float3m(half3 x) { return __builtin_convertvector(x, float3); } +SIMD_CALL float4 float4m(half4 x) { return __builtin_convertvector(x, float4); } + +SIMD_CALL half2 half2m(float2 x) { return __builtin_convertvector(x, half2); } +SIMD_CALL half3 half3m(float3 x) { return __builtin_convertvector(x, half3); } +SIMD_CALL half4 half4m(float4 x) { return __builtin_convertvector(x, half4); } +#endif + +#endif // SIMD_HALF + +#if SIMD_DOUBLE // && SIMD_FLOAT +SIMD_CALL double2 double2m(float2 x) { return __builtin_convertvector(x, double2); } +SIMD_CALL double3 double3m(float3 x) { return __builtin_convertvector(x, double3); } +SIMD_CALL double4 double4m(float4 x) { return __builtin_convertvector(x, double4); } + +SIMD_CALL float2 float2m(double2 x) { return __builtin_convertvector(x, float2); } +SIMD_CALL float3 float3m(double3 x) { return __builtin_convertvector(x, float3); } +SIMD_CALL float4 float4m(double4 x) { return __builtin_convertvector(x, float4); } + +SIMD_CALL float2x2 float2x2m(const double2x2& x) { return float2x2(float2m(x[0]), float2m(x[1])); } +SIMD_CALL float3x3 float3x3m(const double3x3& x) { return float3x3(float3m(x[0]), float3m(x[1]), float3m(x[2])); } +SIMD_CALL float3x4 float3x4m(const double3x4& x) { return float3x4(float4m(x[0]), float4m(x[1]), 
float4m(x[2])); } +SIMD_CALL float4x4 float4x4m(const double4x4& x) { return float4x4(float4m(x[0]), float4m(x[1]), float4m(x[2]), float4m(x[3])); } + +#endif // SIMD_DOUBLE + +#endif // SIMD_FLOAT + +#if SIMD_DOUBLE + +#if SIMD_LONG +SIMD_CALL double2 double2m(long2 x) { return __builtin_convertvector(x, double2); } +SIMD_CALL double3 double3m(long3 x) { return __builtin_convertvector(x, double3); } +SIMD_CALL double4 double4m(long4 x) { return __builtin_convertvector(x, double4); } + +SIMD_CALL long2 long2m(double2 x) { return __builtin_convertvector(x, long2); } +SIMD_CALL long3 long3m(double3 x) { return __builtin_convertvector(x, long3); } +SIMD_CALL long4 long4m(double4 x) { return __builtin_convertvector(x, long4); } +#endif // SIMD_LONG +#endif // SIMD_DOUBLE + +//--------------------------- +// formatting + +using namespace STL_NAMESPACE; + +// Usage: +// vecf vfmt(fmtToken); +// fprintf(stdout, "%s", vfmt.str(v1).c_str() ); +// This may seem extreme to pass string, but it has SSO and keeps temp alive to printf. +struct vecf { + // TODO: add formatting options too + // no packed float support + vecf() {} + +#if SIMD_FLOAT + // vector + string str(float2 v) const; + string str(float3 v) const; + string str(float4 v) const; + + // matrix + string str(const float2x2& m) const; + string str(const float3x3& m) const; + string str(const float4x4& m) const; + + // quat + string quat(quatf q) const { return str(q.v); } +#endif // SIMD_FLOAT + +#if SIMD_DOUBLE + // vector + string str(double2 v) const; + string str(double3 v) const; + string str(double4 v) const; + + // matrix + string str(const double2x2& m) const; + string str(const double3x3& m) const; + string str(const double4x4& m) const; +#endif // SIMD_DOUBLE + +#if SIMD_HALF + // vector + string str(half2 v) const; + string str(half3 v) const; + string str(half4 v) const; +#endif + +#if SIMD_LONG + // vector + string str(long2 v) const; + string str(long3 v) const; + string str(long4 v) const; +#endif + +#if SIMD_INT + // vector + string str(int2 v) const; + string str(int3 v) const; + string str(int4 v) const; +#endif + +#if SIMD_SHORT + // vector + //string str(short2 v) const; + //string str(short3 v) const; + //string str(short4 v) const; + //string str(short8 v) const; + //string str(short16 v) const; +#endif + +#if SIMD_CHAR + // vector + // TODO: up to 32 + //string str(char2 v) const; + //string str(char3 v) const; + //string str(char4 v) const; + //string str(char8 v) const; + //string str(char16 v) const; + //string str(char32 v) const; +#endif + + // Just stuffing this here for now + string simd_configs() const; + string simd_alignments() const; +}; + +} // namespace SIMD_NAMESPACE + +#endif // __cplusplus + +//------------------------------- +// Compression utils. +// Fast delta encode/decode for indices. 
+#if SIMD_INT +void deltaEncodeU32(uint32_t * buffer, size_t length, uint32_t starting_point = 0); +void deltaDecodeU32(uint32_t * buffer, size_t length, uint32_t starting_point = 0); +#endif + +#if SIMD_SHORT +void deltaEncodeU16(uint16_t * buffer, size_t length, uint16_t starting_point = 0); +void deltaDecodeU16(uint16_t * buffer, size_t length, uint16_t starting_point = 0); +#endif + +#if SIMD_CHAR +void deltaEncodeU8(uint8_t * buffer, size_t length, uint8_t starting_point = 0); +void deltaDecodeU8(uint8_t * buffer, size_t length, uint8_t starting_point = 0); +#endif + +#endif diff --git a/libkram/zstd/zstd.cpp b/libkram/zstd/zstd.cpp index fdd1ae8d..a1f2e042 100644 --- a/libkram/zstd/zstd.cpp +++ b/libkram/zstd/zstd.cpp @@ -30,6 +30,14 @@ * * Note: multithreading is enabled for all platforms apart from Emscripten. */ + +/*- Compiler specifics -*/ +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wshorten-64-to-32" +#pragma clang diagnostic ignored "-Wunused-function" +#endif + #define DEBUGLEVEL 0 #define MEM_MODULE #undef XXH_NAMESPACE @@ -44,6 +52,12 @@ #endif #define ZSTD_TRACE 0 +#ifdef NDEBUG +#define assert_or_fallthrough() [[fallthrough]] +#else +#define assert_or_fallthrough() assert(false) +#endif + /* Include zstd_deps.h first with all the options we need enabled. */ #define ZSTD_DEPS_NEED_MALLOC #define ZSTD_DEPS_NEED_MATH64 @@ -652,9 +666,9 @@ void __asan_unpoison_memory_region(void const volatile *addr, size_t size); *****************************************************************/ #if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # if defined(_AIX) -# include +//# include # else -# include /* intptr_t */ +//# include /* intptr_t */ # endif typedef uint8_t BYTE; typedef uint16_t U16; @@ -1800,22 +1814,22 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si switch(srcSize) { case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; - /* fall-through */ + [[fallthrough]]; /* fall-through */ default: break; } @@ -3659,7 +3673,7 @@ int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr); #elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */ /* === POSIX Systems === */ -# include +//# include #if DEBUGLEVEL < 1 @@ -3901,8 +3915,8 @@ extern "C" { #define ZSTD_H_235446 /* ====== Dependency ======*/ -#include /* INT_MAX */ -#include /* size_t */ +//#include /* INT_MAX */ +//#include /* size_t */ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ @@ -10218,13 +10232,13 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, { case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); 
HUF_FLUSHBITS_2(&bitC); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); HUF_FLUSHBITS_1(&bitC); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); HUF_FLUSHBITS(&bitC); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 0 : /* fall-through */ default: break; } @@ -11918,8 +11932,8 @@ MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParam case ZSTD_lcm_uncompressed: return 1; default: - assert(0 /* impossible: pre-validated */); - /* fall-through */ + //assert(0 /* impossible: pre-validated */); + //[[fallthrough]]; /* fall-through */ case ZSTD_lcm_auto: return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); } @@ -18551,7 +18565,6 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, return (size_t)(op-ostart); } - static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) { BYTE* const op = (BYTE*)dst; @@ -18579,7 +18592,7 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, if (!singleSegment) op[pos++] = windowLogByte; switch(dictIDSizeCode) { - default: assert(0); /* impossible */ + default: assert_or_fallthrough(); /* impossible */ case 0 : break; case 1 : op[pos] = (BYTE)(dictID); pos++; break; case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; @@ -18587,7 +18600,7 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, } switch(fcsCode) { - default: assert(0); /* impossible */ + default: assert_or_fallthrough(); /* impossible */ case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; @@ -19991,7 +20004,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, zcs->outBuffFlushedSize = 0; zcs->streamStage = zcss_flush; /* pass-through to flush stage */ } - /* fall-through */ + [[fallthrough]]; /* fall-through */ case zcss_flush: DEBUGLOG(5, "flush stage"); assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered); @@ -23099,7 +23112,7 @@ ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalB return mask; switch (totalBits) { default: - assert(0); + assert_or_fallthrough(); case 16: return (mask >> rotation) | (U16)(mask << (16 - rotation)); case 32: @@ -31936,7 +31949,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(dictIDSizeCode) { - default: assert(0); /* impossible */ + default: assert_or_fallthrough(); /* impossible */ case 0 : break; case 1 : dictID = ip[pos]; pos++; break; case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -31944,7 +31957,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(fcsID) { - default: assert(0); /* impossible */ + default: assert_or_fallthrough(); /* impossible */ case 0 : if (singleSegment) frameContentSize = ip[pos]; break; case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; case 2 : frameContentSize = MEM_readLE32(ip+pos); break; @@ -32478,8 +32491,8 @@ static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) { switch (dctx->dictUses) { default: - assert(0 /* Impossible */); - /* fall-through */ + //assert(0 /* Impossible */); + //[[fallthrough]]; /* fall-through */ case ZSTD_dont_use: ZSTD_clearDict(dctx); return NULL; @@ -32542,7 +32555,7 @@ 
ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { switch(dctx->stage) { default: /* should not happen */ - assert(0); + assert_or_fallthrough(); case ZSTDds_getFrameHeaderSize: case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; @@ -33370,7 +33383,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->legacyVersion = 0; zds->hostageByte = 0; zds->expectedOutBuffer = *output; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case zdss_loadHeader : DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); @@ -33508,7 +33521,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->outBuffSize = neededOutBuffSize; } } } zds->streamStage = zdss_read; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case zdss_read: DEBUGLOG(5, "stage zdss_read"); @@ -33527,7 +33540,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } if (ip==iend) { someMoreWork = 0; break; } /* no more input */ zds->streamStage = zdss_load; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case zdss_load: { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); @@ -33729,7 +33742,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, case set_repeat: DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case set_compressed: RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); @@ -35852,11 +35865,12 @@ void COVER_dictSelectionFree(COVER_dictSelection_t selection); static int g_displayLevel = 2; #endif #undef DISPLAY -#define DISPLAY(...) \ - { \ - fprintf(stderr, __VA_ARGS__); \ - fflush(stderr); \ - } +#define DISPLAY(fmt, ...) KLOGI("zstd", fmt, ##__VA_ARGS__) +//#define DISPLAY(...) \ +// { \ +// fprintf(stderr, __VA_ARGS__); \ +// fflush(stderr); \ +// } #undef LOCALDISPLAYLEVEL #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ if (displayLevel >= l) { \ @@ -37075,9 +37089,11 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( */ /*- Compiler specifics -*/ -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wshorten-64-to-32" -#endif +//#ifdef __clang__ +//#pragma clang diagnostic push +//#pragma clang diagnostic ignored "-Wshorten-64-to-32" +//#pragma clang diagnostic ignored "-Wunused-function" +//#endif #if defined(_MSC_VER) # pragma warning(disable : 4244) @@ -39080,11 +39096,12 @@ divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * static int g_displayLevel = 2; #endif #undef DISPLAY -#define DISPLAY(...) \ - { \ - fprintf(stderr, __VA_ARGS__); \ - fflush(stderr); \ - } +#define DISPLAY(fmt, ...) KLOGI("zstd", fmt, ##__VA_ARGS__) +//#define DISPLAY(...) \ +// { \ +// fprintf(stderr, __VA_ARGS__); \ +// fflush(stderr); \ +// } #undef LOCALDISPLAYLEVEL #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ if (displayLevel >= l) { \ @@ -39868,7 +39885,8 @@ static const U32 g_selectivity_default = 9; * Console display ***************************************/ #undef DISPLAY -#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } +#define DISPLAY(fmt, ...) KLOGI("zstd", fmt, ##__VA_ARGS__) +// #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } #undef DISPLAYLEVEL #define DISPLAYLEVEL(l, ...) 
if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ @@ -40333,7 +40351,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize, # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \ if (ZDICT_clockSpan(displayClock) > refreshRate) \ { displayClock = clock(); DISPLAY(__VA_ARGS__); \ - if (notificationLevel>=4) fflush(stderr); } } + } } /* init */ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ @@ -40927,3 +40945,8 @@ size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize params); } /**** ended inlining dictBuilder/zdict.c ****/ + +/*- Compiler specifics -*/ +#ifdef __clang__ +#pragma clang diagnostic pop +#endif diff --git a/libkram/zstd/zstd.h b/libkram/zstd/zstd.h index 4651e6c4..e011166c 100644 --- a/libkram/zstd/zstd.h +++ b/libkram/zstd/zstd.h @@ -15,8 +15,8 @@ extern "C" { #define ZSTD_H_235446 /* ====== Dependency ======*/ -#include /* INT_MAX */ -#include /* size_t */ +//#include /* INT_MAX */ +//#include /* size_t */ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ diff --git a/libkram/zstd/zstddeclib.cpp b/libkram/zstd/zstddeclib.cpp index c4f292fb..04dee931 100644 --- a/libkram/zstd/zstddeclib.cpp +++ b/libkram/zstd/zstddeclib.cpp @@ -40,6 +40,12 @@ #define ZSTD_STRIP_ERROR_STRINGS #define ZSTD_TRACE 0 +#ifdef NDEBUG +#define assert_or_fallthrough() [[fallthrough]] +#else +#define assert_or_fallthrough() assert(false) +#endif + /* Include zstd_deps.h first with all the options we need enabled. */ #define ZSTD_DEPS_NEED_MALLOC /**** start inlining common/zstd_deps.h ****/ @@ -322,14 +328,15 @@ int g_debuglevel = DEBUGLEVEL; #ifndef MEM_H_MODULE #define MEM_H_MODULE -#if defined (__cplusplus) -extern "C" { -#endif - /*-**************************************** * Dependencies ******************************************/ #include /* size_t, ptrdiff_t */ + +#if defined (__cplusplus) +extern "C" { +#endif + /**** start inlining compiler.h ****/ /* * Copyright (c) Yann Collet, Facebook, Inc. 
@@ -647,9 +654,9 @@ void __asan_unpoison_memory_region(void const volatile *addr, size_t size); *****************************************************************/ #if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # if defined(_AIX) -# include +//# include # else -# include /* intptr_t */ +//# include /* intptr_t */ # endif typedef uint8_t BYTE; typedef uint16_t U16; @@ -1040,9 +1047,9 @@ MEM_STATIC void MEM_check(void) { DEBUG_STATIC_ASSERT((sizeof(size_t)==4) || (si #ifndef ERROR_H_MODULE #define ERROR_H_MODULE -#if defined (__cplusplus) -extern "C" { -#endif +//#if defined (__cplusplus) +//extern "C" { +//#endif /* **************************************** @@ -1062,13 +1069,12 @@ extern "C" { #ifndef ZSTD_ERRORS_H_398273423 #define ZSTD_ERRORS_H_398273423 -#if defined (__cplusplus) -extern "C" { -#endif - /*===== dependency =====*/ #include /* size_t */ +#if defined (__cplusplus) +extern "C" { +#endif /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ #ifndef ZSTDERRORLIB_VISIBILITY @@ -1148,6 +1154,10 @@ ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Sa /**** skipping file: zstd_deps.h ****/ +#if defined (__cplusplus) +extern "C" { +#endif + /* **************************************** * Compiler-specific ******************************************/ @@ -1795,22 +1805,22 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si switch(srcSize) { case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); - /* fall-through */ + [[fallthrough]];/* fall-through */ case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); - /* fall-through */ + [[fallthrough]];/* fall-through */ case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; - /* fall-through */ + [[fallthrough]];/* fall-through */ case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; - /* fall-through */ + [[fallthrough]];/* fall-through */ case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; - /* fall-through */ + [[fallthrough]];/* fall-through */ default: break; } @@ -3612,6 +3622,11 @@ size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cS * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. 
*/ + +/* ====== Dependency ======*/ +#include /* INT_MAX */ +#include /* size_t */ + #if defined (__cplusplus) extern "C" { #endif @@ -3619,9 +3634,6 @@ extern "C" { #ifndef ZSTD_H_235446 #define ZSTD_H_235446 -/* ====== Dependency ======*/ -#include /* INT_MAX */ -#include /* size_t */ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ @@ -7270,11 +7282,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src #ifndef ZSTD_TRACE_H #define ZSTD_TRACE_H +#include + #if defined (__cplusplus) extern "C" { #endif -#include /* weak symbol support */ #if !defined(ZSTD_HAVE_WEAK_SYMBOLS) && defined(__GNUC__) && \ @@ -9820,15 +9833,14 @@ extern "C" { #ifndef ZSTD_V01_H_28739879432 #define ZSTD_V01_H_28739879432 -#if defined (__cplusplus) -extern "C" { -#endif - /* ************************************* * Includes ***************************************/ #include /* size_t */ +#if defined (__cplusplus) +extern "C" { +#endif /* ************************************* * Simple one-step function @@ -9918,15 +9930,16 @@ size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSi #ifndef ZSTD_V02_H_4174539423 #define ZSTD_V02_H_4174539423 -#if defined (__cplusplus) -extern "C" { -#endif - /* ************************************* * Includes ***************************************/ #include /* size_t */ +#if defined (__cplusplus) +extern "C" { +#endif + + /* ************************************* * Simple one-step function @@ -10015,15 +10028,14 @@ size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSi #ifndef ZSTD_V03_H_298734209782 #define ZSTD_V03_H_298734209782 -#if defined (__cplusplus) -extern "C" { -#endif - /* ************************************* * Includes ***************************************/ #include /* size_t */ +#if defined (__cplusplus) +extern "C" { +#endif /* ************************************* * Simple one-step function @@ -10112,15 +10124,14 @@ size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSi #ifndef ZSTD_V04_H_91868324769238 #define ZSTD_V04_H_91868324769238 -#if defined (__cplusplus) -extern "C" { -#endif - /* ************************************* * Includes ***************************************/ #include /* size_t */ +#if defined (__cplusplus) +extern "C" { +#endif /* ************************************* * Simple one-step function @@ -10258,14 +10269,15 @@ size_t ZBUFFv04_recommendedDOutSize(void); #ifndef ZSTDv05_H #define ZSTDv05_H -#if defined (__cplusplus) -extern "C" { -#endif - /*-************************************* * Dependencies ***************************************/ #include /* size_t */ + +#if defined (__cplusplus) +extern "C" { +#endif + /**** skipping file: ../common/mem.h ****/ @@ -10424,12 +10436,13 @@ size_t ZBUFFv05_recommendedDOutSize(void); #ifndef ZSTDv06_H #define ZSTDv06_H +/*====== Dependency ======*/ +#include /* size_t */ + #if defined (__cplusplus) extern "C" { #endif -/*====== Dependency ======*/ -#include /* size_t */ /*====== Export for Windows ======*/ @@ -10600,12 +10613,13 @@ ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void); #ifndef ZSTDv07_H_235446 #define ZSTDv07_H_235446 +/*====== Dependency ======*/ +#include /* size_t */ + #if defined (__cplusplus) extern "C" { #endif -/*====== Dependency ======*/ -#include /* size_t */ /*====== Export for Windows ======*/ @@ -11891,7 +11905,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(dictIDSizeCode) { - default: assert(0); /* 
impossible */ + default: assert_or_fallthrough(); /* impossible */ case 0 : break; case 1 : dictID = ip[pos]; pos++; break; case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -11899,7 +11913,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(fcsID) { - default: assert(0); /* impossible */ + default: assert_or_fallthrough(); /* impossible */ case 0 : if (singleSegment) frameContentSize = ip[pos]; break; case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; case 2 : frameContentSize = MEM_readLE32(ip+pos); break; @@ -12433,8 +12447,8 @@ static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) { switch (dctx->dictUses) { default: - assert(0 /* Impossible */); - /* fall-through */ + //assert(0 /* Impossible */); + //[[fallthrough]]; /* fall-through */ case ZSTD_dont_use: ZSTD_clearDict(dctx); return NULL; @@ -12497,7 +12511,7 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { switch(dctx->stage) { default: /* should not happen */ - assert(0); + assert_or_fallthrough(); case ZSTDds_getFrameHeaderSize: case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; @@ -13325,8 +13339,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->legacyVersion = 0; zds->hostageByte = 0; zds->expectedOutBuffer = *output; - /* fall-through */ - + [[fallthrough]]; /* fall-through */ + case zdss_loadHeader : DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) @@ -13463,7 +13477,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->outBuffSize = neededOutBuffSize; } } } zds->streamStage = zdss_read; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case zdss_read: DEBUGLOG(5, "stage zdss_read"); @@ -13482,7 +13496,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } if (ip==iend) { someMoreWork = 0; break; } /* no more input */ zds->streamStage = zdss_load; - /* fall-through */ + [[fallthrough]]; /* fall-through */ case zdss_load: { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); @@ -13684,7 +13698,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, case set_repeat: DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); - /* fall-through */ + [[fallthrough]]; /* fall-through */ case set_compressed: RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index c1ea92db..a56a4754 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -34,11 +34,11 @@ target_link_libraries(${myTargetApp} set_target_properties(${myTargetApp} PROPERTIES # Note: match this up with CXX version # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++14" + XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" - # avx1 - XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx" + # avx2 + XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx2" # turn off exceptions/rtti XCODE_ATTRIBUTE_GCC_ENABLE_CPP_EXCEPTIONS NO diff --git a/plugin/kps/KPS.cpp b/plugin/kps/KPS.cpp index cdf3186d..6e7439f5 100755 --- a/plugin/kps/KPS.cpp +++ b/plugin/kps/KPS.cpp @@ -84,7 +84,7 @@ extern MyMTLPixelFormat FormatToPixelFormat(DDS_Format fmt); // global needed by a bunch of Photoshop SDK routines SPBasicSuite 
*sSPBasic = NULL; -using namespace NAMESPACE_STL; +using namespace STL_NAMESPACE; const char* kBundleIdentifier = "com.ba.kram-ps"; diff --git a/scripts/cibuild.sh b/scripts/cibuild.sh index bd233584..7d9a264c 100755 --- a/scripts/cibuild.sh +++ b/scripts/cibuild.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # note: zsh works on osx, but not on Win git bash, so using bash @@ -66,40 +66,70 @@ if [[ $buildType == macos ]]; then # this dir already exists, so don't have to mkdir pushd build2 + xargs=-showBuildTimingSummary # build libraries # see here about destination arg # https://github.com/appcelerator/titanium_mobile/pull/13098 - echo "::group::kram-ios" - xcodebuild build -sdk iphoneos -workspace kram.xcworkspace -scheme kram-ios -configuration Release -destination generic/platform=iOS CONFIGURATION_BUILD_DIR=${binPath} BUILD_LIBRARY_FOR_DISTRIBUTION=YES + + echo "::group::list-targets" + xcodebuild build -workspace kram.xcworkspace -list + echo "::endgroup::" + + # note there is a method to make an xcframework, but seems that it has to be signed + # instead the vos/ios libs will have unique output dirs, but don't have to when used in a workspace + + # vectormath + echo "::group::vectormath-vos" + xcodebuild build -sdk xros -workspace kram.xcworkspace -scheme vectormath -configuration Release ${xargs} -destination generic/platform=visionOS CONFIGURATION_BUILD_DIR=${binPath}/vos BUILD_LIBRARY_FOR_DISTRIBUTION=YES + echo "::endgroup::" + + echo "::group::vectormath-ios" + xcodebuild build -sdk iphoneos -workspace kram.xcworkspace -scheme vectormath -configuration Release ${xargs} -destination generic/platform=iOS CONFIGURATION_BUILD_DIR=${binPath}/ios BUILD_LIBRARY_FOR_DISTRIBUTION=YES echo "::endgroup::" + echo "::group::vectormath" + xcodebuild build -sdk macosx -workspace kram.xcworkspace -scheme vectormath -configuration Release ${xargs} -destination generic/platform=macOS CONFIGURATION_BUILD_DIR=${binPath}/mac BUILD_LIBRARY_FOR_DISTRIBUTION=YES + echo "::endgroup::" + + # libkram + echo "::group::kram-vos" + xcodebuild build -sdk xros -workspace kram.xcworkspace -scheme kram -configuration Release ${xargs} -destination generic/platform=visionOS CONFIGURATION_BUILD_DIR=${binPath}/vos BUILD_LIBRARY_FOR_DISTRIBUTION=YES + echo "::endgroup::" + + echo "::group::kram-ios" + xcodebuild build -sdk iphoneos -workspace kram.xcworkspace -scheme kram -configuration Release ${xargs} -destination generic/platform=iOS CONFIGURATION_BUILD_DIR=${binPath}/ios BUILD_LIBRARY_FOR_DISTRIBUTION=YES + echo "::endgroup::" + echo "::group::kram" - xcodebuild build -sdk macosx -workspace kram.xcworkspace -scheme kram -configuration Release -destination generic/platform=macOS CONFIGURATION_BUILD_DIR=${binPath} BUILD_LIBRARY_FOR_DISTRIBUTION=YES + xcodebuild build -sdk macosx -workspace kram.xcworkspace -scheme kram -configuration Release ${xargs} -destination generic/platform=macOS CONFIGURATION_BUILD_DIR=${binPath}/mac BUILD_LIBRARY_FOR_DISTRIBUTION=YES echo "::endgroup::" # install apps so they are signed # can't specify empty INSTALL_PATH, or xcodebuild succeeds but copies nothing to bin + + # kramc cli echo "::group::kramc" - xcodebuild install -sdk macosx -workspace kram.xcworkspace -scheme kramc -configuration Release -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin + xcodebuild install -sdk macosx -workspace kram.xcworkspace -scheme kramc -configuration Release ${xargs} -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin echo "::endgroup::" + 
# kramv viewer echo "::group::kramv" - xcodebuild install -sdk macosx -workspace kram.xcworkspace -scheme kramv -configuration Release -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin + xcodebuild install -sdk macosx -workspace kram.xcworkspace -scheme kramv -configuration Release ${xargs} -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin echo "::endgroup::" popd - # build hlslparser to bin directory + # hlslparser pushd hlslparser echo "::group::hlsl-parser" - xcodebuild install -sdk macosx -project hlslparser.xcodeproj -configuration Release -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin + xcodebuild install -sdk macosx -project hlslparser.xcodeproj -scheme hlslparser -configuration Release ${xargs} -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin echo "::endgroup::" popd - # build kram-profile to bin directory + # kram-profile pushd kram-profile echo "::group::kram-profiler" - xcodebuild install -sdk macosx -project kram-profile.xcodeproj -configuration Release -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin + xcodebuild install -sdk macosx -project kram-profile.xcodeproj -scheme kram-profile -configuration Release ${xargs} -destination generic/platform=macOS DSTROOT=${binHolderPath} INSTALL_PATH=bin echo "::endgroup::" popd @@ -134,11 +164,15 @@ elif [[ $buildType == windows ]]; then elif [[ $buildType == linux ]]; then echo "::group::kram-linux" - mkdir -p build + mkdir -p build3 - pushd build + pushd build3 - cmake .. + # this will use make + # cmake .. + + # this uses Ninja, so can see failures + cmake .. -G Ninja # build the release build cmake --build . --config Release diff --git a/scripts/formatSources.sh b/scripts/formatSources.sh index 45547dd0..7c01f0cd 100755 --- a/scripts/formatSources.sh +++ b/scripts/formatSources.sh @@ -3,15 +3,42 @@ # use the app/clang_format to only process sources in app directory # eventually replace with git hook. This script only runs on Posix. -pushd ../libkram/kram -clang-format -style=file -i Kram*.cpp -clang-format -style=file -i Kram*.h -clang-format -style=file -i KTX*.cpp -clang-format -style=file -i KTX*.h +# pushd ../libkram/kram +# clang-format -style=file -i Kram*.cpp +# clang-format -style=file -i Kram*.h +# clang-format -style=file -i KTX*.cpp +# clang-format -style=file -i KTX*.h +# popd + +# pushd ../kramv +# clang-format -style=file -i Kram*.cpp +# clang-format -style=file -i Kram*.h +# clang-format -style=file -i Kram*.mm +# popd + + +# hope that the ignore file does so +#pushd ../libkram/kram +#clang-format -style=file -i *.* +#popd + +#pushd ../libkram +# this doesn't seem to honor the ignore file +# find ../libkram -iname '*.h' -o -iname '*.cpp' | xargs clang-format -i +#popd + +# no recursion for clang-format +pushd ../libkram +clang-format -style=file -i kram/*.(cpp|h) +clang-format -style=file -i vectormath/*.(cpp|h) popd -pushd ../kramv -clang-format -style=file -i Kram*.cpp -clang-format -style=file -i Kram*.h -clang-format -style=file -i Kram*.mm +pushd .. 
+clang-format -style=file -i kramv/*.(cpp|h||mm) +clang-format -style=file -i kramc/*.(cpp|h) +clang-format -style=file -i kram-thumb/*.(cpp|h|mm) +clang-format -style=file -i kram-thumb-win/*.(cpp|h) +clang-format -style=file -i kram-profile/Source/*.(cpp|h|mm) +clang-format -style=file -i kram-preview/*.(cpp|h|mm) +clang-format -style=file -i hlslparser/src/*.(cpp|h) popd \ No newline at end of file diff --git a/scripts/kramTests.sh b/scripts/kramTests.sh index b38d6e23..6d40b3ce 100755 --- a/scripts/kramTests.sh +++ b/scripts/kramTests.sh @@ -1,18 +1,17 @@ -#!/bin/zsh +#!/usr/bin/env zsh -args=$1 +# return all args from $1 onward +# can pass --force, -c ktx, -c dds, -c ktx2 +args=$@ -../scripts/kramTextures.py -p mac --bundle ${args} -#../scripts/kramTextures.py -p mac -c ktx --bundle ${args} +../scripts/kramTextures.py -p mac --bundle ${=args} -../scripts/kramTextures.py -p ios --bundle ${args} -#../scripts/kramTextures.py -p ios -c ktx --bundle ${args} +../scripts/kramTextures.py -p ios --bundle ${=args} # this takes 15s+ with ETC2comp -../scripts/kramTextures.py -p android --bundle ${args} -#../scripts/kramTextures.py -p -c ktx android --bundle ${args} +../scripts/kramTextures.py -p android --bundle ${=args} # this only has ktx2 form, tests uastc which kram doesn't open/save yet -#../scripts/kramTextures.py -p any --bundle ${args} +#../scripts/kramTextures.py -p any --bundle ${=args} diff --git a/scripts/pre-commit b/scripts/pre-commit index 594af076..3014d5d5 100755 --- a/scripts/pre-commit +++ b/scripts/pre-commit @@ -12,7 +12,7 @@ format_file() { fi } -for file in `git diff-index --cached --name-only HEAD | grep -iE '\Kram*.(cpp|h|m|mm)$' ` ; do +# find staged files only +for file in `git diff-index --cached --name-only HEAD | grep -iE '\.(c|cpp|h|hpp|)$' ` ; do format_file "${file}" done - diff --git a/scripts/simdk.py b/scripts/simdk.py new file mode 100755 index 00000000..e194b7ce --- /dev/null +++ b/scripts/simdk.py @@ -0,0 +1,153 @@ +# add this to your ~/.lldbinit with +# command script import ~/yourpath/kram/scripts/simdk.py + +import lldb + +# simd library + +# the vector ext allow various forms of addressing, but they require python eval to do so. +# but everything else fails. Only the EvaluateExpression returns the values. 
+
+def float2_summary(valobj, internal_dict):
+    frame = valobj.GetFrame()
+
+    name = valobj.GetName()
+    x = frame.EvaluateExpression('{0}.x'.format(name)).GetValue()
+    y = frame.EvaluateExpression('{0}.y'.format(name)).GetValue()
+    return '({0},{1})'.format(x, y)
+
+def float3_summary(valobj, internal_dict):
+    frame = valobj.GetFrame()
+    name = valobj.GetName()
+    x = frame.EvaluateExpression('{0}.x'.format(name)).GetValue()
+    y = frame.EvaluateExpression('{0}.y'.format(name)).GetValue()
+    z = frame.EvaluateExpression('{0}.z'.format(name)).GetValue()
+    return '({0},{1},{2})'.format(x, y, z)
+
+def float4_summary(valobj, internal_dict):
+    frame = valobj.GetFrame()
+    name = valobj.GetName()
+    x = frame.EvaluateExpression('{0}.x'.format(name)).GetValue()
+    y = frame.EvaluateExpression('{0}.y'.format(name)).GetValue()
+    z = frame.EvaluateExpression('{0}.z'.format(name)).GetValue()
+    w = frame.EvaluateExpression('{0}.w'.format(name)).GetValue()
+    return '({0},{1},{2},{3})'.format(x, y, z, w)
+
+def float2x2_summary(valobj, internal_dict):
+    frame = valobj.GetFrame()
+    name = valobj.GetName()
+
+    structVar = valobj.GetChildAtIndex(0)
+
+    x = structVar.GetChildAtIndex(0)
+    y = structVar.GetChildAtIndex(1)
+
+    # TODO: This isn't using the formatters, so may want to evalExpression
+    return '\n{0}\n{1}'.format(x, y)
+
+def float3x3_summary(valobj, internal_dict):
+    frame = valobj.GetFrame()
+    name = valobj.GetName()
+
+    structVar = valobj.GetChildAtIndex(0)
+
+    x = structVar.GetChildAtIndex(0)
+    y = structVar.GetChildAtIndex(1)
+    z = structVar.GetChildAtIndex(2)
+
+    # TODO: This isn't using the formatters, so may want to evalExpression
+    return '\n{0}\n{1}\n{2}'.format(x, y, z)
+
+def float3x4_summary(valobj, internal_dict):
+    return float3x3_summary(valobj, internal_dict)
+
+def float4x4_summary(valobj, internal_dict):
+    frame = valobj.GetFrame()
+    name = valobj.GetName()
+
+    structVar = valobj.GetChildAtIndex(0)
+
+    x = structVar.GetChildAtIndex(0)
+    y = structVar.GetChildAtIndex(1)
+    z = structVar.GetChildAtIndex(2)
+    w = structVar.GetChildAtIndex(3)
+
+# TODO: how to make this work? Just reports "None" is the frame incorrect?
+#    x = float4_summary(x, internal_dict)
+#    y = float4_summary(y, internal_dict)
+#    z = float4_summary(z, internal_dict)
+#    w = float4_summary(w, internal_dict)
+
+    # TODO: This isn't using the formatters, so may want to evalExpression
+    return '\n{0}\n{1}\n{2}\n{3}'.format(x, y, z, w)
+
+
+def __lldb_init_module(debugger, internal_dict):
+
+    # simd library, many more types here
+    debugger.HandleCommand("type summary add -F simdk.float2_summary simd_float2")
+    debugger.HandleCommand("type summary add -F simdk.float3_summary simd_float3")
+    debugger.HandleCommand("type summary add -F simdk.float4_summary simd_float4")
+
+    debugger.HandleCommand("type summary add -F simdk.float4x4_summary simd_float4x4")
+    debugger.HandleCommand("type summary add -F simdk.float3x4_summary simd_float3x4")
+    debugger.HandleCommand("type summary add -F simdk.float3x3_summary simd_float3x3")
+    debugger.HandleCommand("type summary add -F simdk.float2x2_summary simd_float2x2")
+
+    # TODO: add char, short, int8a, float8a, ...
+ + # simdk library + + # TODO: more packed types + # These just cause Xcode to report (None, None, None) + #debugger.HandleCommand("type summary add -F simdk.float2_summary half2p") + #debugger.HandleCommand("type summary add -F simdk.float3_summary half3p") + #debugger.HandleCommand("type summary add -F simdk.float4_summary half4p") + + #debugger.HandleCommand("type summary add -F simdk.float2_summary float2p") + #debugger.HandleCommand("type summary add -F simdk.float3_summary float3p") + #debugger.HandleCommand("type summary add -F simdk.float4_summary float4p") + + #debugger.HandleCommand("type summary add -F simdk.float2_summary double2p") + #debugger.HandleCommand("type summary add -F simdk.float3_summary double3p") + #debugger.HandleCommand("type summary add -F simdk.float4_summary double4p") + + # aligned types + debugger.HandleCommand("type summary add -F simdk.float2_summary short2a") + debugger.HandleCommand("type summary add -F simdk.float3_summary short3a") + debugger.HandleCommand("type summary add -F simdk.float4_summary short4a") + + debugger.HandleCommand("type summary add -F simdk.float2_summary int2a") + debugger.HandleCommand("type summary add -F simdk.float3_summary int3a") + debugger.HandleCommand("type summary add -F simdk.float4_summary int4a") + + debugger.HandleCommand("type summary add -F simdk.float2_summary long2a") + debugger.HandleCommand("type summary add -F simdk.float3_summary long3a") + debugger.HandleCommand("type summary add -F simdk.float4_summary long4a") + + debugger.HandleCommand("type summary add -F simdk.float2_summary half2a") + debugger.HandleCommand("type summary add -F simdk.float3_summary half3a") + debugger.HandleCommand("type summary add -F simdk.float4_summary half4a") + + # float234 + debugger.HandleCommand("type summary add -F simdk.float2_summary float2a") + debugger.HandleCommand("type summary add -F simdk.float3_summary float3a") + debugger.HandleCommand("type summary add -F simdk.float4_summary float4a") + + debugger.HandleCommand("type summary add -F simdk.float4x4_summary simdk::float4x4") + debugger.HandleCommand("type summary add -F simdk.float3x4_summary simdk::float3x4") + debugger.HandleCommand("type summary add -F simdk.float3x3_summary simdk::float3x3") + debugger.HandleCommand("type summary add -F simdk.float2x2_summary simdk::float2x2") + + debugger.HandleCommand("type summary add -F simdk.float4_summary simdk::quatf") + + # double234 + debugger.HandleCommand("type summary add -F simdk.float2_summary double2a") + debugger.HandleCommand("type summary add -F simdk.float3_summary double3a") + debugger.HandleCommand("type summary add -F simdk.float4_summary double4a") + + debugger.HandleCommand("type summary add -F simdk.float4x4_summary simdk::double4x4") + debugger.HandleCommand("type summary add -F simdk.float3x4_summary simdk::double3x4") + debugger.HandleCommand("type summary add -F simdk.float3x3_summary simdk::double3x3") + debugger.HandleCommand("type summary add -F simdk.float2x2_summary simdk::double2x2") + diff --git a/tests/src/Black4x4-a.png b/tests/src/Black4x4-a.png new file mode 100644 index 00000000..20cae3b7 --- /dev/null +++ b/tests/src/Black4x4-a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cc7b6562601638aa7fb3e4c4ca7c900f0d39e0665a6616d30ccbbbaf37dea75 +size 135
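A minimal usage sketch for the delta encode/decode entry points declared in the vectormath header above, assuming they form an in-place transform and its exact inverse; the prototypes are repeated here so the sketch stands alone, and the buffer contents are made up.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Prototypes mirroring the declarations in the header above.
void deltaEncodeU32(uint32_t* buffer, size_t length, uint32_t starting_point = 0);
void deltaDecodeU32(uint32_t* buffer, size_t length, uint32_t starting_point = 0);

void deltaRoundTripExample() {
    // made-up index buffer
    std::vector<uint32_t> indices = {0, 1, 2, 2, 1, 3};
    std::vector<uint32_t> original = indices;

    // encode to deltas in place, then decode back
    deltaEncodeU32(indices.data(), indices.size());
    deltaDecodeU32(indices.data(), indices.size());

    assert(indices == original);
}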