From 9956d29493019f867118184c5d0379a031751f70 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 15 Apr 2022 13:23:44 -0700 Subject: [PATCH 001/615] kram - fix warnings from quoted includes in GLTFKit, and fix other warnings. --- gtlf/GLTF/Headers/GLTFAccessor.h | 4 ++-- gtlf/GLTF/Headers/GLTFAnimation.h | 4 ++-- gtlf/GLTF/Headers/GLTFAsset.h | 4 ++-- gtlf/GLTF/Headers/GLTFBuffer.h | 2 +- gtlf/GLTF/Headers/GLTFBufferAllocator.h | 2 +- gtlf/GLTF/Headers/GLTFBufferView.h | 4 ++-- gtlf/GLTF/Headers/GLTFCamera.h | 4 ++-- gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h | 2 +- gtlf/GLTF/Headers/GLTFImage.h | 4 ++-- gtlf/GLTF/Headers/GLTFMaterial.h | 4 ++-- gtlf/GLTF/Headers/GLTFMesh.h | 4 ++-- gtlf/GLTF/Headers/GLTFNode.h | 6 +++--- gtlf/GLTF/Headers/GLTFScene.h | 6 +++--- gtlf/GLTF/Headers/GLTFSkin.h | 2 +- gtlf/GLTF/Headers/GLTFTexture.h | 4 ++-- gtlf/GLTF/Headers/GLTFTextureSampler.h | 4 ++-- gtlf/GLTF/Headers/GLTFUtilities.h | 2 +- gtlf/GLTF/Headers/GLTFVertexDescriptor.h | 2 +- gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h | 2 +- gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h | 2 +- libkram/kram/KTXImage.cpp | 6 +++--- libkram/kram/Kram.cpp | 2 +- 22 files changed, 38 insertions(+), 38 deletions(-) diff --git a/gtlf/GLTF/Headers/GLTFAccessor.h b/gtlf/GLTF/Headers/GLTFAccessor.h index 29ef9363..78f7e1ba 100644 --- a/gtlf/GLTF/Headers/GLTFAccessor.h +++ b/gtlf/GLTF/Headers/GLTFAccessor.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import @import simd; diff --git a/gtlf/GLTF/Headers/GLTFAnimation.h b/gtlf/GLTF/Headers/GLTFAnimation.h index b70ab091..0386c0ae 100644 --- a/gtlf/GLTF/Headers/GLTFAnimation.h +++ b/gtlf/GLTF/Headers/GLTFAnimation.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFAsset.h b/gtlf/GLTF/Headers/GLTFAsset.h index 7ebcb3a7..ce6a2eb9 100644 --- a/gtlf/GLTF/Headers/GLTFAsset.h +++ b/gtlf/GLTF/Headers/GLTFAsset.h @@ -16,8 +16,8 @@ @import Foundation; -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFBuffer.h b/gtlf/GLTF/Headers/GLTFBuffer.h index 0be3d201..912b210d 100644 --- a/gtlf/GLTF/Headers/GLTFBuffer.h +++ b/gtlf/GLTF/Headers/GLTFBuffer.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFBufferAllocator.h b/gtlf/GLTF/Headers/GLTFBufferAllocator.h index 585ae8a1..e04119f6 100644 --- a/gtlf/GLTF/Headers/GLTFBufferAllocator.h +++ b/gtlf/GLTF/Headers/GLTFBufferAllocator.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFBuffer.h" +#import @import Foundation; diff --git a/gtlf/GLTF/Headers/GLTFBufferView.h b/gtlf/GLTF/Headers/GLTFBufferView.h index 2b41ec2f..06c95e95 100644 --- a/gtlf/GLTF/Headers/GLTFBufferView.h +++ b/gtlf/GLTF/Headers/GLTFBufferView.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFCamera.h b/gtlf/GLTF/Headers/GLTFCamera.h index 2c771157..942b845a 100644 --- a/gtlf/GLTF/Headers/GLTFCamera.h +++ b/gtlf/GLTF/Headers/GLTFCamera.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import @import simd; diff --git a/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h b/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h index dc0078f7..18e8a591 100644 --- a/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h +++ b/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h @@ -16,7 +16,7 @@ @import Foundation; -#import "GLTFBufferAllocator.h" +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFImage.h b/gtlf/GLTF/Headers/GLTFImage.h index add922e6..118e4cd5 100644 --- a/gtlf/GLTF/Headers/GLTFImage.h +++ b/gtlf/GLTF/Headers/GLTFImage.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFBufferView.h" +#import +#import @import Foundation; diff --git a/gtlf/GLTF/Headers/GLTFMaterial.h b/gtlf/GLTF/Headers/GLTFMaterial.h index 17ec4198..cebe325e 100644 --- a/gtlf/GLTF/Headers/GLTFMaterial.h +++ b/gtlf/GLTF/Headers/GLTFMaterial.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFTexture.h" +#import +#import @import simd; diff --git a/gtlf/GLTF/Headers/GLTFMesh.h b/gtlf/GLTF/Headers/GLTFMesh.h index 3bf40f62..2c2eeef4 100644 --- a/gtlf/GLTF/Headers/GLTFMesh.h +++ b/gtlf/GLTF/Headers/GLTFMesh.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFNode.h b/gtlf/GLTF/Headers/GLTFNode.h index f2df6512..08ddff42 100644 --- a/gtlf/GLTF/Headers/GLTFNode.h +++ b/gtlf/GLTF/Headers/GLTFNode.h @@ -14,9 +14,9 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -#import "GLTFObject.h" -#import "GLTFUtilities.h" -#import "GLTFNodeVisitor.h" +#import +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFScene.h b/gtlf/GLTF/Headers/GLTFScene.h index cf5ae848..1f7f397a 100644 --- a/gtlf/GLTF/Headers/GLTFScene.h +++ b/gtlf/GLTF/Headers/GLTFScene.h @@ -14,9 +14,9 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFUtilities.h" -#import "GLTFNodeVisitor.h" +#import +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFSkin.h b/gtlf/GLTF/Headers/GLTFSkin.h index da289182..e3c1c60c 100644 --- a/gtlf/GLTF/Headers/GLTFSkin.h +++ b/gtlf/GLTF/Headers/GLTFSkin.h @@ -15,7 +15,7 @@ // @import Foundation; -#import "GLTFObject.h" +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFTexture.h b/gtlf/GLTF/Headers/GLTFTexture.h index 62a24bee..14ab1a6f 100644 --- a/gtlf/GLTF/Headers/GLTFTexture.h +++ b/gtlf/GLTF/Headers/GLTFTexture.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import @import simd; diff --git a/gtlf/GLTF/Headers/GLTFTextureSampler.h b/gtlf/GLTF/Headers/GLTFTextureSampler.h index a3b37249..936fdd77 100644 --- a/gtlf/GLTF/Headers/GLTFTextureSampler.h +++ b/gtlf/GLTF/Headers/GLTFTextureSampler.h @@ -14,8 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#import "GLTFObject.h" -#import "GLTFEnums.h" +#import +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFUtilities.h b/gtlf/GLTF/Headers/GLTFUtilities.h index 22528ef4..ea90a6a8 100644 --- a/gtlf/GLTF/Headers/GLTFUtilities.h +++ b/gtlf/GLTF/Headers/GLTFUtilities.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -#import "GLTFEnums.h" +#import @import Foundation; @import simd; diff --git a/gtlf/GLTF/Headers/GLTFVertexDescriptor.h b/gtlf/GLTF/Headers/GLTFVertexDescriptor.h index a94f96fb..3523d137 100644 --- a/gtlf/GLTF/Headers/GLTFVertexDescriptor.h +++ b/gtlf/GLTF/Headers/GLTFVertexDescriptor.h @@ -15,7 +15,7 @@ // @import Foundation; -#import "GLTFEnums.h" +#import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h b/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h index d0030b2d..a2a4d04a 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h @@ -15,7 +15,7 @@ // #import -#import "GLTFMTLTextureLoader.h" +#import @import Foundation; @import Metal; diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h b/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h index d78bd96c..30733fc5 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h @@ -15,7 +15,7 @@ // #import -#import "GLTFMTLUtilities.h" +#import @import Metal; diff --git a/libkram/kram/KTXImage.cpp b/libkram/kram/KTXImage.cpp index d08f8721..047b45aa 100644 --- a/libkram/kram/KTXImage.cpp +++ b/libkram/kram/KTXImage.cpp @@ -1215,7 +1215,7 @@ bool KTXImage::open(const uint8_t* imageData, size_t imageDataLength, bool isInf // since KTX1 doesn't have compressed mips, can alias the file data directly fileData = imageData; - fileDataLength = imageDataLength; + fileDataLength = (int32_t)imageDataLength; // copy out the header, TODO: should make sure bytes exist header = *(const KTXHeader*)fileData; @@ -1496,7 +1496,7 @@ void KTXImage::initMipLevels(bool doMipmaps, int32_t mipMinSize, int32_t mipMaxS mipLevels.push_back(level); } - header.numberOfMipmapLevels = mipLevels.size(); + header.numberOfMipmapLevels = (uint32_t)mipLevels.size(); header.pixelWidth = width; header.pixelHeight = height; @@ -1521,7 +1521,7 @@ void KTXImage::initMipLevels(size_t mipOffset) for (uint32_t i = 0; i < numMips; ++i) { size_t dataSize = 
mipLengthCalc(w, h); - uint32_t levelSize = dataSize * numChunks; + uint32_t levelSize = (uint32_t)(dataSize * numChunks); // TODO: align mip offset to multiple of 4 bytes for KTX1, may need for kTX2 // make sure when adding up offsets with length to include this padding diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index 4e5c803b..e4b19ab0 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -2155,7 +2155,7 @@ string kramInfoKTXToString(const string& srcFilename, const KTXImage& srcImage, // for now driving everything off metal type, but should switch to neutral MyMTLPixelFormat metalFormat = srcImage.pixelFormat; - int32_t dataSize = srcImage.fileDataLength; + int32_t dataSize = (int32_t)srcImage.fileDataLength; //string tmp; bool isMB = (dataSize > (512 * 1024)); From 9050e43948a0a816a8ebb0928b8ae88d9bceb697 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 15 Apr 2022 13:26:04 -0700 Subject: [PATCH 002/615] kram - more warning fixes, should be at zero --- gtlf/GLTF/GLTF.h | 2 +- gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h | 2 +- kramv/KramRenderer.mm | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gtlf/GLTF/GLTF.h b/gtlf/GLTF/GLTF.h index 0be075b8..6d2807d0 100644 --- a/gtlf/GLTF/GLTF.h +++ b/gtlf/GLTF/GLTF.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -#include "TargetConditionals.h" +#include #if TARGET_OS_OSX @import Cocoa; diff --git a/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h b/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h index d359064f..45859dbe 100644 --- a/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h +++ b/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -#import "GLTFObject.h" +#import @import simd; diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 5cd813a4..b5814bf6 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -375,8 +375,8 @@ - (void)_loadMetalWithView:(nonnull MTKView *)view // Important to set color space, or colors are wrong. Why doesn't one of these work (or the default) // false is good for srgb -> rgba16f // true is good for non-srgb -> rgba16f - bool pickOne = false; CGColorSpaceRef viewColorSpace = CGColorSpaceCreateWithName(kCGColorSpaceGenericRGBLinear); + //bool pickOne = false; // pickOne ? kCGColorSpaceSRGB : kCGColorSpaceLinearSRGB); view.colorspace = viewColorSpace; From a5f3a1e93ed8c7ddd9beca04d7ce22c269d9c33b Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 24 Apr 2022 18:16:52 -0700 Subject: [PATCH 003/615] Update README.md --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 41ab3258..95daaa31 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ L - advance lighting mode (none, diffuse, diffuse + specular) T - toggle tangent generation ↓ - advance bundle/folder image (can traverse zip of ktx/ktx2 files), displays list, esc to get out of list +→ - advance counterpart (can see png, then encodes if viewing folders). Not yet finished. ``` @@ -116,7 +117,7 @@ KTX2 - works in kram and viewer, has aligned levels of mips when uncompressed, DDS - works in kram and viewer, no mip compression, only BC and explicit formats, extended for ASTC/ETC kram/kramv only support newer DX10 style DDS format. Can view in Preview on macOS too. - DDSHelper provides load/save. Pixel data ordered by chunk instead of by mips. + DDSHelper provides load/save. Pixel data ordered by chunk instead of by mips. No metadata. 
``` @@ -268,12 +269,12 @@ kram includes the following encoders/decoders: |----------|------------------|-------------|-----------------------------|---------| | BCEnc | Rich Geldreich | MIT | BC1,3,4,5,7 | same | | Squish | Simon Brown | MIT | BC1,3,4,5 | same | -| ATE | Apple | no sources | BC1,4,5,7 ASTC4x4,8x8 LDR | all LDR | +| ATE | Apple | no sources | BC1,4,5,7 ASTC4x4,8x8 LDR | LDR | | Astcenc | Arm | Apache 2.0 | ASTC4x4,5x5,6x6,8x8 LDR/HDR | same | | Etc2comp | Google | MIT | ETC2r11,rg11,rgb,rgba | same | | Explicit | Me | MIT | r/rg/rgba 8u/16f/32f | none | | Compress | AMD | MIT | BC6 | same | -| GTLFKit | Warren Moore | MIT | gltf | same | +| GTLFKit | Warren Moore | MIT | none | gltf | ``` ATE @@ -321,6 +322,7 @@ kram includes additional open-source: | mmap universal | Mike Frysinger | Pub | mmap on Windows | | zstd | Yann Collett (FB) | BSD-2 | KTX2 mip decode | | miniz | Rich Gelreich | Unlicense | bundle support via zip | +| gltfKit | Warren Moore | MIT | gltf decoder/renderer | #### Open source changes @@ -332,6 +334,7 @@ kram includes additional open-source: * mmap universal - may leak a file mapping handle on Win. 
* zstd - using single file version of zstd for decode, disabled encode paths * miniz - expose raw data and offset for mmap-ed zip files, disabled writer, disable read crc checks, in .cpp file +* gltfkit - several warning fixes, changes to support kram texture loader ## kram unstarted features: From df8ce0db041680830b4b8983690e110f7c857248 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Wed, 27 Apr 2022 23:38:14 -0700 Subject: [PATCH 004/615] kram - remove dead astc files, move pointers, and bury TaskSystem to .cpp --- build2/kram.xcodeproj/project.pbxproj | 6 - kramv/KramLoader.mm | 24 +-- kramv/KramRenderer.mm | 104 +++++----- kramv/KramViewerMain.mm | 166 ++++++++-------- libkram/kram/TaskSystem.cpp | 273 ++++++++++++++++++++++++++ libkram/kram/TaskSystem.h | 228 +-------------------- 6 files changed, 428 insertions(+), 373 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index 218793e2..85574ec7 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -311,8 +311,6 @@ 70871E0827DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC627DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp */; }; 70871E0927DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC727DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp */; }; 70871E0A27DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DC727DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp */; }; - 70871E0B27DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DC827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h */; }; - 70871E0C27DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DC827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h */; }; 
708A6A962708CE4700BA5410 /* bc6h_decode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 708A6A8B2708CE4700BA5410 /* bc6h_decode.cpp */; }; 708A6A972708CE4700BA5410 /* bc6h_decode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 708A6A8B2708CE4700BA5410 /* bc6h_decode.cpp */; }; 708A6A982708CE4700BA5410 /* bc6h_decode.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A8C2708CE4700BA5410 /* bc6h_decode.h */; }; @@ -633,7 +631,6 @@ 70871DC527DDDBCC00D0B9E1 /* astcenc_platform_isa_detection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_platform_isa_detection.cpp; sourceTree = ""; }; 70871DC627DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_diagnostic_trace.cpp; sourceTree = ""; }; 70871DC727DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_ideal_endpoints_and_weights.cpp; sourceTree = ""; }; - 70871DC827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = astcenc_vecmathlib_neon_armv7_4.h; sourceTree = ""; }; 708A6A8B2708CE4700BA5410 /* bc6h_decode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bc6h_decode.cpp; sourceTree = ""; }; 708A6A8C2708CE4700BA5410 /* bc6h_decode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc6h_decode.h; sourceTree = ""; }; 708A6A8D2708CE4700BA5410 /* bc6h_encode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bc6h_encode.cpp; sourceTree = ""; }; @@ -789,7 +786,6 @@ 70871DBF27DDDBCC00D0B9E1 /* astcenc_vecmathlib_avx2_8.h */, 70871DA727DDDBCC00D0B9E1 /* astcenc_vecmathlib_common_4.h */, 70871DBE27DDDBCC00D0B9E1 /* 
astcenc_vecmathlib_neon_4.h */, - 70871DC827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h */, 70871DC027DDDBCC00D0B9E1 /* astcenc_vecmathlib_none_4.h */, 70871DB127DDDBCC00D0B9E1 /* astcenc_vecmathlib_sse_4.h */, 70871DC127DDDBCC00D0B9E1 /* astcenc_vecmathlib.h */, @@ -1268,7 +1264,6 @@ 706EF01426D15985001C950E /* alpha.h in Headers */, 708A6A982708CE4700BA5410 /* bc6h_decode.h in Headers */, 706EF01526D15985001C950E /* singlecolourfit.h in Headers */, - 70871E0B27DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h in Headers */, 706EF01626D15985001C950E /* maths.h in Headers */, 706EF01726D15985001C950E /* colourset.h in Headers */, 708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */, @@ -1357,7 +1352,6 @@ 706EF18E26D166C5001C950E /* alpha.h in Headers */, 708A6A992708CE4700BA5410 /* bc6h_decode.h in Headers */, 706EF18F26D166C5001C950E /* singlecolourfit.h in Headers */, - 70871E0C27DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_armv7_4.h in Headers */, 706EF19026D166C5001C950E /* maths.h in Headers */, 706EF19126D166C5001C950E /* colourset.h in Headers */, 708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */, diff --git a/kramv/KramLoader.mm b/kramv/KramLoader.mm index 854e426e..9cb54974 100644 --- a/kramv/KramLoader.mm +++ b/kramv/KramLoader.mm @@ -45,12 +45,12 @@ @implementation KramLoader { // only one of these for now id _buffer; - uint8_t *_data; + uint8_t* _data; uint32_t _bufferOffset; vector _blits; - NSMutableArray> *_blitTextures; - NSMutableArray> *_mipgenTextures; + NSMutableArray>* _blitTextures; + NSMutableArray>* _mipgenTextures; } - (instancetype)init @@ -278,10 +278,10 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) } - (BOOL)loadImageFromURL:(nonnull NSURL *)url - image:(KTXImage &)image - imageData:(KTXImageData &)imageData + image:(KTXImage&)image + imageData:(KTXImageData&)imageData { - const char *path = url.absoluteURL.path.UTF8String; + const char* path = url.absoluteURL.path.UTF8String; if 
(!imageData.open(path, image)) { return NO; } @@ -289,9 +289,9 @@ - (BOOL)loadImageFromURL:(nonnull NSURL *)url return YES; } -- (nullable id)loadTextureFromURL:(nonnull NSURL *)url +- (nullable id)loadTextureFromURL:(nonnull NSURL*)url originalFormat: - (nullable MTLPixelFormat *)originalFormat + (nullable MTLPixelFormat*)originalFormat { KTXImage image; KTXImageData imageData; @@ -306,7 +306,7 @@ - (BOOL)loadImageFromURL:(nonnull NSURL *)url - (nullable id)createTexture:(const KTXImage &)image isPrivate:(bool)isPrivate { - MTLTextureDescriptor *textureDescriptor = [[MTLTextureDescriptor alloc] init]; + MTLTextureDescriptor* textureDescriptor = [[MTLTextureDescriptor alloc] init]; // Indicate that each pixel has a blue, green, red, and alpha channel, where // each channel is an 8-bit unsigned normalized value (i.e. 0 maps to 0.0 and @@ -375,7 +375,7 @@ - (void)uploadTexturesIfNeeded:(id)blitEncoder if (!_blits.empty()) { // now upload from staging MTLBuffer to private MTLTexture - for (const auto &blit : _blits) { + for (const auto& blit : _blits) { MTLRegion region = { {0, 0, 0}, // MTLOrigin {(NSUInteger)blit.w, (NSUInteger)blit.h, 1} // MTLSize @@ -504,8 +504,8 @@ inline uint64_t alignOffset(uint64_t offset, uint64_t alignment) size_t blockSize = image.blockSize(); vector bufferOffsets; - uint8_t *bufferData = (uint8_t *)_buffer.contents; - const uint8_t *mipData = (const uint8_t *)image.fileData; + uint8_t* bufferData = (uint8_t*)_buffer.contents; + const uint8_t* mipData = (const uint8_t*)image.fileData; bufferOffsets.resize(image.mipLevels.size()); uint32_t numChunks = image.totalChunks(); diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index b5814bf6..260cbf6a 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -135,7 +135,7 @@ @implementation Renderer { id _depthStateFull; id _depthStateNone; - MTLVertexDescriptor *_mtlVertexDescriptor; + MTLVertexDescriptor* _mtlVertexDescriptor; // TODO: Array< id > _textures; id _colorMap; @@ 
-173,31 +173,31 @@ @implementation Renderer { float4x4 _modelMatrix3D; // float _rotation; - KramLoader *_loader; - MTKMesh *_mesh; + KramLoader* _loader; + MTKMesh* _mesh; MDLVertexDescriptor *_mdlVertexDescriptor; - MTKMesh *_meshRect; - MTKMesh *_meshBox; - MTKMesh *_meshSphere; - MTKMesh *_meshSphereMirrored; - // MTKMesh *_meshCylinder; - MTKMesh *_meshCapsule; + MTKMesh* _meshRect; + MTKMesh* _meshBox; + MTKMesh* _meshSphere; + MTKMesh* _meshSphereMirrored; + // MTKMesh* _meshCylinder; + MTKMesh* _meshCapsule; MTKMeshBufferAllocator *_metalAllocator; id _shaderLibrary; - NSURL *_metallibFileURL; - NSDate *_metallibFileDate; + NSURL* _metallibFileURL; + NSDate* _metallibFileDate; ViewFramebufferData _viewFramebuffer; - ShowSettings *_showSettings; + ShowSettings* _showSettings; #if USE_GLTF KramGLTFTextureLoader* _textureLoader; id _bufferAllocator; GLTFMTLRenderer* _gltfRenderer; - GLTFAsset *_asset; // only 1 for now + GLTFAsset* _asset; // only 1 for now double _animationTime; id _environmentTexture; @@ -436,8 +436,8 @@ - (BOOL)hotloadShaders:(const char *)filename _metallibFileURL = [NSURL fileURLWithPath:[NSString stringWithUTF8String:filename]]; - NSError *err = nil; - NSDate *fileDate = nil; + NSError* err = nil; + NSDate* fileDate = nil; [_metallibFileURL getResourceValue:&fileDate forKey:NSURLContentModificationDateKey error:&err]; @@ -478,8 +478,8 @@ - (BOOL)hotloadShaders:(const char *)filename - (id)_createComputePipeline:(const char *)name { - NSString *nameNS = [NSString stringWithUTF8String:name]; - NSError *error = nil; + NSString* nameNS = [NSString stringWithUTF8String:name]; + NSError* error = nil; id computeFunction = [_shaderLibrary newFunctionWithName:nameNS]; id pipe; @@ -514,13 +514,13 @@ - (void)_createComputePipelines - (id)_createRenderPipeline:(const char *)vs fs:(const char *)fs { - NSString *vsNameNS = [NSString stringWithUTF8String:vs]; - NSString *fsNameNS = [NSString stringWithUTF8String:fs]; + NSString* vsNameNS = [NSString 
stringWithUTF8String:vs]; + NSString* fsNameNS = [NSString stringWithUTF8String:fs]; id vertexFunction; id fragmentFunction; - MTLRenderPipelineDescriptor *pipelineStateDescriptor = + MTLRenderPipelineDescriptor* pipelineStateDescriptor = [[MTLRenderPipelineDescriptor alloc] init]; pipelineStateDescriptor.label = fsNameNS; pipelineStateDescriptor.sampleCount = _viewFramebuffer.sampleCount; @@ -538,7 +538,7 @@ - (void)_createComputePipelines pipelineStateDescriptor.stencilAttachmentPixelFormat = _viewFramebuffer.depthStencilPixelFormat; - NSError *error = NULL; + NSError* error = NULL; //----------------------- @@ -587,7 +587,7 @@ - (void)_createSampleRender { { // writing to this texture - MTLTextureDescriptor *textureDesc = [MTLTextureDescriptor + MTLTextureDescriptor* textureDesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA32Float width:1 height:1 @@ -601,7 +601,7 @@ - (void)_createSampleRender { // this must match drawable format due to using a blit to copy pixel out of // drawable - MTLTextureDescriptor *textureDesc = [MTLTextureDescriptor + MTLTextureDescriptor* textureDesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float width:1 height:1 @@ -618,7 +618,7 @@ - (MTKMesh *)_createMeshAsset:(const char *)name mdlMesh:(MDLMesh *)mdlMesh doFlipUV:(bool)doFlipUV { - NSError *error = nil; + NSError* error = nil; mdlMesh.vertexDescriptor = _mdlVertexDescriptor; @@ -628,9 +628,9 @@ - (MTKMesh *)_createMeshAsset:(const char *)name // flip the u coordinate if (doFlipUV) { id uvs = mdlMesh.vertexBuffers[BufferIndexMeshUV0]; - MDLMeshBufferMap *uvsMap = [uvs map]; + MDLMeshBufferMap* uvsMap = [uvs map]; - packed_float2 *uvData = (packed_float2 *)uvsMap.bytes; + packed_float2* uvData = (packed_float2 *)uvsMap.bytes; for (uint32_t i = 0; i < mdlMesh.vertexCount; ++i) { auto &uv = uvData[i]; @@ -650,8 +650,8 @@ - (MTKMesh *)_createMeshAsset:(const char *)name bool doFlipBitangent = true; if (doFlipBitangent) 
{ id uvs = mdlMesh.vertexBuffers[BufferIndexMeshTangent]; - MDLMeshBufferMap *uvsMap = [uvs map]; - packed_float4 *uvData = (packed_float4 *)uvsMap.bytes; + MDLMeshBufferMap* uvsMap = [uvs map]; + packed_float4* uvData = (packed_float4 *)uvsMap.bytes; for (uint32_t i = 0; i < mdlMesh.vertexCount; ++i) { // if (uvData[i].w != -1.0f && uvData[i].w != 1.0f) { @@ -664,7 +664,7 @@ - (MTKMesh *)_createMeshAsset:(const char *)name } // now set it into mtk mesh - MTKMesh *mesh = [[MTKMesh alloc] initWithMesh:mdlMesh + MTKMesh* mesh = [[MTKMesh alloc] initWithMesh:mdlMesh device:_device error:&error]; mesh.name = [NSString stringWithUTF8String:name]; @@ -673,10 +673,10 @@ - (MTKMesh *)_createMeshAsset:(const char *)name // These don't seem to appear as the buffer name that is suballocated from { // name the vertex range on the vb - MTKMeshBuffer *pos = mesh.vertexBuffers[BufferIndexMeshPosition]; - MTKMeshBuffer *uvs = mesh.vertexBuffers[BufferIndexMeshUV0]; - MTKMeshBuffer *normals = mesh.vertexBuffers[BufferIndexMeshNormal]; - MTKMeshBuffer *tangents = mesh.vertexBuffers[BufferIndexMeshTangent]; + MTKMeshBuffer* pos = mesh.vertexBuffers[BufferIndexMeshPosition]; + MTKMeshBuffer* uvs = mesh.vertexBuffers[BufferIndexMeshUV0]; + MTKMeshBuffer* normals = mesh.vertexBuffers[BufferIndexMeshNormal]; + MTKMeshBuffer* tangents = mesh.vertexBuffers[BufferIndexMeshTangent]; [pos.buffer addDebugMarker:@"Pos" range:NSMakeRange(pos.offset, pos.length)]; @@ -689,7 +689,7 @@ - (MTKMesh *)_createMeshAsset:(const char *)name // This seems to already be named "ellisoid-Indices", // need to do for ib as well - for (MTKSubmesh *submesh in mesh.submeshes) { + for (MTKSubmesh* submesh in mesh.submeshes) { [submesh.indexBuffer.buffer addDebugMarker:mesh.name range:NSMakeRange(submesh.indexBuffer.offset, @@ -870,7 +870,7 @@ - (void)_loadAssets { /// Load assets into metal objects - MDLMesh *mdlMesh; + MDLMesh* mdlMesh; mdlMesh = [MDLMesh newBoxWithDimensions:(vector_float3){1, 1, 1} 
segments:(vector_uint3){1, 1, 1} @@ -905,13 +905,13 @@ - (void)_loadAssets id posBuffer = mdlMesh.vertexBuffers[BufferIndexMeshPosition]; - MDLMeshBufferMap *posMap = [posBuffer map]; - packed_float3 *posData = (packed_float3 *)posMap.bytes; + MDLMeshBufferMap* posMap = [posBuffer map]; + packed_float3* posData = (packed_float3 *)posMap.bytes; id normalBuffer = mdlMesh.vertexBuffers[BufferIndexMeshNormal]; - MDLMeshBufferMap *normalsMap = [normalBuffer map]; - packed_float3 *normalData = (packed_float3 *)normalsMap.bytes; + MDLMeshBufferMap* normalsMap = [normalBuffer map]; + packed_float3* normalData = (packed_float3 *)normalsMap.bytes; // vertexCount reports 306, but vertex 289+ are garbage uint32_t numVertices = 289; // mdlMesh.vertexCount @@ -963,20 +963,20 @@ - (void)_loadAssets mdlMesh.vertexDescriptor = _mdlVertexDescriptor; id uvsBuffer = mdlMesh.vertexBuffers[BufferIndexMeshUV0]; - MDLMeshBufferMap *uvsMap = [uvsBuffer map]; - packed_float2 *uvData = (packed_float2 *)uvsMap.bytes; + MDLMeshBufferMap* uvsMap = [uvsBuffer map]; + packed_float2* uvData = (packed_float2 *)uvsMap.bytes; // this is all aos id posBuffer = mdlMesh.vertexBuffers[BufferIndexMeshPosition]; - MDLMeshBufferMap *posMap = [posBuffer map]; + MDLMeshBufferMap* posMap = [posBuffer map]; packed_float3 *posData = (packed_float3 *)posMap.bytes; id normalsBuffe = mdlMesh.vertexBuffers[BufferIndexMeshNormal]; - MDLMeshBufferMap *normalsMap = [normalsBuffe map]; - packed_float3 *normalData = (packed_float3 *)normalsMap.bytes; + MDLMeshBufferMap* normalsMap = [normalsBuffe map]; + packed_float3* normalData = (packed_float3 *)normalsMap.bytes; // vertexCount reports 306, but vertex 289+ are garbage uint32_t numVertices = 289; // mdlMesh.vertexCount @@ -1168,8 +1168,8 @@ - (BOOL)loadTexture:(nonnull NSURL *)url string fullFilename = url.path.UTF8String; // can use this to pull, or use fstat on FileHelper - NSDate *fileDate = nil; - NSError *error = nil; + NSDate* fileDate = nil; + NSError* error = 
nil; [url getResourceValue:&fileDate forKey:NSURLContentModificationDateKey error:&error]; @@ -1813,8 +1813,8 @@ - (void)drawMain:(id)commandBuffer _animationTime += 1.0/60.0; NSTimeInterval maxAnimDuration = 0; - for (GLTFAnimation *animation in _asset.animations) { - for (GLTFAnimationChannel *channel in animation.channels) { + for (GLTFAnimation* animation in _asset.animations) { + for (GLTFAnimationChannel* channel in animation.channels) { if (channel.duration > maxAnimDuration) { maxAnimDuration = channel.duration; } @@ -1823,7 +1823,7 @@ - (void)drawMain:(id)commandBuffer NSTimeInterval animTime = fmod(_animationTime, maxAnimDuration); - for (GLTFAnimation *animation in _asset.animations) { + for (GLTFAnimation* animation in _asset.animations) { [animation runAtTime:animTime]; } } @@ -1962,7 +1962,7 @@ - (void)drawMain:(id)commandBuffer // use exisiting lod, and mip [renderEncoder setFragmentSamplerState:sampler atIndex:SamplerIndexColor]; - for (MTKSubmesh *submesh in _mesh.submeshes) { + for (MTKSubmesh* submesh in _mesh.submeshes) { [renderEncoder drawIndexedPrimitives:submesh.primitiveType indexCount:submesh.indexCount indexType:submesh.indexType @@ -2033,7 +2033,7 @@ - (void)drawMain:(id)commandBuffer // and mips on on screen faces and arrays and slices go across in a // row, and mips are displayed down from each of those in a column - for (MTKSubmesh *submesh in _mesh.submeshes) { + for (MTKSubmesh* submesh in _mesh.submeshes) { [renderEncoder drawIndexedPrimitives:submesh.primitiveType indexCount:submesh.indexCount indexType:submesh.indexType @@ -2068,7 +2068,7 @@ - (void)drawMain:(id)commandBuffer // mips on on screen faces and arrays and slices go across in a row, and // mips are displayed down from each of those in a column - for (MTKSubmesh *submesh in _mesh.submeshes) { + for (MTKSubmesh* submesh in _mesh.submeshes) { [renderEncoder drawIndexedPrimitives:submesh.primitiveType indexCount:submesh.indexCount indexType:submesh.indexType diff --git 
a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 0fd8284f..b372e5b6 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -250,8 +250,8 @@ - (NSInteger)numberOfRowsInTableView:(NSTableView *)tableView // NSTableViewDelegate -(NSView *)tableView:(NSTableView *)tableView viewForTableColumn:(NSTableColumn *)tableColumn row:(NSInteger)row { - NSString *identifier = tableColumn.identifier; - NSTableCellView *cell = [tableView makeViewWithIdentifier:identifier owner:self]; + NSString* identifier = tableColumn.identifier; + NSTableCellView* cell = [tableView makeViewWithIdentifier:identifier owner:self]; cell.textField.stringValue = [self.items objectAtIndex:row]; return cell; } @@ -337,8 +337,8 @@ - (BOOL)readFromURL:(nonnull NSURL *)url // TODO: This is only getting called on first open on macOS 12.0 even with hack below. // find out why. - NSApplication *app = [NSApplication sharedApplication]; - MyMTKView *view = app.mainWindow.contentView; + NSApplication* app = [NSApplication sharedApplication]; + MyMTKView* view = app.mainWindow.contentView; BOOL success = [view loadTextureFromURL:url]; if (success) { // Note: if I return NO from this call then a dialog pops up that image @@ -398,7 +398,7 @@ - (void)application:(NSApplication *)sender // TODO: also do an overlapping diff if two files are dropped with same // dimensions. 
- NSURL *url = urls.firstObject; + NSURL* url = urls.firstObject; [view loadTextureFromURL:url]; [view fixupDocumentList]; } @@ -408,7 +408,7 @@ - (void)application:(NSApplication *)sender - (IBAction)showAboutDialog:(id)sender { // calls openDocumentWithContentsOfURL above - NSMutableDictionary *options = + NSMutableDictionary* options = [[NSMutableDictionary alloc] init]; // name and icon are already supplied @@ -418,7 +418,7 @@ - (IBAction)showAboutDialog:(id)sender [NSString stringWithUTF8String:"kram ©2020-2022 by Alec Miller"]; // add a link to kram website, skip the Visit text - NSMutableAttributedString *str = [[NSMutableAttributedString alloc] + NSMutableAttributedString* str = [[NSMutableAttributedString alloc] initWithString:@"https://github.com/alecazam/kram"]; [str addAttribute:NSLinkAttributeName value:@"https://github.com/alecazam/kram" @@ -442,7 +442,7 @@ - (IBAction)showAboutDialog:(id)sender options[NSAboutPanelOptionCredits] = str; // skip the v character - const char *version = KRAM_VERSION; + const char* version = KRAM_VERSION; version += 1; // this is the build version, should be github hash? 
@@ -465,19 +465,19 @@ - (IBAction)showAboutDialog:(id)sender NSArray *pasteboardTypes = @[ NSPasteboardTypeFileURL ]; @implementation MyMTKView { - NSMenu *_viewMenu; // really the items - NSStackView *_buttonStack; - NSMutableArray *_buttonArray; - NSTextField *_hudLabel; - NSTextField *_hudLabel2; + NSMenu* _viewMenu; // really the items + NSStackView* _buttonStack; + NSMutableArray* _buttonArray; + NSTextField* _hudLabel; + NSTextField* _hudLabel2; // Offer list of files in archives // TODO: move to NSOutlineView since that can show archive folders with content inside - IBOutlet NSTableView *_tableView; - IBOutlet TableViewController *_tableViewController; + IBOutlet NSTableView* _tableView; + IBOutlet TableViewController* _tableViewController; vector _textSlots; - ShowSettings *_showSettings; + ShowSettings* _showSettings; // allow zip files to be dropped and opened, and can advance through bundle // content @@ -617,15 +617,15 @@ -(void)fixupDocumentList // Clear the document list so readFromURL keeps getting called // Can't remove currentDoc, so have to skip that - NSDocumentController *dc = [NSDocumentController sharedDocumentController]; - NSDocument *currentDoc = dc.currentDocument; - NSMutableArray *docsToRemove = [[NSMutableArray alloc] init]; - for (NSDocument *doc in dc.documents) { + NSDocumentController* dc = [NSDocumentController sharedDocumentController]; + NSDocument* currentDoc = dc.currentDocument; + NSMutableArray* docsToRemove = [[NSMutableArray alloc] init]; + for (NSDocument* doc in dc.documents) { if (doc != currentDoc) [docsToRemove addObject:doc]; } - for (NSDocument *doc in docsToRemove) { + for (NSDocument* doc in docsToRemove) { [dc removeDocument:doc]; } } @@ -731,17 +731,17 @@ - (NSStackView *)_addButtons int32_t numActions = ArrayCount(actions); - NSMutableArray *buttons = [[NSMutableArray alloc] init]; + NSMutableArray* buttons = [[NSMutableArray alloc] init]; for (int32_t i = 0; i < numActions; ++i) { Action& action = actions[i]; - 
const char *icon = action.icon; - const char *tip = action.tip; + const char* icon = action.icon; + const char* tip = action.tip; - NSString *name = [NSString stringWithUTF8String:icon]; - NSString *toolTip = [NSString stringWithUTF8String:tip]; + NSString* name = [NSString stringWithUTF8String:icon]; + NSString* toolTip = [NSString stringWithUTF8String:tip]; - NSButton *button = nil; + NSButton* button = nil; button = [NSButton buttonWithTitle:name target:self @@ -814,7 +814,7 @@ - (NSStackView *)_addButtons [buttons addObject:button]; } - NSStackView *stackView = [NSStackView stackViewWithViews:buttons]; + NSStackView* stackView = [NSStackView stackViewWithViews:buttons]; stackView.orientation = NSUserInterfaceLayoutOrientationVertical; stackView.detachesHiddenViews = YES; // default, but why have to have _buttonArrary @@ -822,10 +822,10 @@ - (NSStackView *)_addButtons // Want menus, so user can define their own shortcuts to commands // Also need to enable/disable this via validateUserInterfaceItem - NSApplication *app = [NSApplication sharedApplication]; + NSApplication* app = [NSApplication sharedApplication]; - NSMenu *mainMenu = app.mainMenu; - NSMenuItem *viewMenuItem = mainMenu.itemArray[2]; + NSMenu* mainMenu = app.mainMenu; + NSMenuItem* viewMenuItem = mainMenu.itemArray[2]; _viewMenu = viewMenuItem.submenu; // TODO: add a view menu in the storyboard @@ -834,11 +834,11 @@ - (NSStackView *)_addButtons for (int32_t i = 0; i < numActions; ++i) { Action& action = actions[i]; - const char *icon = action.icon; // single char - const char *title = action.tip; + const char* icon = action.icon; // single char + const char* title = action.tip; - NSString *toolTip = [NSString stringWithUTF8String:icon]; - NSString *name = [NSString stringWithUTF8String:title]; + NSString* toolTip = [NSString stringWithUTF8String:icon]; + NSString* name = [NSString stringWithUTF8String:title]; bool isSeparator = icon[0] == 0; if (isSeparator) { @@ -846,9 +846,9 @@ - (NSStackView 
*)_addButtons } else { // NSString *shortcut = @""; // for now, or AppKit turns key int cmd+shift+key - NSString *shortcut = [NSString stringWithUTF8String:icon]; + NSString* shortcut = [NSString stringWithUTF8String:icon]; - NSMenuItem *menuItem = + NSMenuItem* menuItem = [[NSMenuItem alloc] initWithTitle:name action:@selector(handleAction:) keyEquivalent:shortcut]; @@ -880,7 +880,7 @@ - (NSStackView *)_addButtons // copy all of them to a vector, and then assign the action ptrs for (int32_t i = 0; i < numActions; ++i) { Action& action = actions[i]; - const char *icon = action.icon; // single char + const char* icon = action.icon; // single char // skip separators bool isSeparator = icon[0] == 0; @@ -917,7 +917,7 @@ - (NSTextField *)_addHud:(BOOL)isShadow uint32_t h = 1220; // add a label for the hud - NSTextField *label = [[MyNSTextField alloc] + NSTextField* label = [[MyNSTextField alloc] initWithFrame:NSMakeRect(isShadow ? 21 : 20, isShadow ? 21 : 20, w, h)]; @@ -954,7 +954,7 @@ - (NSTextField *)_addHud:(BOOL)isShadow - (void)doZoomMath:(float)newZoom newPan:(float2 &)newPan { // transform the cursor to texture coordinate, or clamped version if outside - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; float4x4 projectionViewModelMatrix = [renderer computeImageTransform:_showSettings->panX panY:_showSettings->panY @@ -1084,7 +1084,7 @@ -(void)updateZoom:(float)zoom float4 bottomLeftCorner = float4m(-0.5 * ar, -0.5f, 0.0f, 1.0f); float4 topRightCorner = float4m(0.5 * ar, 0.5f, 0.0f, 1.0f); - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; float4x4 newMatrix = [renderer computeImageTransform:_showSettings->panX panY:_showSettings->panY zoom:zoom]; @@ -1299,7 +1299,7 @@ - (void)updateEyedropper } // don't wait on renderer to update this matrix - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; if 
(_showSettings->isEyedropperFromDrawable()) { // this only needs the cursor location, but can't supply uv to @@ -1722,7 +1722,7 @@ - (void)scrollWheel:(NSEvent *)event - (void)updatePan:(float)panX panY:(float)panY { - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; float4x4 projectionViewModelMatrix = [renderer computeImageTransform:panX panY:panY @@ -1943,12 +1943,12 @@ - (void)updateUIControlState - (IBAction)handleAction:(id)sender { - NSEvent *theEvent = [NSApp currentEvent]; + NSEvent* theEvent = [NSApp currentEvent]; bool isShiftKeyDown = (theEvent.modifierFlags & NSEventModifierFlagShift); const Action* action = nullptr; if ([sender isKindOfClass:[NSButton class]]) { - NSButton *button = (NSButton *)sender; + NSButton* button = (NSButton *)sender; for (const auto& search: _actions) { if (search.button == button) { action = &search; @@ -1957,7 +1957,7 @@ - (IBAction)handleAction:(id)sender } } else if ([sender isKindOfClass:[NSMenuItem class]]) { - NSMenuItem *menuItem = (NSMenuItem *)sender; + NSMenuItem* menuItem = (NSMenuItem *)sender; for (const auto& search: _actions) { if (search.menuItem == menuItem) { action = &search; @@ -2561,30 +2561,30 @@ - (BOOL)prepareForDragOperation:(id)sender - (BOOL)performDragOperation:(id)sender { - NSPasteboard *pasteboard = [sender draggingPasteboard]; + NSPasteboard* pasteboard = [sender draggingPasteboard]; - NSString *desiredType = [pasteboard availableTypeFromArray:pasteboardTypes]; + NSString* desiredType = [pasteboard availableTypeFromArray:pasteboardTypes]; if ([desiredType isEqualToString:NSPasteboardTypeFileURL]) { // TODO: use readObjects to drag multiple files onto one view // load one mip of all those, use smaller mips for thumbnail // the pasteboard contains a list of filenames - NSString *urlString = + NSString* urlString = [pasteboard propertyListForType:NSPasteboardTypeFileURL]; // this turns it into a real path (supposedly works even with sandbox) - 
NSURL *url = [NSURL URLWithString:urlString]; + NSURL* url = [NSURL URLWithString:urlString]; // convert the original path and then back to a url, otherwise reload fails // when this file is replaced. - const char *filename = url.fileSystemRepresentation; + const char* filename = url.fileSystemRepresentation; if (filename == nullptr) { KLOGE("kramv", "Fix this drop url returning nil issue"); return NO; } - NSString *filenameString = [NSString stringWithUTF8String:filename]; + NSString* filenameString = [NSString stringWithUTF8String:filename]; url = [NSURL fileURLWithPath:filenameString]; @@ -2633,7 +2633,7 @@ - (BOOL)loadArchive:(const char *)zipFilename // copy names into the files view [_tableViewController.items removeAllObjects]; for (const auto& entry: _zip.zipEntrys()) { - const char *filenameShort = toFilenameShort(entry.filename); + const char* filenameShort = toFilenameShort(entry.filename); [_tableViewController.items addObject: [NSString stringWithUTF8String: filenameShort]]; } [_tableView reloadData]; @@ -2761,7 +2761,7 @@ static string findNormalMapFromAlbedoFilename(const char* filename) { string filenameShort = filename; - const char *ext = strrchr(filename, '.'); + const char* ext = strrchr(filename, '.'); auto dotPos = filenameShort.find_last_of("."); if (dotPos == string::npos) @@ -2791,7 +2791,7 @@ static string findNormalMapFromAlbedoFilename(const char* filename) - (BOOL)loadFileFromFolder { // now lookup the filename and data at that entry - const char *filename = _folderFiles[_fileFolderIndex].c_str(); + const char* filename = _folderFiles[_fileFolderIndex].c_str(); string fullFilename = filename; auto timestamp = FileHelper::modificationTimestamp(filename); @@ -2843,7 +2843,7 @@ - (BOOL)loadFileFromFolder } } - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; [renderer releaseAllPendingTextures]; if (![renderer loadTextureFromImage:fullFilename.c_str() @@ -2857,7 +2857,7 @@ - 
(BOOL)loadFileFromFolder //------------------------------- // set title to filename, chop this to just file+ext, not directory - const char *filenameShort = strrchr(filename, '/'); + const char* filenameShort = strrchr(filename, '/'); if (filenameShort == nullptr) { filenameShort = filename; } @@ -2913,7 +2913,7 @@ - (BOOL)loadFileFromArchive return NO; } - const uint8_t *imageData = nullptr; + const uint8_t* imageData = nullptr; uint64_t imageDataLength = 0; // search for main file - can be albedo or normal @@ -2921,7 +2921,7 @@ - (BOOL)loadFileFromArchive return NO; } - const uint8_t *imageNormalData = nullptr; + const uint8_t* imageNormalData = nullptr; uint64_t imageNormalDataLength = 0; string normalFilename; @@ -2964,7 +2964,7 @@ - (BOOL)loadFileFromArchive } } - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; [renderer releaseAllPendingTextures]; if (![renderer loadTextureFromImage:fullFilename.c_str() @@ -2978,7 +2978,7 @@ - (BOOL)loadFileFromArchive //--------------------------------- // set title to filename, chop this to just file+ext, not directory - const char *filenameShort = strrchr(filename, '/'); + const char* filenameShort = strrchr(filename, '/'); if (filenameShort == nullptr) { filenameShort = filename; } @@ -3021,7 +3021,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url _hudHidden = false; [self updateHudVisibility]; - const char *filename = url.fileSystemRepresentation; + const char* filename = url.fileSystemRepresentation; if (filename == nullptr) { // Fixed by converting dropped urls into paths then back to a url. // When file replaced the drop url is no longer valid. @@ -3029,7 +3029,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url return NO; } - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; // folders can have a . in them f.e. 2.0/blah/... 
bool isDirectory = url.hasDirectoryPath; @@ -3039,7 +3039,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url // make list of all file in the directory if (!self.imageURL || (!([self.imageURL isEqualTo:url]))) { - NSDirectoryEnumerator *directoryEnumerator = + NSDirectoryEnumerator* directoryEnumerator = [[NSFileManager defaultManager] enumeratorAtURL:url includingPropertiesForKeys:[NSArray array] @@ -3056,8 +3056,8 @@ - (BOOL)loadTextureFromURL:(NSURL *)url vector files; #if USE_GLTF // only display models in folder if found, ignore the png/jpg files - while (NSURL *fileOrDirectoryURL = [directoryEnumerator nextObject]) { - const char *name = fileOrDirectoryURL.fileSystemRepresentation; + while (NSURL* fileOrDirectoryURL = [directoryEnumerator nextObject]) { + const char* name = fileOrDirectoryURL.fileSystemRepresentation; bool isGLTF = endsWithExtension(name, ".gltf"); bool isGLB = endsWithExtension(name, ".glb"); @@ -3078,7 +3078,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url includingPropertiesForKeys:[NSArray array] options:0 errorHandler: // nil - ^BOOL(NSURL *urlArg, NSError *error) { + ^BOOL(NSURL* urlArg, NSError* error) { macroUnusedVar(urlArg); macroUnusedVar(error); @@ -3086,8 +3086,8 @@ - (BOOL)loadTextureFromURL:(NSURL *)url return NO; }]; #endif - while (NSURL *fileOrDirectoryURL = [directoryEnumerator nextObject]) { - const char *name = fileOrDirectoryURL.fileSystemRepresentation; + while (NSURL* fileOrDirectoryURL = [directoryEnumerator nextObject]) { + const char* name = fileOrDirectoryURL.fileSystemRepresentation; if (isSupportedFilename(name)) { @@ -3101,7 +3101,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url } // add it to recent docs - NSDocumentController *dc = + NSDocumentController* dc = [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:url]; @@ -3140,7 +3140,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url [_tableViewController.items removeAllObjects]; for (const auto& file: files) { - const char *filenameShort = 
toFilenameShort(file.c_str()); + const char* filenameShort = toFilenameShort(file.c_str()); [_tableViewController.items addObject: [NSString stringWithUTF8String: filenameShort]]; } [_tableView reloadData]; @@ -3187,11 +3187,11 @@ - (BOOL)loadTextureFromURL:(NSURL *)url if (endsWithExtension(filename, ".metallib")) { if ([renderer hotloadShaders:filename]) { - NSURL *metallibFileURL = + NSURL* metallibFileURL = [NSURL fileURLWithPath:[NSString stringWithUTF8String:filename]]; // add to recent docs, so can reload quickly - NSDocumentController *dc = + NSDocumentController* dc = [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:metallibFileURL]; @@ -3259,13 +3259,13 @@ - (BOOL)loadTextureFromURL:(NSURL *)url self.lastArchiveTimestamp = archiveTimestamp; // add it to recent docs - NSDocumentController *dc = + NSDocumentController* dc = [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:url]; // now reload the filename if needed if (!existingFilename.empty()) { - const ZipEntry *formerEntry = _zip.zipEntry(existingFilename.c_str()); + const ZipEntry* formerEntry = _zip.zipEntry(existingFilename.c_str()); if (formerEntry) { // lookup the index in the remapIndices table _fileArchiveIndex = @@ -3287,8 +3287,8 @@ - (BOOL)loadTextureFromURL:(NSURL *)url getErrorLogCaptureText(errorText); setErrorLogCapture(false); - const auto &entry = _zip.zipEntrys()[_fileArchiveIndex]; - const char *archiveFilename = entry.filename; + const auto& entry = _zip.zipEntrys()[_fileArchiveIndex]; + const char* archiveFilename = entry.filename; // prepend filename string finalErrorText; @@ -3373,7 +3373,7 @@ -(BOOL)loadModelFile:(NSURL*)url filename:(const char*)filename if (url != nil) { // add to recent docs, so can reload quickly - NSDocumentController *dc = + NSDocumentController* dc = [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:gltfFileURL]; @@ -3406,7 +3406,7 @@ -(BOOL)loadModelFile:(NSURL*)url 
filename:(const char*)filename -(BOOL)loadImageFile:(NSURL*)url { - Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; setErrorLogCapture(true); // set title to filename, chop this to just file+ext, not directory @@ -3444,7 +3444,7 @@ -(BOOL)loadImageFile:(NSURL*)url // list // add to recent document menu - NSDocumentController *dc = [NSDocumentController sharedDocumentController]; + NSDocumentController* dc = [NSDocumentController sharedDocumentController]; [dc noteNewRecentDocumentURL:url]; self.imageURL = url; @@ -3549,11 +3549,11 @@ @interface GameViewController : NSViewController @end @implementation GameViewController { - MyMTKView *_view; + MyMTKView* _view; - Renderer *_renderer; + Renderer* _renderer; - NSTrackingArea *_trackingArea; + NSTrackingArea* _trackingArea; } - (void)viewWillDisappear diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index b9b4de1e..1979129d 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -1 +1,274 @@ #include "TaskSystem.h" + +// TODO: bury in system/cpp file +#if KRAM_MAC || KRAM_IOS + #include + #include +#elif KRAM_WIN + #include +#else + #include +#endif + +namespace kram { +using namespace NAMESPACE_STL; + +void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) +{ + // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ + // TODO: set affinity, but need to create a thread that doesn't launch + // so can set this up, and then run it. 
+ + auto handle = thread.native_handle(); + uint64_t affinityMask = ((uint64_t)1) << threadIndex; // for now only allow single thread mask + +#if KRAM_MAC || KRAM_IOS + thread_affinity_policy_data_t policy = { (int)affinityMask }; + + // TODO: check return + thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); + +#elif KRAM_WIN + // each processor group only has 64 bits + SetThreadAffinityMask(handle, &affinityMask); +#else + // most systems are pthread-based, this is represented with array of bits + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(threadIndex, &cpuset); + + // TODO: check return + /*int rc = */ pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); +#endif +} + +void task_system::run(int32_t threadIndex) +{ + while (true) { + // pop() wait avoids a spinloop. + + function f; + + // start with ours, but steal from other queues if nothing found + // Note that if threadIndex queue is empty and stays empty + // then pop() below will stop using that thread. But async_ is round-robining + // all work across the available queues. + int32_t multiple = 4; // 32; + int32_t numTries = 0; + for (int32_t n = 0, nEnd = _count * multiple; n < nEnd; ++n) { + numTries++; + + // break for loop if work found + if (_q[(threadIndex + n) % _count].try_pop(f)) { + break; + } + } + + // numTries is 64 when queues are empty, and typically 1 when queues are full + //KLOGD("task_system", "thread %d searched %d tries", threadIndex, numTries); + + // if no task, and nothing to steal, pop own queue if possible + // pop blocks until it's queue receives tasks + if (!f && !_q[threadIndex].pop(f)) { + // shutdown if tasks have all been submitted and queue marked as done. 
+ if (_q[threadIndex].is_done()) { + KLOGD("task_system", "thread %d shutting down", threadIndex); + + break; + } + else { + KLOGD("task_system", "no work found for %d in %d tries", threadIndex, numTries); + + // keep searching + continue; + } + } + + // do the work + f(); + } +} + + +// TODO: don't want hyperthreads from hardware_concurrency +task_system::task_system(int32_t count) : + _count(std::min(count, (int32_t)std::thread::hardware_concurrency())), + _q{(size_t)_count}, + _index(0) +{ + // start up the threads + for (int32_t threadIndex = 0; threadIndex != _count; ++threadIndex) { + _threads.emplace_back([&, threadIndex] { run(threadIndex); }); + set_affinity(_threads.back(), threadIndex); + } + +} + +task_system::~task_system() +{ + // indicate that all tasks are submitted + for (auto& e : _q) + e.set_done(); + + // wait until threads are all done, but joining each thread + for (auto& e : _threads) + e.join(); +} + +} + + +/**************************************************************************************************/ + +// this autogens max threads even if none are used +//task_system _system; + +/**************************************************************************************************/ + +// There's already a std::future, so may want to look at that +// otherwise, this all implements async call which would be useful +// +//template +//struct result_of_; +// +//template +//struct result_of_ { using type = R; }; +// +//template +//using result_of_t_ = typename result_of_::type; +// +///**************************************************************************************************/ +// +//template +//struct shared_base { +// vector _r; // optional +// mutex _mutex; +// condition_variable _ready; +// vector> _then; +// +// virtual ~shared_base() { } +// +// void set(R&& r) { +// vector> then; +// { +// lock_t lock{_mutex}; +// _r.push_back(move(r)); +// swap(_then, then); +// } +// _ready.notify_all(); +// for (const auto& f : then) 
_system.async_(move(f)); +// } +// +// template +// void then(F&& f) { +// bool resolved{false}; +// { +// lock_t lock{_mutex}; +// if (_r.empty()) _then.push_back(forward(f)); +// else resolved = true; +// } +// if (resolved) _system.async_(move(f)); +// } +// +// const R& get() { +// lock_t lock{_mutex}; +// while (_r.empty()) _ready.wait(lock); +// return _r.back(); +// } +//}; +// +//template struct shared; // not defined +// +//template +//struct shared : shared_base { +// function _f; +// +// template +// shared(F&& f) : _f(forward(f)) { } +// +// template +// void operator()(A&&... args) { +// this->set(_f(forward(args)...)); +// _f = nullptr; +// } +//}; +// +//template class packaged_task; //not defined +//template class future; +// +//template +//auto package(F&& f) -> pair, future>>; +// +//template +//class future { +// shared_ptr> _p; +// +// template +// friend auto package(F&& f) -> pair, future>>; +// +// explicit future(shared_ptr> p) : _p(move(p)) { } +// public: +// future() = default; +// +// template +// auto then(F&& f) { +// auto pack = package()>([p = _p, f = forward(f)](){ +// return f(p->_r.back()); +// }); +// _p->then(move(pack.first)); +// return pack.second; +// } +// +// const R& get() const { return _p->get(); } +//}; +// +//template +//class packaged_task { +// weak_ptr> _p; +// +// template +// friend auto package(F&& f) -> pair, future>>; +// +// explicit packaged_task(weak_ptr> p) : _p(move(p)) { } +// +// public: +// packaged_task() = default; +// +// template +// void operator()(A&&... args) const { +// auto p = _p.lock(); +// if (p) (*p)(forward(args)...); +// } +//}; +// +//template +//auto package(F&& f) -> pair, future>> { +// auto p = make_shared>(forward(f)); +// return make_pair(packaged_task(p), future>(p)); +//} +// +///**************************************************************************************************/ +// +//template +//auto async(F&& f, Args&&... 
args) +//{ +// using result_type = result_of_t; +// using packaged_type = packaged_task; +// +// auto pack = package(bind(forward(f), forward(args)...)); +// +// _system.async_(move(get<0>(pack))); +// return get<1>(pack); +//} + +/**************************************************************************************************/ + +//int32_t main() { +// future x = async([]{ return fibonacci(100); }); +// +// future y = x.then([](const cpp_int& x){ return cpp_int(x * 2); }); +// future z = x.then([](const cpp_int& x){ return cpp_int(x / 15); }); +// +// cout << y.get() << endl; +// cout << z.get() << endl; +//} diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index 3805841b..b1b35262 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -17,6 +17,8 @@ #include //#include + + /**************************************************************************************************/ namespace kram { @@ -130,75 +132,15 @@ class task_system { vector _q; std::atomic _index; - void run(int32_t threadIndex) - { - while (true) { - // pop() wait avoids a spinloop. - - function f; - - // start with ours, but steal from other queues if nothing found - // Note that if threadIndex queue is empty and stays empty - // then pop() below will stop using that thread. But async_ is round-robining - // all work across the available queues. - int32_t multiple = 4; // 32; - int32_t numTries = 0; - for (int32_t n = 0, nEnd = _count * multiple; n < nEnd; ++n) { - numTries++; - - // break for loop if work found - if (_q[(threadIndex + n) % _count].try_pop(f)) { - break; - } - } - - // numTries is 64 when queues are empty, and typically 1 when queues are full - //KLOGD("task_system", "thread %d searched %d tries", threadIndex, numTries); - - // if no task, and nothing to steal, pop own queue if possible - // pop blocks until it's queue receives tasks - if (!f && !_q[threadIndex].pop(f)) { - // shutdown if tasks have all been submitted and queue marked as done. 
- if (_q[threadIndex].is_done()) { - KLOGD("task_system", "thread %d shutting down", threadIndex); - - break; - } - else { - KLOGD("task_system", "no work found for %d in %d tries", threadIndex, numTries); - - // keep searching - continue; - } - } - - // do the work - f(); - } - } + void run(int32_t threadIndex); + void set_affinity(std::thread& thread, uint32_t threadIndex); + public: - task_system(int32_t count = 1) : _count(std::min(count, (int32_t)std::thread::hardware_concurrency())), _q{(size_t)_count}, _index(0) - { - // start up the threads - for (int32_t threadIndex = 0; threadIndex != _count; ++threadIndex) { - _threads.emplace_back([&, threadIndex] { run(threadIndex); }); - } - } - - ~task_system() - { - // indicate that all tasks are submitted - for (auto& e : _q) e.set_done(); + task_system(int32_t count = 1); + ~task_system(); - // wait until threads are all done, but joining each thread - for (auto& e : _threads) e.join(); - } - - int32_t num_threads() const - { - return _count; - } + int32_t num_threads() const { return _count; } template void async_(F&& f) @@ -221,159 +163,5 @@ class task_system { } }; -/**************************************************************************************************/ - -// this autogens max threads even if none are used -//task_system _system; - -/**************************************************************************************************/ - -// There's already a std::future, so may want to look at that -// otherwise, this all implements async call which would be useful -// -//template -//struct result_of_; -// -//template -//struct result_of_ { using type = R; }; -// -//template -//using result_of_t_ = typename result_of_::type; -// -///**************************************************************************************************/ -// -//template -//struct shared_base { -// vector _r; // optional -// mutex _mutex; -// condition_variable _ready; -// vector> _then; -// -// virtual ~shared_base() { } -// 
-// void set(R&& r) { -// vector> then; -// { -// lock_t lock{_mutex}; -// _r.push_back(move(r)); -// swap(_then, then); -// } -// _ready.notify_all(); -// for (const auto& f : then) _system.async_(move(f)); -// } -// -// template -// void then(F&& f) { -// bool resolved{false}; -// { -// lock_t lock{_mutex}; -// if (_r.empty()) _then.push_back(forward(f)); -// else resolved = true; -// } -// if (resolved) _system.async_(move(f)); -// } -// -// const R& get() { -// lock_t lock{_mutex}; -// while (_r.empty()) _ready.wait(lock); -// return _r.back(); -// } -//}; -// -//template struct shared; // not defined -// -//template -//struct shared : shared_base { -// function _f; -// -// template -// shared(F&& f) : _f(forward(f)) { } -// -// template -// void operator()(A&&... args) { -// this->set(_f(forward(args)...)); -// _f = nullptr; -// } -//}; -// -//template class packaged_task; //not defined -//template class future; -// -//template -//auto package(F&& f) -> pair, future>>; -// -//template -//class future { -// shared_ptr> _p; -// -// template -// friend auto package(F&& f) -> pair, future>>; -// -// explicit future(shared_ptr> p) : _p(move(p)) { } -// public: -// future() = default; -// -// template -// auto then(F&& f) { -// auto pack = package()>([p = _p, f = forward(f)](){ -// return f(p->_r.back()); -// }); -// _p->then(move(pack.first)); -// return pack.second; -// } -// -// const R& get() const { return _p->get(); } -//}; -// -//template -//class packaged_task { -// weak_ptr> _p; -// -// template -// friend auto package(F&& f) -> pair, future>>; -// -// explicit packaged_task(weak_ptr> p) : _p(move(p)) { } -// -// public: -// packaged_task() = default; -// -// template -// void operator()(A&&... 
args) const { -// auto p = _p.lock(); -// if (p) (*p)(forward(args)...); -// } -//}; -// -//template -//auto package(F&& f) -> pair, future>> { -// auto p = make_shared>(forward(f)); -// return make_pair(packaged_task(p), future>(p)); -//} -// -///**************************************************************************************************/ -// -//template -//auto async(F&& f, Args&&... args) -//{ -// using result_type = result_of_t; -// using packaged_type = packaged_task; -// -// auto pack = package(bind(forward(f), forward(args)...)); -// -// _system.async_(move(get<0>(pack))); -// return get<1>(pack); -//} - -/**************************************************************************************************/ - -//int32_t main() { -// future x = async([]{ return fibonacci(100); }); -// -// future y = x.then([](const cpp_int& x){ return cpp_int(x * 2); }); -// future z = x.then([](const cpp_int& x){ return cpp_int(x / 15); }); -// -// cout << y.get() << endl; -// cout << z.get() << endl; -//} } // namespace kram From 71aa6a25641f16376aa2a137e824f9b96136cced Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Wed, 27 Apr 2022 23:40:51 -0700 Subject: [PATCH 005/615] kram - add dds read support for dx9 style files These have to infer the format from fourCC data and other flags. Only read, writes occur to dx10 style. This should help viewing legacy files, and also generating thumbnails for them. Only a subset of the D3DFMT are supported. 
--- libkram/kram/KramDDSHelper.cpp | 291 +++++++++++++++++++++++++++++---- 1 file changed, 257 insertions(+), 34 deletions(-) diff --git a/libkram/kram/KramDDSHelper.cpp b/libkram/kram/KramDDSHelper.cpp index 651e4a21..595b3390 100644 --- a/libkram/kram/KramDDSHelper.cpp +++ b/libkram/kram/KramDDSHelper.cpp @@ -16,6 +16,104 @@ const uint32_t DDS_MAGIC = 0x20534444; // "DDS " ((uint32_t)(uint8_t)(str[0]) | ((uint32_t)(uint8_t)(str[1]) << 8) | \ ((uint32_t)(uint8_t)(str[2]) << 16) | ((uint32_t)(uint8_t)(str[3]) << 24 )) +// DX9 era formats, only for reading dds files in dx9 style +enum D3DFORMAT : uint32_t +{ + D3DFMT_UNKNOWN = 0, + +// D3DFMT_R8G8B8 = 20, + D3DFMT_A8R8G8B8 = 21, +// D3DFMT_X8R8G8B8 = 22, +// D3DFMT_R5G6B5 = 23, +// D3DFMT_X1R5G5B5 = 24, +// D3DFMT_A1R5G5B5 = 25, +// D3DFMT_A4R4G4B4 = 26, +// D3DFMT_R3G3B2 = 27, +// D3DFMT_A8 = 28, +// D3DFMT_A8R3G3B2 = 29, +// D3DFMT_X4R4G4B4 = 30, +// D3DFMT_A2B10G10R10 = 31, + D3DFMT_A8B8G8R8 = 32, +// D3DFMT_X8B8G8R8 = 33, +// D3DFMT_G16R16 = 34, +// D3DFMT_A2R10G10B10 = 35, +// D3DFMT_A16B16G16R16 = 36, + +// D3DFMT_A8P8 = 40, +// D3DFMT_P8 = 41, + +// D3DFMT_L8 = 50, +// D3DFMT_A8L8 = 51, +// D3DFMT_A4L4 = 52, + +// D3DFMT_V8U8 = 60, +// D3DFMT_L6V5U5 = 61, +// D3DFMT_X8L8V8U8 = 62, +// D3DFMT_Q8W8V8U8 = 63, +// D3DFMT_V16U16 = 64, +// D3DFMT_A2W10V10U10 = 67, + +// D3DFMT_UYVY = MAKEFOURCC("UYVY"), +// D3DFMT_R8G8_B8G8 = MAKEFOURCC("RGBG"), +// D3DFMT_YUY2 = MAKEFOURCC("YUY2"), +// D3DFMT_G8R8_G8B8 = MAKEFOURCC("GRGB"), + + D3DFMT_DXT1 = MAKEFOURCC("DXT1"), + D3DFMT_DXT2 = MAKEFOURCC("DXT2"), + D3DFMT_DXT3 = MAKEFOURCC("DXT3"), + D3DFMT_DXT4 = MAKEFOURCC("DXT4"), + D3DFMT_DXT5 = MAKEFOURCC("DXT5"), + + // Not worth support dx9-style files, these don't even hold srgb state + D3DFMT_ATI1 = MAKEFOURCC("ATI1"), + D3DFMT_BC4U = MAKEFOURCC("BC4U"), + D3DFMT_BC4S = MAKEFOURCC("BC4S"), + + D3DFMT_ATI2 = MAKEFOURCC("ATI2"), + D3DFMT_BC5U = MAKEFOURCC("BC5U"), + D3DFMT_BC5S = MAKEFOURCC("BC5S"), + +// 
D3DFMT_D16_LOCKABLE = 70, +// D3DFMT_D32 = 71, +// D3DFMT_D15S1 = 73, +// D3DFMT_D24S8 = 75, +// D3DFMT_D24X8 = 77, +// D3DFMT_D24X4S4 = 79, +// D3DFMT_D16 = 80, +// +// D3DFMT_D32F_LOCKABLE = 82, +// D3DFMT_D24FS8 = 83, + + //D3DFMT_D32_LOCKABLE = 84, + //D3DFMT_S8_LOCKABLE = 85, + +// D3DFMT_L16 = 81, +// +// D3DFMT_VERTEXDATA =100, +// D3DFMT_INDEX16 =101, +// D3DFMT_INDEX32 =102, + + //D3DFMT_Q16W16V16U16 =110, + + //D3DFMT_MULTI2_ARGB8 = MAKEFOURCC("MET1"), + + D3DFMT_R16F = 111, + D3DFMT_G16R16F = 112, + D3DFMT_A16B16G16R16F = 113, + + D3DFMT_R32F = 114, + D3DFMT_G32R32F = 115, + D3DFMT_A32B32G32R32F = 116, + +// D3DFMT_CxV8U8 = 117, + + //D3DFMT_A1 = 118, + //D3DFMT_A2B10G10R10_XR_BIAS = 119, + //D3DFMT_BINARYBUFFER = 199, + + D3DFMT_FORCE_DWORD =0x7fffffff +}; + enum DDS_FLAGS : uint32_t { @@ -34,10 +132,10 @@ enum DDS_FLAGS : uint32_t DDSPF_ALPHAPIXELS = 0x00000001, DDSPF_FOURCC = 0x00000004, DDSPF_RGB = 0x00000040, - //DDSPF_LUMINANCE = 0x00020000, - //DDSPF_ALPHA = 0x00000002, + DDSPF_LUMINANCE = 0x00020000, // dx9 + DDSPF_ALPHA = 0x00000002, // dx9 //DDSPF_BUMPDUDV = 0x00080000, - + // caps DDSCAPS_TEXTURE = 0x00001000, DDSCAPS_MIPMAP = 0x00400000, @@ -63,10 +161,6 @@ enum DDS_FLAGS : uint32_t DDS_ALPHA_MODE_PREMULTIPLIED = 2, DDS_ALPHA_MODE_OPAQUE = 3, DDS_ALPHA_MODE_CUSTOM = 4, - - // Not worth support dx9-style files, these don't even hold srgb state - //FOURCC_BC1 = MAKEFOURCC("DXT1"), - //FOURCC_BC3 = MAKEFOURCC("DXT5"), }; struct DDS_PIXELFORMAT @@ -108,10 +202,115 @@ struct DDS_HEADER_DXT10 uint32_t miscFlags2; }; +// DX9 bitmask parsing adapted from GetPixelFormat() call here https://github.com/microsoft/DirectXTex/blob/main/DDSTextureLoader/DDSTextureLoader12.cpp +static MyMTLPixelFormat getMetalFormatFromDDS9(const DDS_PIXELFORMAT& ddpf) +{ + // Copyright (c) Microsoft Corporation. + // Licensed under the MIT License. 
+ #define ISBITMASK( r,g,b,a ) ( ddpf.RBitMask == r && ddpf.GBitMask == g && ddpf.BBitMask == b && ddpf.ABitMask == a ) + + if (ddpf.flags & DDSPF_RGB) + { + // Note that sRGB formats are written using the "DX10" extended header + // here would need to force the format to an srgb format from cli + switch (ddpf.RGBBitCount) + { + case 32: + if (ISBITMASK(0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000)) + { + return MyMTLPixelFormatRGBA8Unorm; + } + + if (ISBITMASK(0xffffffff, 0, 0, 0)) + { + // Only 32-bit color channel format in D3D9 was R32F + return MyMTLPixelFormatR32Float; // D3DX writes this out as a FourCC of 114 + } + break; + + case 8: + // NVTT versions 1.x wrote this as RGB instead of LUMINANCE + if (ISBITMASK(0xff, 0, 0, 0)) + { + return MyMTLPixelFormatR8Unorm; + } + + // No 3:3:2 or paletted DXGI formats aka D3DFMT_R3G3B2, D3DFMT_P8 + break; + } + } + else if (ddpf.flags & DDSPF_LUMINANCE) + { + // TODO: need rrrg swizzle on these + switch (ddpf.RGBBitCount) + { + case 16: + if (ISBITMASK(0x00ff, 0, 0, 0xff00)) + { + return MyMTLPixelFormatRG8Unorm; // D3DX10/11 writes this out as DX10 extension + } + break; + + case 8: + if (ISBITMASK(0xff, 0, 0, 0)) + { + return MyMTLPixelFormatR8Unorm; // D3DX10/11 writes this out as DX10 extension + } + + // No DXGI format maps to ISBITMASK(0x0f,0,0,0xf0) aka D3DFMT_A4L4 + + if (ISBITMASK(0x00ff, 0, 0, 0xff00)) + { + return MyMTLPixelFormatRG8Unorm; // Some DDS writers assume the bitcount should be 8 instead of 16 + } + break; + } + } + else if (ddpf.flags & DDSPF_ALPHA) + { + if (8 == ddpf.RGBBitCount) + { + // TODO: need rrrr swizzle + return MyMTLPixelFormatR8Unorm; // really A8, but use a swizzle + } + } + else if (ddpf.flags & DDSPF_FOURCC) + { + switch (ddpf.fourCC) + { + case D3DFMT_DXT1: return MyMTLPixelFormatBC1_RGBA; + //case D3DFMT_DXT2: return MyMTLPixelFormatBC2_RGBA; // isPremul + //case D3DFMT_DXT3: return MyMTLPixelFormatBC2_RGBA; + case D3DFMT_DXT4: return MyMTLPixelFormatBC3_RGBA; // isPremul + 
case D3DFMT_DXT5: return MyMTLPixelFormatBC3_RGBA; + + case D3DFMT_ATI1: return MyMTLPixelFormatBC4_RUnorm; + case D3DFMT_BC4U: return MyMTLPixelFormatBC4_RUnorm; + case D3DFMT_BC4S: return MyMTLPixelFormatBC4_RSnorm; + + case D3DFMT_ATI2: return MyMTLPixelFormatBC5_RGUnorm; + case D3DFMT_BC5U: return MyMTLPixelFormatBC5_RGUnorm; + case D3DFMT_BC5S: return MyMTLPixelFormatBC5_RGSnorm; + + case D3DFMT_R16F: return MyMTLPixelFormatR16Float; + case D3DFMT_G16R16F: return MyMTLPixelFormatRG16Float; + case D3DFMT_A16B16G16R16F: return MyMTLPixelFormatRGBA16Float; + + case D3DFMT_R32F: return MyMTLPixelFormatR32Float; + case D3DFMT_G32R32F: return MyMTLPixelFormatRG32Float; + case D3DFMT_A32B32G32R32F: return MyMTLPixelFormatRGBA32Float; + } + + } + + return MyMTLPixelFormatInvalid; + #undef ISBITMASK +} + bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool isInfoOnly) { const uint32_t magicSize = sizeof(uint32_t); - uint32_t mipDataOffset = magicSize + sizeof(DDS_HEADER) + sizeof(DDS_HEADER_DXT10); + uint32_t mipDataOffset = magicSize + sizeof(DDS_HEADER); if (dataSize <= mipDataOffset) { KLOGE("kram", "bad dataSize too small %zu <= %d", dataSize, mipDataOffset); @@ -120,7 +319,6 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool const uint32_t& magic = *(const uint32_t*)data; const DDS_HEADER& hdr = *(const DDS_HEADER*)(data + magicSize); - const DDS_HEADER_DXT10& hdr10 = *(const DDS_HEADER_DXT10*)(data + magicSize + sizeof(DDS_HEADER)); const DDS_PIXELFORMAT& format = hdr.ddspf; if (magic != DDS_MAGIC) { @@ -128,7 +326,6 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool return false; } - // only load DX10 formatted DDS for now if (hdr.size != sizeof(DDS_HEADER)) { KLOGE("kram", "bad header size %d", hdr.size); return false; @@ -143,15 +340,22 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool KLOGE("kram", "missing format.fourCC flag"); 
return false; } - if (format.fourCC != FOURCC_DX10) { - KLOGE("kram", "format.fourCC 0x%08X must be DX10", format.fourCC); - return false; + + bool isDDS10 = format.fourCC == FOURCC_DX10; + const DDS_HEADER_DXT10& hdr10 = *(const DDS_HEADER_DXT10*)(data + magicSize + sizeof(DDS_HEADER)); + + MyMTLPixelFormat pixelFormat = MyMTLPixelFormatInvalid; + if (isDDS10) { + mipDataOffset += sizeof(DDS_HEADER_DXT10); + pixelFormat = directxToMetalFormat(hdr10.dxgiFormat); + } + else { + pixelFormat = getMetalFormatFromDDS9(format); } // Kram only supports a subset of DDS formats - auto pixelFormat = directxToMetalFormat(hdr10.dxgiFormat); if (pixelFormat == MyMTLPixelFormatInvalid) { - KLOGE("kram", "bad format.dxgiFormat %d", hdr10.dxgiFormat); + KLOGE("kram", "unsupported dds format"); return false; } @@ -161,7 +365,11 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool uint32_t depth = (hdr.flags & DDSD_DEPTH) ? hdr.depth : 1; uint32_t mipCount = (hdr.flags & DDSD_MIPMAPCOUNT) ? hdr.mipMapCount : 1; - uint32_t arrayCount = hdr10.arraySize; + uint32_t arrayCount = 1; + + if (isDDS10) { + arrayCount = hdr10.arraySize; + } // make sure that counts are reasonable const uint32_t kMaxMipCount = 16; @@ -202,30 +410,45 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool if (arrayCount == 0) arrayCount = 1; - bool isCube = (hdr10.miscFlag & DDS_RESOURCE_MISC_TEXTURECUBE); + bool isCube = false; bool isArray = arrayCount > 1; + bool isPremul = false; - switch(hdr10.resourceDimension) { - case DDS_DIMENSION_TEXTURE1D: - image.textureType = MyMTLTextureType1DArray; - isArray = true; // kram doesn't support 1d - break; - case DDS_DIMENSION_TEXTURE2D: - if (isCube) { - image.textureType = isArray ? MyMTLTextureTypeCubeArray : MyMTLTextureTypeCube; - } - else { - image.textureType = isArray ? 
MyMTLTextureType2DArray : MyMTLTextureType2D; - } - break; - case DDS_DIMENSION_TEXTURE3D: + if (isDDS10) { + isCube = (hdr10.miscFlag & DDS_RESOURCE_MISC_TEXTURECUBE); + + switch(hdr10.resourceDimension) { + case DDS_DIMENSION_TEXTURE1D: + image.textureType = MyMTLTextureType1DArray; + isArray = true; // kram doesn't support 1d + break; + case DDS_DIMENSION_TEXTURE2D: + if (isCube) { + image.textureType = isArray ? MyMTLTextureTypeCubeArray : MyMTLTextureTypeCube; + } + else { + image.textureType = isArray ? MyMTLTextureType2DArray : MyMTLTextureType2D; + } + break; + case DDS_DIMENSION_TEXTURE3D: + image.textureType = MyMTLTextureType3D; + isArray = false; + break; + } + isPremul = (hdr10.miscFlags2 & DDS_ALPHA_MODE_PREMULTIPLIED) != 0; + } + else { + isArray = false; + + if (hdr.flags & DDSD_DEPTH) { image.textureType = MyMTLTextureType3D; - isArray = false; - break; + } + else if (hdr.caps2 & DDSCAPS2_CUBEMAP) { + image.textureType = MyMTLTextureTypeCube; + } } // transfer premul setting, would like to not depend on "info" to carry this - bool isPremul = (hdr10.miscFlags2 & DDS_ALPHA_MODE_PREMULTIPLIED) != 0; if (isPremul) image.addChannelProps("Alb.ra,Alb.ga,Alb.ba,Alb.a"); From f37f50b37368a0cd90361745a1eeedf5d91e25d6 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Thu, 28 Apr 2022 00:03:05 -0700 Subject: [PATCH 006/615] kram - fix typo on task system, and mark warning on miniz to fix later. 
--- libkram/kram/TaskSystem.cpp | 2 +- libkram/miniz/miniz.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 1979129d..da3a8446 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -30,7 +30,7 @@ void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) #elif KRAM_WIN // each processor group only has 64 bits - SetThreadAffinityMask(handle, &affinityMask); + SetThreadAffinityMask(handle, (DWORD_PTR)&affinityMask); #else // most systems are pthread-based, this is represented with array of bits cpu_set_t cpuset; diff --git a/libkram/miniz/miniz.cpp b/libkram/miniz/miniz.cpp index 431c442f..a62263fc 100644 --- a/libkram/miniz/miniz.cpp +++ b/libkram/miniz/miniz.cpp @@ -2418,6 +2418,7 @@ tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_nex TINFL_GET_BYTE(2, r->m_zhdr1); counter = (((r->m_zhdr0 * 256 + r->m_zhdr1) % 31 != 0) || (r->m_zhdr1 & 32) || ((r->m_zhdr0 & 15) != 8)); if (!(decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) + // TODO: fix warning C4334: '<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?) counter |= (((1U << (8U + (r->m_zhdr0 >> 4))) > 32768U) || ((out_buf_size_mask + 1) < (size_t)(1U << (8U + (r->m_zhdr0 >> 4))))); if (counter) { From bfa890bf76f74d3560a114463676eb699d226592 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Apr 2022 10:17:31 -0700 Subject: [PATCH 007/615] kram - update to astc-encoder 3.7 This should be faster. Make sure to re-encode, since older encoder had some block encoding artifacts. Those were already fixed by the 3.5 update, but I had some stored textures with the artifacts. 
--- libkram/astc-encoder/astcenc.h | 67 +- .../astcenc_averages_and_directions.cpp | 632 ++++++++++++++---- libkram/astc-encoder/astcenc_block_sizes.cpp | 318 +++++---- .../astc-encoder/astcenc_color_quantize.cpp | 257 +++---- .../astc-encoder/astcenc_color_unquantize.cpp | 85 ++- .../astcenc_compress_symbolic.cpp | 139 ++-- .../astc-encoder/astcenc_compute_variance.cpp | 29 +- .../astcenc_decompress_symbolic.cpp | 21 +- libkram/astc-encoder/astcenc_entry.cpp | 50 +- .../astcenc_find_best_partitioning.cpp | 106 ++- .../astcenc_ideal_endpoints_and_weights.cpp | 188 ++++-- libkram/astc-encoder/astcenc_image.cpp | 84 ++- libkram/astc-encoder/astcenc_internal.h | 123 ++-- libkram/astc-encoder/astcenc_mathlib.h | 43 +- .../astcenc_mathlib_softfloat.cpp | 2 +- .../astc-encoder/astcenc_partition_tables.cpp | 83 ++- .../astcenc_percentile_tables.cpp | 2 +- .../astcenc_pick_best_endpoint_format.cpp | 144 ++-- libkram/astc-encoder/astcenc_quantization.cpp | 166 +---- .../astcenc_symbolic_physical.cpp | 7 +- libkram/astc-encoder/astcenc_vecmathlib.h | 14 +- .../astc-encoder/astcenc_vecmathlib_avx2_8.h | 118 ++-- .../astcenc_vecmathlib_common_4.h | 73 +- .../astc-encoder/astcenc_vecmathlib_neon_4.h | 30 +- .../astc-encoder/astcenc_vecmathlib_none_4.h | 13 +- .../astc-encoder/astcenc_vecmathlib_sse_4.h | 102 ++- libkram/astc-encoder/astcenc_weight_align.cpp | 120 ++-- 27 files changed, 1834 insertions(+), 1182 deletions(-) diff --git a/libkram/astc-encoder/astcenc.h b/libkram/astc-encoder/astcenc.h index f98fa7c6..a5e2b646 100644 --- a/libkram/astc-encoder/astcenc.h +++ b/libkram/astc-encoder/astcenc.h @@ -142,15 +142,18 @@ * * A normal context is capable of decompressing any ASTC texture, including those generated by other * compressors with unknown heuristics. This is the most flexible implementation, but forces the - * main data tables used by the codec to include entries that are not needed during compressor. This - * can slow down compression by ~15%. 
To optimize this use case the context can be created with the - * ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will only be asked to - * decompress images that it compressed, allowing the size of the context structures to be - * substantially reduced with a corresponding boost in performance. - * - * Attempting to decompress an valid image which was created by another compressor, or even another - * astcenc compressor configuration, may result in blocks returning as solid magenta or NaN values - * if they use unsupported encodings for that configuration. + * data tables used by the codec to include entries that are not needed during compression. This + * can slow down context creation by a significant amount, especially for the faster compression + * modes where few data table entries are actually used. To optimize this use case the context can + * be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will + * only be asked to decompress images that it compressed itself, allowing the data tables to + * exclude entries that are not needed by the current compression configuration. This reduces the + * size of the context data tables in memory and improves context creation performance. Note that, + * as of the 3.6 release, this flag no longer affects compression performance. + * + * Using this flag while attempting to decompress an valid image which was created by another + * compressor, or even another astcenc compressor version or configuration, may result in blocks + * returning as solid magenta or NaN value error blocks. */ #ifndef ASTCENC_INCLUDED @@ -308,29 +311,6 @@ static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0; */ static const unsigned int ASTCENC_FLG_MAP_MASK = 1 << 1; -/** - * @brief Enable RGBM map compression. - * - * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper - * format. 
Data must be preprocessed by the user to be in LDR RGBM format before calling the - * compression function, this flag is only used to control the use of RGBM-specific heuristics and - * error metrics. - * - * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small - * M values can round to zero due to quantization and result in black or white pixels. It is highly - * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try - * 16 or 32). Applying this threshold reduces the number of very dark colors that can be - * represented, but is still higher precision than 8-bit LDR. - * - * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale - * factor used during reconstruction. This defaults to 5 when in RGBM mode. - * - * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier - * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode, - * matching the default scale factor. - */ -static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6; - /** * @brief Enable alpha weighting. * @@ -366,6 +346,29 @@ static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4; */ static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5; +/** + * @brief Enable RGBM map compression. + * + * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper + * format. Data must be preprocessed by the user to be in LDR RGBM format before calling the + * compression function, this flag is only used to control the use of RGBM-specific heuristics and + * error metrics. + * + * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small + * M values can round to zero due to quantization and result in black or white pixels. 
It is highly + * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try + * 16 or 32). Applying this threshold reduces the number of very dark colors that can be + * represented, but is still higher precision than 8-bit LDR. + * + * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale + * factor used during reconstruction. This defaults to 5 when in RGBM mode. + * + * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier + * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode, + * matching the default scale factor. + */ +static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6; + /** * @brief The bit mask of all valid flags. */ diff --git a/libkram/astc-encoder/astcenc_averages_and_directions.cpp b/libkram/astc-encoder/astcenc_averages_and_directions.cpp index 3002928d..2ceb83db 100644 --- a/libkram/astc-encoder/astcenc_averages_and_directions.cpp +++ b/libkram/astc-encoder/astcenc_averages_and_directions.cpp @@ -24,6 +24,366 @@ #include +/** + * @brief Compute the average RGB color of each partition. + * + * The algorithm here uses a vectorized sequential scan and per-partition + * color accumulators, using select() to mask texel lanes in other partitions. + * + * We only accumulate sums for N-1 partitions during the scan; the value for + * the last partition can be computed given that we know the block-wide average + * already. + * + * Because of this we could reduce the loop iteration count so it "just" spans + * the max texel index needed for the N-1 partitions, which could need fewer + * iterations than the full block texel count. However, this makes the loop + * count erratic and causes more branch mispredictions so is a net loss. + * + * @param pi The partitioning to use. + * @param blk The block data to process. + * @param[out] averages The output averages. 
Unused partition indices will + * not be initialized, and lane<3> will be zero. + */ +static void compute_partition_averages_rgb( + const partition_info& pi, + const image_block& blk, + vfloat4 averages[BLOCK_MAX_PARTITIONS] +) { + unsigned int partition_count = pi.partition_count; + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + // For 1 partition just use the precomputed mean + if (partition_count == 1) + { + averages[0] = blk.data_mean.swz<0, 1, 2>(); + } + // For 2 partitions scan results for partition 0, compute partition 1 + else if (partition_count == 2) + { + vfloatacc pp_avg_rgb[3] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgb[0], data_r, p0_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgb[1], data_g, p0_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgb[2], data_b, p0_mask); + } + + vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast(blk.texel_count); + + vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]), + hadd_s(pp_avg_rgb[1]), + hadd_s(pp_avg_rgb[2])); + + vfloat4 p1_total = block_total - p0_total; + + averages[0] = p0_total / static_cast(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast(pi.partition_texel_count[1]); + } + // For 3 partitions scan results for partition 0/1, compute partition 2 + else if (partition_count == 3) + { + vfloatacc pp_avg_rgb[2][3] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + 
vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); + } + + vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast(blk.texel_count); + + vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), + hadd_s(pp_avg_rgb[0][1]), + hadd_s(pp_avg_rgb[0][2])); + + vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), + hadd_s(pp_avg_rgb[1][1]), + hadd_s(pp_avg_rgb[1][2])); + + vfloat4 p2_total = block_total - p0_total - p1_total; + + averages[0] = p0_total / static_cast(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast(pi.partition_texel_count[2]); + } + else + { + // For 4 partitions scan results for partition 0/1/2, compute partition 3 + vfloatacc pp_avg_rgb[3][3] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + vmask p2_mask = lane_mask & (texel_partition == vint(2)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); + haccumulate(pp_avg_rgb[2][0], data_r, p2_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); + 
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); + haccumulate(pp_avg_rgb[2][1], data_g, p2_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); + haccumulate(pp_avg_rgb[2][2], data_b, p2_mask); + } + + vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast(blk.texel_count); + + vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), + hadd_s(pp_avg_rgb[0][1]), + hadd_s(pp_avg_rgb[0][2])); + + vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), + hadd_s(pp_avg_rgb[1][1]), + hadd_s(pp_avg_rgb[1][2])); + + vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]), + hadd_s(pp_avg_rgb[2][1]), + hadd_s(pp_avg_rgb[2][2])); + + vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; + + averages[0] = p0_total / static_cast(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast(pi.partition_texel_count[2]); + averages[3] = p3_total / static_cast(pi.partition_texel_count[3]); + } +} + +/** + * @brief Compute the average RGBA color of each partition. + * + * The algorithm here uses a vectorized sequential scan and per-partition + * color accumulators, using select() to mask texel lanes in other partitions. + * + * We only accumulate sums for N-1 partitions during the scan; the value for + * the last partition can be computed given that we know the block-wide average + * already. + * + * Because of this we could reduce the loop iteration count so it "just" spans + * the max texel index needed for the N-1 partitions, which could need fewer + * iterations than the full block texel count. However, this makes the loop + * count erratic and causes more branch mispredictions so is a net loss. + * + * @param pi The partitioning to use. + * @param blk The block data to process. + * @param[out] averages The output averages. Unused partition indices will + * not be initialized. 
+ */ +static void compute_partition_averages_rgba( + const partition_info& pi, + const image_block& blk, + vfloat4 averages[BLOCK_MAX_PARTITIONS] +) { + unsigned int partition_count = pi.partition_count; + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + // For 1 partition just use the precomputed mean + if (partition_count == 1) + { + averages[0] = blk.data_mean; + } + // For 2 partitions scan results for partition 0, compute partition 1 + else if (partition_count == 2) + { + vfloat4 pp_avg_rgba[4] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgba[0], data_r, p0_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgba[1], data_g, p0_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgba[2], data_b, p0_mask); + + vfloat data_a = loada(blk.data_a + i); + haccumulate(pp_avg_rgba[3], data_a, p0_mask); + } + + vfloat4 block_total = blk.data_mean * static_cast(blk.texel_count); + + vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]), + hadd_s(pp_avg_rgba[1]), + hadd_s(pp_avg_rgba[2]), + hadd_s(pp_avg_rgba[3])); + + vfloat4 p1_total = block_total - p0_total; + + averages[0] = p0_total / static_cast(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast(pi.partition_texel_count[1]); + } + // For 3 partitions scan results for partition 0/1, compute partition 2 + else if (partition_count == 3) + { + vfloat4 pp_avg_rgba[2][4] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += 
vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); + + vfloat data_a = loada(blk.data_a + i); + haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); + haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); + } + + vfloat4 block_total = blk.data_mean * static_cast(blk.texel_count); + + vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), + hadd_s(pp_avg_rgba[0][1]), + hadd_s(pp_avg_rgba[0][2]), + hadd_s(pp_avg_rgba[0][3])); + + vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), + hadd_s(pp_avg_rgba[1][1]), + hadd_s(pp_avg_rgba[1][2]), + hadd_s(pp_avg_rgba[1][3])); + + vfloat4 p2_total = block_total - p0_total - p1_total; + + averages[0] = p0_total / static_cast(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast(pi.partition_texel_count[2]); + } + else + { + // For 4 partitions scan results for partition 0/1/2, compute partition 3 + vfloat4 pp_avg_rgba[3][4] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + vmask p2_mask = lane_mask & (texel_partition == vint(2)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgba[0][0], 
data_r, p0_mask); + haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); + haccumulate(pp_avg_rgba[2][0], data_r, p2_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); + haccumulate(pp_avg_rgba[2][1], data_g, p2_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); + haccumulate(pp_avg_rgba[2][2], data_b, p2_mask); + + vfloat data_a = loada(blk.data_a + i); + haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); + haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); + haccumulate(pp_avg_rgba[2][3], data_a, p2_mask); + } + + vfloat4 block_total = blk.data_mean * static_cast(blk.texel_count); + + vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), + hadd_s(pp_avg_rgba[0][1]), + hadd_s(pp_avg_rgba[0][2]), + hadd_s(pp_avg_rgba[0][3])); + + vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), + hadd_s(pp_avg_rgba[1][1]), + hadd_s(pp_avg_rgba[1][2]), + hadd_s(pp_avg_rgba[1][3])); + + vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]), + hadd_s(pp_avg_rgba[2][1]), + hadd_s(pp_avg_rgba[2][2]), + hadd_s(pp_avg_rgba[2][3])); + + vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; + + averages[0] = p0_total / static_cast(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast(pi.partition_texel_count[2]); + averages[3] = p3_total / static_cast(pi.partition_texel_count[3]); + } +} + /* See header for documentation. 
*/ void compute_avgs_and_dirs_4_comp( const partition_info& pi, @@ -35,22 +395,17 @@ void compute_avgs_and_dirs_4_comp( int partition_count = pi.partition_count; promise(partition_count > 0); + // Pre-compute partition_averages + vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; + compute_partition_averages_rgba(pi, blk, partition_averages); + for (int partition = 0; partition < partition_count; partition++) { const uint8_t *texel_indexes = pi.texels_of_partition[partition]; unsigned int texel_count = pi.partition_texel_count[partition]; promise(texel_count > 0); - // TODO: Try gathers? - vfloat4 base_sum = vfloat4::zero(); - - for (unsigned int i = 0; i < texel_count; i++) - { - int iwt = texel_indexes[i]; - base_sum += blk.texel(iwt); - } - - vfloat4 average = base_sum / static_cast(texel_count); + vfloat4 average = partition_averages[partition]; pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); @@ -62,47 +417,46 @@ void compute_avgs_and_dirs_4_comp( { unsigned int iwt = texel_indexes[i]; vfloat4 texel_datum = blk.texel(iwt); - texel_datum = (texel_datum - average) * texel_weight; + texel_datum = texel_datum - average; vfloat4 zero = vfloat4::zero(); - vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; sum_xp += select(zero, texel_datum, tdm0); - vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; sum_yp += select(zero, texel_datum, tdm1); - vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero; + vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; sum_zp += select(zero, texel_datum, tdm2); - vmask4 tdm3 = vfloat4(texel_datum.lane<3>()) > zero; + vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero; sum_wp += select(zero, texel_datum, tdm3); } - float prod_xp = dot_s(sum_xp, sum_xp); - float prod_yp = dot_s(sum_yp, sum_yp); - float prod_zp = dot_s(sum_zp, sum_zp); - float prod_wp = dot_s(sum_wp, sum_wp); + sum_xp = sum_xp * texel_weight; + sum_yp = sum_yp * 
texel_weight; + sum_zp = sum_zp * texel_weight; + sum_wp = sum_wp * texel_weight; + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); + vfloat4 prod_zp = dot(sum_zp, sum_zp); + vfloat4 prod_wp = dot(sum_wp, sum_wp); vfloat4 best_vector = sum_xp; - float best_sum = prod_xp; + vfloat4 best_sum = prod_xp; - if (prod_yp > best_sum) - { - best_vector = sum_yp; - best_sum = prod_yp; - } + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + best_sum = select(best_sum, prod_yp, mask); - if (prod_zp > best_sum) - { - best_vector = sum_zp; - best_sum = prod_zp; - } + mask = prod_zp > best_sum; + best_vector = select(best_vector, sum_zp, mask); + best_sum = select(best_sum, prod_zp, mask); - if (prod_wp > best_sum) - { - best_vector = sum_wp; - } + mask = prod_wp > best_sum; + best_vector = select(best_vector, sum_wp, mask); pm[partition].dir = best_vector; } @@ -115,15 +469,25 @@ void compute_avgs_and_dirs_3_comp( unsigned int omitted_component, partition_metrics pm[BLOCK_MAX_PARTITIONS] ) { - float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; + // Pre-compute partition_averages + vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; + compute_partition_averages_rgba(pi, blk, partition_averages); + + float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); const float* data_vr = blk.data_r; const float* data_vg = blk.data_g; const float* data_vb = blk.data_b; + // TODO: Data-driven permute would be useful to avoid this ... 
if (omitted_component == 0) { - texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()) / 3.0f; + texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()); + + partition_averages[0] = partition_averages[0].swz<1, 2, 3>(); + partition_averages[1] = partition_averages[1].swz<1, 2, 3>(); + partition_averages[2] = partition_averages[2].swz<1, 2, 3>(); + partition_averages[3] = partition_averages[3].swz<1, 2, 3>(); data_vr = blk.data_g; data_vg = blk.data_b; @@ -131,17 +495,36 @@ void compute_avgs_and_dirs_3_comp( } else if (omitted_component == 1) { - texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()) / 3.0f; + texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); + + partition_averages[0] = partition_averages[0].swz<0, 2, 3>(); + partition_averages[1] = partition_averages[1].swz<0, 2, 3>(); + partition_averages[2] = partition_averages[2].swz<0, 2, 3>(); + partition_averages[3] = partition_averages[3].swz<0, 2, 3>(); data_vg = blk.data_b; data_vb = blk.data_a; } else if (omitted_component == 2) { - texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()) / 3.0f; + texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); + + partition_averages[0] = partition_averages[0].swz<0, 1, 3>(); + partition_averages[1] = partition_averages[1].swz<0, 1, 3>(); + partition_averages[2] = partition_averages[2].swz<0, 1, 3>(); + partition_averages[3] = partition_averages[3].swz<0, 1, 3>(); data_vb = blk.data_a; } + else + { + partition_averages[0] = partition_averages[0].swz<0, 1, 2>(); + partition_averages[1] = partition_averages[1].swz<0, 1, 2>(); + partition_averages[2] = partition_averages[2].swz<0, 1, 2>(); + partition_averages[3] = partition_averages[3].swz<0, 1, 2>(); + } + + texel_weight = texel_weight * (1.0f / 3.0f); unsigned int partition_count = pi.partition_count; promise(partition_count > 0); @@ -152,14 +535,7 @@ void compute_avgs_and_dirs_3_comp( unsigned int texel_count = pi.partition_texel_count[partition]; promise(texel_count > 0); - vfloat4 base_sum = 
vfloat4::zero(); - for (unsigned int i = 0; i < texel_count; i++) - { - unsigned int iwt = texel_indexes[i]; - base_sum += vfloat3(data_vr[iwt], data_vg[iwt], data_vb[iwt]); - } - - vfloat4 average = base_sum / static_cast(texel_count); + vfloat4 average = partition_averages[partition]; pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); @@ -173,38 +549,37 @@ void compute_avgs_and_dirs_3_comp( vfloat4 texel_datum = vfloat3(data_vr[iwt], data_vg[iwt], data_vb[iwt]); - - texel_datum = (texel_datum - average) * texel_weight; + texel_datum = texel_datum - average; vfloat4 zero = vfloat4::zero(); - vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; sum_xp += select(zero, texel_datum, tdm0); - vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; sum_yp += select(zero, texel_datum, tdm1); - vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero; + vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; sum_zp += select(zero, texel_datum, tdm2); } - float prod_xp = dot3_s(sum_xp, sum_xp); - float prod_yp = dot3_s(sum_yp, sum_yp); - float prod_zp = dot3_s(sum_zp, sum_zp); + sum_xp = sum_xp * texel_weight; + sum_yp = sum_yp * texel_weight; + sum_zp = sum_zp * texel_weight; + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); + vfloat4 prod_zp = dot(sum_zp, sum_zp); vfloat4 best_vector = sum_xp; - float best_sum = prod_xp; + vfloat4 best_sum = prod_xp; - if (prod_yp > best_sum) - { - best_vector = sum_yp; - best_sum = prod_yp; - } + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + best_sum = select(best_sum, prod_yp, mask); - if (prod_zp > best_sum) - { - best_vector = sum_zp; - } + mask = prod_zp > best_sum; + best_vector = select(best_vector, sum_zp, mask); pm[partition].dir = best_vector; } @@ -216,25 +591,22 @@ void compute_avgs_and_dirs_3_comp_rgb( const image_block& blk, partition_metrics 
pm[BLOCK_MAX_PARTITIONS] ) { - float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3; + float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) * (1.0f / 3.0f); unsigned int partition_count = pi.partition_count; promise(partition_count > 0); + // Pre-compute partition_averages + vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; + compute_partition_averages_rgb(pi, blk, partition_averages); + for (unsigned int partition = 0; partition < partition_count; partition++) { const uint8_t *texel_indexes = pi.texels_of_partition[partition]; unsigned int texel_count = pi.partition_texel_count[partition]; promise(texel_count > 0); - vfloat4 base_sum = vfloat4::zero(); - for (unsigned int i = 0; i < texel_count; i++) - { - unsigned int iwt = texel_indexes[i]; - base_sum += blk.texel3(iwt); - } - - vfloat4 average = base_sum / static_cast(texel_count); + vfloat4 average = partition_averages[partition]; pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); @@ -246,38 +618,37 @@ void compute_avgs_and_dirs_3_comp_rgb( unsigned int iwt = texel_indexes[i]; vfloat4 texel_datum = blk.texel3(iwt); - - texel_datum = (texel_datum - average) * texel_weight; + texel_datum = texel_datum - average; vfloat4 zero = vfloat4::zero(); - vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; sum_xp += select(zero, texel_datum, tdm0); - vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; sum_yp += select(zero, texel_datum, tdm1); - vmask4 tdm2 = vfloat4(texel_datum.lane<2>()) > zero; + vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; sum_zp += select(zero, texel_datum, tdm2); } - float prod_xp = dot3_s(sum_xp, sum_xp); - float prod_yp = dot3_s(sum_yp, sum_yp); - float prod_zp = dot3_s(sum_zp, sum_zp); + sum_xp = sum_xp * texel_weight; + sum_yp = sum_yp * texel_weight; + sum_zp = sum_zp * texel_weight; + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = 
dot(sum_yp, sum_yp); + vfloat4 prod_zp = dot(sum_zp, sum_zp); vfloat4 best_vector = sum_xp; - float best_sum = prod_xp; + vfloat4 best_sum = prod_xp; - if (prod_yp > best_sum) - { - best_vector = sum_yp; - best_sum = prod_yp; - } + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + best_sum = select(best_sum, prod_yp, mask); - if (prod_zp > best_sum) - { - best_vector = sum_zp; - } + mask = prod_zp > best_sum; + best_vector = select(best_vector, sum_zp, mask); pm[partition].dir = best_vector; } @@ -292,6 +663,7 @@ void compute_avgs_and_dirs_2_comp( partition_metrics pm[BLOCK_MAX_PARTITIONS] ) { float texel_weight; + vfloat4 average; const float* data_vr = nullptr; const float* data_vg = nullptr; @@ -299,6 +671,7 @@ void compute_avgs_and_dirs_2_comp( if (component1 == 0 && component2 == 1) { texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; + average = blk.data_mean.swz<0, 1>(); data_vr = blk.data_r; data_vg = blk.data_g; @@ -306,6 +679,7 @@ void compute_avgs_and_dirs_2_comp( else if (component1 == 0 && component2 == 2) { texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; + average = blk.data_mean.swz<0, 2>(); data_vr = blk.data_r; data_vg = blk.data_b; @@ -315,6 +689,7 @@ void compute_avgs_and_dirs_2_comp( assert(component1 == 1 && component2 == 2); texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; + average = blk.data_mean.swz<1, 2>(); data_vr = blk.data_g; data_vg = blk.data_b; @@ -329,14 +704,19 @@ void compute_avgs_and_dirs_2_comp( unsigned int texel_count = pt.partition_texel_count[partition]; promise(texel_count > 0); - vfloat4 base_sum = vfloat4::zero(); - for (unsigned int i = 0; i < texel_count; i++) + // Only compute a partition mean if more than one partition + if (partition_count > 1) { - unsigned int iwt = texel_indexes[i]; - base_sum += vfloat2(data_vr[iwt], data_vg[iwt]); + average = vfloat4::zero(); + for (unsigned int i = 0; i < texel_count; i++) + { + unsigned int iwt = 
texel_indexes[i]; + average += vfloat2(data_vr[iwt], data_vg[iwt]); + } + + average = average * (1.0f / static_cast(texel_count)); } - vfloat4 average = base_sum / static_cast(texel_count); pm[partition].avg = average; vfloat4 sum_xp = vfloat4::zero(); @@ -346,27 +726,28 @@ void compute_avgs_and_dirs_2_comp( { unsigned int iwt = texel_indexes[i]; vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]); - texel_datum = (texel_datum - average) * texel_weight; + texel_datum = texel_datum - average; vfloat4 zero = vfloat4::zero(); - vmask4 tdm0 = vfloat4(texel_datum.lane<0>()) > zero; + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; sum_xp += select(zero, texel_datum, tdm0); - vmask4 tdm1 = vfloat4(texel_datum.lane<1>()) > zero; + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; sum_yp += select(zero, texel_datum, tdm1); } - float prod_xp = dot_s(sum_xp, sum_xp); - float prod_yp = dot_s(sum_yp, sum_yp); + sum_xp = sum_xp * texel_weight; + sum_yp = sum_yp * texel_weight; + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); vfloat4 best_vector = sum_xp; - float best_sum = prod_xp; + vfloat4 best_sum = prod_xp; - if (prod_yp > best_sum) - { - best_vector = sum_yp; - } + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); pm[partition].dir = best_vector; } @@ -386,8 +767,8 @@ void compute_error_squared_rgba( unsigned int partition_count = pi.partition_count; promise(partition_count > 0); - uncor_error = 0.0f; - samec_error = 0.0f; + vfloatacc uncor_errorsumv = vfloatacc::zero(); + vfloatacc samec_errorsumv = vfloatacc::zero(); for (unsigned int partition = 0; partition < partition_count; partition++) { @@ -425,11 +806,9 @@ void compute_error_squared_rgba( vfloat uncor_loparamv(1e10f); vfloat uncor_hiparamv(-1e10f); - vfloat4 uncor_errorsumv = vfloat4::zero(); vfloat samec_loparamv(1e10f); vfloat samec_hiparamv(-1e10f); - vfloat4 samec_errorsumv = vfloat4::zero(); vfloat ew_r(blk.channel_weight.lane<0>()); 
vfloat ew_g(blk.channel_weight.lane<1>()); @@ -472,8 +851,7 @@ void compute_error_squared_rgba( + (ew_b * uncor_dist2 * uncor_dist2) + (ew_a * uncor_dist3 * uncor_dist3); - uncor_err = select(vfloat::zero(), uncor_err, mask); - haccumulate(uncor_errorsumv, uncor_err); + haccumulate(uncor_errorsumv, uncor_err, mask); // Process samechroma data vfloat samec_param = (data_r * l_samec_bs0) @@ -494,8 +872,7 @@ void compute_error_squared_rgba( + (ew_b * samec_dist2 * samec_dist2) + (ew_a * samec_dist3 * samec_dist3); - samec_err = select(vfloat::zero(), samec_err, mask); - haccumulate(samec_errorsumv, samec_err); + haccumulate(samec_errorsumv, samec_err, mask); lane_ids += vint(ASTCENC_SIMD_WIDTH); } @@ -506,10 +883,6 @@ void compute_error_squared_rgba( samec_loparam = hmin_s(samec_loparamv); samec_hiparam = hmax_s(samec_hiparamv); - // Resolve the final scalar accumulator sum - haccumulate(uncor_error, uncor_errorsumv); - haccumulate(samec_error, samec_errorsumv); - float uncor_linelen = uncor_hiparam - uncor_loparam; float samec_linelen = samec_hiparam - samec_loparam; @@ -517,6 +890,9 @@ void compute_error_squared_rgba( uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f); samec_lengths[partition] = astc::max(samec_linelen, 1e-7f); } + + uncor_error = hadd_s(uncor_errorsumv); + samec_error = hadd_s(samec_errorsumv); } /* See header for documentation. 
*/ @@ -530,8 +906,8 @@ void compute_error_squared_rgb( unsigned int partition_count = pi.partition_count; promise(partition_count > 0); - uncor_error = 0.0f; - samec_error = 0.0f; + vfloatacc uncor_errorsumv = vfloatacc::zero(); + vfloatacc samec_errorsumv = vfloatacc::zero(); for (unsigned int partition = 0; partition < partition_count; partition++) { @@ -570,11 +946,9 @@ void compute_error_squared_rgb( vfloat uncor_loparamv(1e10f); vfloat uncor_hiparamv(-1e10f); - vfloat4 uncor_errorsumv = vfloat4::zero(); vfloat samec_loparamv(1e10f); vfloat samec_hiparamv(-1e10f); - vfloat4 samec_errorsumv = vfloat4::zero(); vfloat ew_r(blk.channel_weight.lane<0>()); vfloat ew_g(blk.channel_weight.lane<1>()); @@ -611,8 +985,7 @@ void compute_error_squared_rgb( + (ew_g * uncor_dist1 * uncor_dist1) + (ew_b * uncor_dist2 * uncor_dist2); - uncor_err = select(vfloat::zero(), uncor_err, mask); - haccumulate(uncor_errorsumv, uncor_err); + haccumulate(uncor_errorsumv, uncor_err, mask); // Process samechroma data vfloat samec_param = (data_r * l_samec_bs0) @@ -622,7 +995,6 @@ void compute_error_squared_rgb( samec_loparamv = min(samec_param, samec_loparamv); samec_hiparamv = max(samec_param, samec_hiparamv); - vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; @@ -631,8 +1003,7 @@ void compute_error_squared_rgb( + (ew_g * samec_dist1 * samec_dist1) + (ew_b * samec_dist2 * samec_dist2); - samec_err = select(vfloat::zero(), samec_err, mask); - haccumulate(samec_errorsumv, samec_err); + haccumulate(samec_errorsumv, samec_err, mask); lane_ids += vint(ASTCENC_SIMD_WIDTH); } @@ -643,10 +1014,6 @@ void compute_error_squared_rgb( samec_loparam = hmin_s(samec_loparamv); samec_hiparam = hmax_s(samec_hiparamv); - // Resolve the final scalar accumulator sum - haccumulate(uncor_error, uncor_errorsumv); - haccumulate(samec_error, samec_errorsumv); - float uncor_linelen = 
uncor_hiparam - uncor_loparam; float samec_linelen = samec_hiparam - samec_loparam; @@ -654,6 +1021,9 @@ void compute_error_squared_rgb( pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f); pl.samec_line_len = astc::max(samec_linelen, 1e-7f); } + + uncor_error = hadd_s(uncor_errorsumv); + samec_error = hadd_s(samec_errorsumv); } #endif diff --git a/libkram/astc-encoder/astcenc_block_sizes.cpp b/libkram/astc-encoder/astcenc_block_sizes.cpp index 9718cba9..e498da46 100644 --- a/libkram/astc-encoder/astcenc_block_sizes.cpp +++ b/libkram/astc-encoder/astcenc_block_sizes.cpp @@ -130,7 +130,7 @@ static bool decode_block_mode_2d( quant_mode = (base_quant_mode - 2) + 6 * H; is_dual_plane = D != 0; - weight_bits = get_ise_sequence_bitcount(weight_count, (quant_method)quant_mode); + weight_bits = get_ise_sequence_bitcount(weight_count, static_cast(quant_mode)); return (weight_count <= BLOCK_MAX_WEIGHTS && weight_bits >= BLOCK_MIN_WEIGHT_BITS && weight_bits <= BLOCK_MAX_WEIGHT_BITS); @@ -233,7 +233,7 @@ static bool decode_block_mode_3d( quant_mode = (base_quant_mode - 2) + 6 * H; is_dual_plane = D != 0; - weight_bits = get_ise_sequence_bitcount(weight_count, (quant_method)quant_mode); + weight_bits = get_ise_sequence_bitcount(weight_count, static_cast(quant_mode)); return (weight_count <= BLOCK_MAX_WEIGHTS && weight_bits >= BLOCK_MIN_WEIGHT_BITS && weight_bits <= BLOCK_MAX_WEIGHT_BITS); @@ -331,7 +331,7 @@ static void init_decimation_info_2d( for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) { di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j]; - di.texel_weights_float_4t[j][i] = ((float)wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); + di.texel_weights_float_4t[j][i] = static_cast(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j]; } @@ -349,7 +349,7 @@ static void init_decimation_info_2d( for (unsigned int i = 0; i < weights_per_block; i++) { unsigned int texel_count_wt = 
wb.texel_count_of_weight[i]; - di.weight_texel_count[i] = (uint8_t)texel_count_wt; + di.weight_texel_count[i] = static_cast(texel_count_wt); for (unsigned int j = 0; j < texel_count_wt; j++) { @@ -357,7 +357,7 @@ static void init_decimation_info_2d( // Create transposed versions of these for better vectorization di.weight_texel[j][i] = texel; - di.weights_flt[j][i] = (float)wb.texel_weights_of_weight[i][j]; + di.weights_flt[j][i] = static_cast(wb.texel_weights_of_weight[i][j]); // perform a layer of array unrolling. An aspect of this unrolling is that // one of the texel-weight indexes is an identity-mapped index; we will use this @@ -608,7 +608,7 @@ static void init_decimation_info_3d( for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) { di.texel_weights_int_4t[j][i] = wb.weights_of_texel[i][j]; - di.texel_weights_float_4t[j][i] = ((float)wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); + di.texel_weights_float_4t[j][i] = static_cast(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); di.texel_weights_4t[j][i] = wb.grid_weights_of_texel[i][j]; } } @@ -618,7 +618,7 @@ static void init_decimation_info_3d( for (unsigned int i = 0; i < weights_per_block; i++) { unsigned int texel_count_wt = wb.texel_count_of_weight[i]; - di.weight_texel_count[i] = (uint8_t)texel_count_wt; + di.weight_texel_count[i] = static_cast(texel_count_wt); for (unsigned int j = 0; j < texel_count_wt; j++) { @@ -759,31 +759,30 @@ static void assign_kmeans_texels( * @param y_weights The number of weights in the Y dimension. * @param bsd The block size descriptor we are populating. * @param wb The decimation table init scratch working buffers. - * - * @return The new entry's index in the compacted decimation table array. + * @param index The packed array index to populate. 
*/ -static int construct_dt_entry_2d( +static void construct_dt_entry_2d( unsigned int x_texels, unsigned int y_texels, unsigned int x_weights, unsigned int y_weights, block_size_descriptor& bsd, - dt_init_working_buffers& wb + dt_init_working_buffers& wb, + unsigned int index ) { - unsigned int dm_index = bsd.decimation_mode_count; unsigned int weight_count = x_weights * y_weights; assert(weight_count <= BLOCK_MAX_WEIGHTS); bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS; - decimation_info& di = bsd.decimation_tables[dm_index]; + decimation_info& di = bsd.decimation_tables[index]; init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb); int maxprec_1plane = -1; int maxprec_2planes = -1; for (int i = 0; i < 12; i++) { - unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, (quant_method)i); + unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast(i)); if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) { maxprec_1plane = i; @@ -791,7 +790,7 @@ static int construct_dt_entry_2d( if (try_2planes) { - unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, (quant_method)i); + unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast(i)); if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) { maxprec_2planes = i; @@ -801,14 +800,10 @@ static int construct_dt_entry_2d( // At least one of the two should be valid ... 
assert(maxprec_1plane >= 0 || maxprec_2planes >= 0); - bsd.decimation_modes[dm_index].maxprec_1plane = static_cast(maxprec_1plane); - bsd.decimation_modes[dm_index].maxprec_2planes = static_cast(maxprec_2planes); - - // Default to not enabled - we'll populate these based on active block modes - bsd.decimation_modes[dm_index].percentile_hit = false; - - bsd.decimation_mode_count++; - return dm_index; + bsd.decimation_modes[index].maxprec_1plane = static_cast(maxprec_1plane); + bsd.decimation_modes[index].maxprec_2planes = static_cast(maxprec_2planes); + bsd.decimation_modes[index].ref_1_plane = 0; + bsd.decimation_modes[index].ref_2_planes = 0; } /** @@ -838,7 +833,6 @@ static void construct_block_size_descriptor_2d( bsd.ydim = static_cast(y_texels); bsd.zdim = 1; bsd.texel_count = static_cast(x_texels * y_texels); - bsd.decimation_mode_count = 0; for (unsigned int i = 0; i < MAX_DMI; i++) { @@ -848,7 +842,7 @@ static void construct_block_size_descriptor_2d( // Gather all the decimation grids that can be used with the current block #if !defined(ASTCENC_DECOMPRESS_ONLY) const float *percentiles = get_2d_percentile_table(x_texels, y_texels); - float always_threshold = 0.0f; + float always_cutoff = 0.0f; #else // Unused in decompress-only builds (void)can_omit_modes; @@ -856,57 +850,89 @@ static void construct_block_size_descriptor_2d( #endif // Construct the list of block formats referencing the decimation tables - unsigned int packed_idx = 0; - unsigned int always_block_mode_count = 0; - unsigned int always_decimation_mode_count = 0; + unsigned int packed_bm_idx = 0; + unsigned int packed_dm_idx = 0; - // Iterate twice; first time keep the "always" blocks, second time keep the "non-always" blocks. - // This ensures that the always block modes and decimation modes are at the start of the list. 
- for (unsigned int j = 0; j < 2; j ++) + // Trackers + unsigned int bm_counts[4] { 0 }; + unsigned int dm_counts[4] { 0 }; + + // Clear the list to a known-bad value + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) + { + bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; + } + + // Iterate four times to build a usefully ordered list: + // - Pass 0 - keep selected single plane "always" block modes + // - Pass 1 - keep selected single plane "non-always" block modes + // - Pass 2 - keep select dual plane block modes + // - Pass 3 - keep everything else that's legal + unsigned int limit = can_omit_modes ? 3 : 4; + for (unsigned int j = 0; j < limit; j ++) { for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) { + // Skip modes we've already included in a previous pass + if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) + { + continue; + } + + // Decode parameters unsigned int x_weights; unsigned int y_weights; bool is_dual_plane; unsigned int quant_mode; unsigned int weight_bits; + bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits); - #if !defined(ASTCENC_DECOMPRESS_ONLY) - float percentile = percentiles[i]; - bool selected = (percentile <= mode_cutoff) || !can_omit_modes; - - if (j == 0 && percentile > always_threshold) + // Always skip invalid encodings for the current block size + if (!valid || (x_weights > x_texels) || (y_weights > y_texels)) { continue; } - if (j == 1 && percentile <= always_threshold) + // Selectively skip dual plane encodings + if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane)) { continue; } - #else - // Decompressor builds can never discard modes, as we cannot make any - // assumptions about the modes the original compressor used - bool selected = true; + // Always skip encodings we can't physically encode based on + // generic encoding bit availability + if (is_dual_plane) + { + // This is the only check we need as only support 1 partition + if ((109 - 
weight_bits) <= 0) + { + continue; + } + } + else + { + // This is conservative - fewer bits may be available for > 1 partition + if ((111 - weight_bits) <= 0) + { + continue; + } + } - if (j == 1) + // Selectively skip encodings based on percentile + bool percentile_hit = false; + #if !defined(ASTCENC_DECOMPRESS_ONLY) + if (j == 0) { - continue; + percentile_hit = percentiles[i] <= always_cutoff; + } + else + { + percentile_hit = percentiles[i] <= mode_cutoff; } #endif - // ASSUMPTION: No compressor will use more weights in a dimension than - // the block has actual texels, because it wastes bits. Decompression - // of an image which violates this assumption will fail, even though it - // is technically permitted by the specification. - - // Skip modes that are invalid, too large, or not selected by heuristic - bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits); - if (!selected || !valid || (x_weights > x_texels) || (y_weights > y_texels)) + if (j != 3 && !percentile_hit) { - bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; continue; } @@ -914,63 +940,62 @@ static void construct_block_size_descriptor_2d( int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights]; if (decimation_mode < 0) { - decimation_mode = construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb); - decimation_mode_index[y_weights * 16 + x_weights] = decimation_mode; + construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx); + decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx; + decimation_mode = packed_dm_idx; - #if !defined(ASTCENC_DECOMPRESS_ONLY) - if (percentile <= always_threshold) - { - always_decimation_mode_count++; - } - #endif + dm_counts[j]++; + packed_dm_idx++; } - #if !defined(ASTCENC_DECOMPRESS_ONLY) - // Flatten the block mode heuristic into some precomputed flags - if (percentile <= always_threshold) - { - always_block_mode_count++; - 
bsd.block_modes[packed_idx].percentile_hit = true; - bsd.decimation_modes[decimation_mode].percentile_hit = true; - } - else if (percentile <= mode_cutoff) + auto& bm = bsd.block_modes[packed_bm_idx]; + auto& dm = bsd.decimation_modes[decimation_mode]; + + if (is_dual_plane) { - bsd.block_modes[packed_idx].percentile_hit = true; - bsd.decimation_modes[decimation_mode].percentile_hit = true; + dm.ref_2_planes = 1; } else { - bsd.block_modes[packed_idx].percentile_hit = false; + dm.ref_1_plane = 1; } - #endif - bsd.block_modes[packed_idx].decimation_mode = static_cast(decimation_mode); - bsd.block_modes[packed_idx].quant_mode = static_cast(quant_mode); - bsd.block_modes[packed_idx].is_dual_plane = static_cast(is_dual_plane); - bsd.block_modes[packed_idx].weight_bits = static_cast(weight_bits); - bsd.block_modes[packed_idx].mode_index = static_cast(i); - bsd.block_mode_packed_index[i] = static_cast(packed_idx); - packed_idx++; + bm.decimation_mode = static_cast(decimation_mode); + bm.quant_mode = static_cast(quant_mode); + bm.is_dual_plane = static_cast(is_dual_plane); + bm.weight_bits = static_cast(weight_bits); + bm.mode_index = static_cast(i); + + bsd.block_mode_packed_index[i] = static_cast(packed_bm_idx); + + packed_bm_idx++; + bm_counts[j]++; } } - bsd.block_mode_count = packed_idx; - bsd.always_block_mode_count = always_block_mode_count; - bsd.always_decimation_mode_count = always_decimation_mode_count; + bsd.block_mode_count_1plane_always = bm_counts[0]; + bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1]; + bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2]; + bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3]; + + bsd.decimation_mode_count_always = dm_counts[0]; + bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2]; + bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3]; #if 
!defined(ASTCENC_DECOMPRESS_ONLY) - assert(bsd.always_block_mode_count > 0); - assert(bsd.always_decimation_mode_count > 0); + assert(bsd.block_mode_count_1plane_always > 0); + assert(bsd.decimation_mode_count_always > 0); delete[] percentiles; #endif // Ensure the end of the array contains valid data (should never get read) - for (unsigned int i = bsd.decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++) + for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++) { bsd.decimation_modes[i].maxprec_1plane = -1; bsd.decimation_modes[i].maxprec_2planes = -1; - bsd.decimation_modes[i].percentile_hit = false; + bsd.decimation_modes[i].ref_1_plane = 0; + bsd.decimation_modes[i].ref_2_planes = 0; } // Determine the texels to use for kmeans clustering. @@ -1035,13 +1060,13 @@ static void construct_block_size_descriptor_3d( int maxprec_2planes = -1; for (unsigned int i = 0; i < 12; i++) { - unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, (quant_method)i); + unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast(i)); if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) { maxprec_1plane = i; } - unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, (quant_method)i); + unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast(i)); if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) { maxprec_2planes = i; @@ -1055,7 +1080,8 @@ static void construct_block_size_descriptor_3d( bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast(maxprec_1plane); bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast(maxprec_2planes); - bsd.decimation_modes[decimation_mode_count].percentile_hit = false; + bsd.decimation_modes[decimation_mode_count].ref_1_plane = maxprec_1plane == -1 ? 
0 : 1; + bsd.decimation_modes[decimation_mode_count].ref_2_planes = maxprec_2planes == -1 ? 0 : 1; decimation_mode_count++; } } @@ -1066,62 +1092,94 @@ static void construct_block_size_descriptor_3d( { bsd.decimation_modes[i].maxprec_1plane = -1; bsd.decimation_modes[i].maxprec_2planes = -1; - bsd.decimation_modes[i].percentile_hit = false; + bsd.decimation_modes[i].ref_1_plane = 0; + bsd.decimation_modes[i].ref_2_planes = 0; } - bsd.decimation_mode_count = decimation_mode_count; + bsd.decimation_mode_count_always = 0; // Skipped for 3D modes + bsd.decimation_mode_count_selected = decimation_mode_count; + bsd.decimation_mode_count_all = decimation_mode_count; // Construct the list of block formats - unsigned int packed_idx = 0; + // Construct the list of block formats referencing the decimation tables + + // Clear the list to a known-bad value for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) { - unsigned int x_weights; - unsigned int y_weights; - unsigned int z_weights; - bool is_dual_plane; - unsigned int quant_mode; - unsigned int weight_bits; - bool permit_encode = true; - - if (decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits)) + bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; + } + + unsigned int packed_idx = 0; + unsigned int bm_counts[2] { 0 }; + + // Iterate two times to build a usefully ordered list: + // - Pass 0 - keep valid single plane block modes + // - Pass 1 - keep valid dual plane block modes + for (unsigned int j = 0; j < 2; j++) + { + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) { - if (x_weights > x_texels || y_weights > y_texels || z_weights > z_texels) + // Skip modes we've already included in a previous pass + if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) { - permit_encode = false; + continue; } - } - else - { - permit_encode = false; - } - if (!permit_encode) - { - bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; - continue; - } + unsigned int 
x_weights; + unsigned int y_weights; + unsigned int z_weights; + bool is_dual_plane; + unsigned int quant_mode; + unsigned int weight_bits; - int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights]; - bsd.block_modes[packed_idx].decimation_mode = static_cast(decimation_mode); - bsd.block_modes[packed_idx].quant_mode = static_cast(quant_mode); - bsd.block_modes[packed_idx].weight_bits = static_cast(weight_bits); - bsd.block_modes[packed_idx].is_dual_plane = static_cast(is_dual_plane); - bsd.block_modes[packed_idx].mode_index = static_cast(i); + bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits); + // Skip invalid encodings + if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels) + { + continue; + } - // No percentile table, so enable everything all the time ... - bsd.block_modes[packed_idx].percentile_hit = true; - bsd.decimation_modes[decimation_mode].percentile_hit = true; + // Skip encodings in the wrong iteration + if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane)) + { + continue; + } - bsd.block_mode_packed_index[i] = static_cast(packed_idx); + // Always skip encodings we can't physically encode based on bit availability + if (is_dual_plane) + { + // This is the only check we need as only support 1 partition + if ((109 - weight_bits) <= 0) + { + continue; + } + } + else + { + // This is conservative - fewer bits may be available for > 1 partition + if ((111 - weight_bits) <= 0) + { + continue; + } + } - packed_idx++; - } + int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights]; + bsd.block_modes[packed_idx].decimation_mode = static_cast(decimation_mode); + bsd.block_modes[packed_idx].quant_mode = static_cast(quant_mode); + bsd.block_modes[packed_idx].weight_bits = static_cast(weight_bits); + bsd.block_modes[packed_idx].is_dual_plane = static_cast(is_dual_plane); + 
bsd.block_modes[packed_idx].mode_index = static_cast(i); - bsd.block_mode_count = packed_idx; + bsd.block_mode_packed_index[i] = static_cast(packed_idx); + bm_counts[j]++; + packed_idx++; + } + } - // These are never used = the MODE0 fast path is skipped for 3D blocks - bsd.always_block_mode_count = 0; - bsd.always_decimation_mode_count = 0; + bsd.block_mode_count_1plane_always = 0; // Skipped for 3D modes + bsd.block_mode_count_1plane_selected = bm_counts[0]; + bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1]; + bsd.block_mode_count_all = bm_counts[0] + bm_counts[1]; // Determine the texels to use for kmeans clustering. assign_kmeans_texels(bsd); diff --git a/libkram/astc-encoder/astcenc_color_quantize.cpp b/libkram/astc-encoder/astcenc_color_quantize.cpp index f067c4d5..3d700a63 100644 --- a/libkram/astc-encoder/astcenc_color_quantize.cpp +++ b/libkram/astc-encoder/astcenc_color_quantize.cpp @@ -50,12 +50,45 @@ * @return The encoded quantized value. These are not necessarily in the order; the compressor * scrambles the values slightly to make hardware implementation easier. */ -static inline int quant_color( +static inline int quant_color_clamp( quant_method quant_level, int value ) { value = astc::clamp(value, 0, 255); - return color_quant_tables[quant_level][value]; + return color_quant_tables[quant_level - QUANT_6][value]; +} + +/** + * @brief Determine the quantized value given a quantization level. + * + * @param quant_level The quantization level to use. + * @param value The value to convert. This may be outside of the 0-255 range and will be + * clamped before the value is looked up. + * + * @return The encoded quantized value. These are not necessarily in the order; the compressor + * scrambles the values slightly to make hardware implementation easier. 
+ */ +static inline int quant_color( + quant_method quant_level, + int value +) { + return color_quant_tables[quant_level - QUANT_6][value]; +} + +/** + * @brief Determine the unquantized value given a quantization level. + * + * @param quant_level The quantization level to use. + * @param value The value to convert. + * + * @return The encoded quantized value. These are not necessarily in the order; the compressor + * scrambles the values slightly to make hardware implementation easier. + */ +static inline int unquant_color( + quant_method quant_level, + int value +) { + return color_unquant_tables[quant_level - QUANT_6][value]; } /** @@ -90,26 +123,24 @@ static void quantize_rgb( int ri0b, gi0b, bi0b, ri1b, gi1b, bi1b; float rgb0_addon = 0.5f; float rgb1_addon = 0.5f; - int iters = 0; do { - ri0 = quant_color(quant_level, astc::flt2int_rd(r0 + rgb0_addon)); - gi0 = quant_color(quant_level, astc::flt2int_rd(g0 + rgb0_addon)); - bi0 = quant_color(quant_level, astc::flt2int_rd(b0 + rgb0_addon)); - ri1 = quant_color(quant_level, astc::flt2int_rd(r1 + rgb1_addon)); - gi1 = quant_color(quant_level, astc::flt2int_rd(g1 + rgb1_addon)); - bi1 = quant_color(quant_level, astc::flt2int_rd(b1 + rgb1_addon)); - - ri0b = color_unquant_tables[quant_level][ri0]; - gi0b = color_unquant_tables[quant_level][gi0]; - bi0b = color_unquant_tables[quant_level][bi0]; - ri1b = color_unquant_tables[quant_level][ri1]; - gi1b = color_unquant_tables[quant_level][gi1]; - bi1b = color_unquant_tables[quant_level][bi1]; + ri0 = quant_color_clamp(quant_level, astc::flt2int_rd(r0 + rgb0_addon)); + gi0 = quant_color_clamp(quant_level, astc::flt2int_rd(g0 + rgb0_addon)); + bi0 = quant_color_clamp(quant_level, astc::flt2int_rd(b0 + rgb0_addon)); + ri1 = quant_color_clamp(quant_level, astc::flt2int_rd(r1 + rgb1_addon)); + gi1 = quant_color_clamp(quant_level, astc::flt2int_rd(g1 + rgb1_addon)); + bi1 = quant_color_clamp(quant_level, astc::flt2int_rd(b1 + rgb1_addon)); + + ri0b = 
unquant_color(quant_level, ri0); + gi0b = unquant_color(quant_level, gi0); + bi0b = unquant_color(quant_level, bi0); + ri1b = unquant_color(quant_level, ri1); + gi1b = unquant_color(quant_level, gi1); + bi1b = unquant_color(quant_level, bi1); rgb0_addon -= 0.2f; rgb1_addon += 0.2f; - iters++; } while (ri0b + gi0b + bi0b > ri1b + gi1b + bi1b); output[0] = static_cast(ri0); @@ -144,8 +175,8 @@ static void quantize_rgba( float a0 = astc::clamp255f(color0.lane<3>() * scale); float a1 = astc::clamp255f(color1.lane<3>() * scale); - output[6] = color_quant_tables[quant_level][astc::flt2int_rtn(a0)]; - output[7] = color_quant_tables[quant_level][astc::flt2int_rtn(a1)]; + output[6] = quant_color(quant_level, astc::flt2int_rtn(a0)); + output[7] = quant_color(quant_level, astc::flt2int_rtn(a1)); quantize_rgb(color0, color1, output, quant_level); } @@ -191,22 +222,22 @@ static bool try_quantize_rgb_blue_contract( } // Quantize the inverse-blue-contracted color - int ri0 = color_quant_tables[quant_level][astc::flt2int_rtn(r0)]; - int gi0 = color_quant_tables[quant_level][astc::flt2int_rtn(g0)]; - int bi0 = color_quant_tables[quant_level][astc::flt2int_rtn(b0)]; + int ri0 = quant_color(quant_level, astc::flt2int_rtn(r0)); + int gi0 = quant_color(quant_level, astc::flt2int_rtn(g0)); + int bi0 = quant_color(quant_level, astc::flt2int_rtn(b0)); - int ri1 = color_quant_tables[quant_level][astc::flt2int_rtn(r1)]; - int gi1 = color_quant_tables[quant_level][astc::flt2int_rtn(g1)]; - int bi1 = color_quant_tables[quant_level][astc::flt2int_rtn(b1)]; + int ri1 = quant_color(quant_level, astc::flt2int_rtn(r1)); + int gi1 = quant_color(quant_level, astc::flt2int_rtn(g1)); + int bi1 = quant_color(quant_level, astc::flt2int_rtn(b1)); // Then unquantize again - int ru0 = color_unquant_tables[quant_level][ri0]; - int gu0 = color_unquant_tables[quant_level][gi0]; - int bu0 = color_unquant_tables[quant_level][bi0]; + int ru0 = unquant_color(quant_level, ri0); + int gu0 = 
unquant_color(quant_level, gi0); + int bu0 = unquant_color(quant_level, bi0); - int ru1 = color_unquant_tables[quant_level][ri1]; - int gu1 = color_unquant_tables[quant_level][gi1]; - int bu1 = color_unquant_tables[quant_level][bi1]; + int ru1 = unquant_color(quant_level, ri1); + int gu1 = unquant_color(quant_level, gi1); + int bu1 = unquant_color(quant_level, bi1); // If color #1 is not larger than color #0 then blue-contraction cannot be used. Note that // blue-contraction and quantization change this order, which is why we must test aftwards. @@ -248,8 +279,8 @@ static int try_quantize_rgba_blue_contract( float a0 = astc::clamp255f(color0.lane<3>() * scale); float a1 = astc::clamp255f(color1.lane<3>() * scale); - output[6] = color_quant_tables[quant_level][astc::flt2int_rtn(a1)]; - output[7] = color_quant_tables[quant_level][astc::flt2int_rtn(a0)]; + output[6] = quant_color(quant_level, astc::flt2int_rtn(a1)); + output[7] = quant_color(quant_level, astc::flt2int_rtn(a0)); return try_quantize_rgb_blue_contract(color0, color1, output, quant_level); } @@ -299,13 +330,14 @@ static bool try_quantize_rgb_delta( int b0b = b0a & 0xFF; // Quantize then unquantize in order to get a value that we take differences against - int r0be = color_quant_tables[quant_level][r0b]; - int g0be = color_quant_tables[quant_level][g0b]; - int b0be = color_quant_tables[quant_level][b0b]; + int r0be = quant_color(quant_level, r0b); + int g0be = quant_color(quant_level, g0b); + int b0be = quant_color(quant_level, b0b); + + r0b = unquant_color(quant_level, r0be); + g0b = unquant_color(quant_level, g0be); + b0b = unquant_color(quant_level, b0be); - r0b = color_unquant_tables[quant_level][r0be]; - g0b = color_unquant_tables[quant_level][g0be]; - b0b = color_unquant_tables[quant_level][b0be]; r0b |= r0a & 0x100; g0b |= g0a & 0x100; b0b |= b0a & 0x100; @@ -341,13 +373,13 @@ static bool try_quantize_rgb_delta( // Then quantize and unquantize; if this causes either top two bits to flip, then 
encoding fails // since we have then corrupted either the top bit of the base or the sign bit of the offset - int r1de = color_quant_tables[quant_level][r1d]; - int g1de = color_quant_tables[quant_level][g1d]; - int b1de = color_quant_tables[quant_level][b1d]; + int r1de = quant_color(quant_level, r1d); + int g1de = quant_color(quant_level, g1d); + int b1de = quant_color(quant_level, b1d); - int r1du = color_unquant_tables[quant_level][r1de]; - int g1du = color_unquant_tables[quant_level][g1de]; - int b1du = color_unquant_tables[quant_level][b1de]; + int r1du = unquant_color(quant_level, r1de); + int g1du = unquant_color(quant_level, g1de); + int b1du = unquant_color(quant_level, b1de); if (((r1d ^ r1du) | (g1d ^ g1du) | (b1d ^ b1du)) & 0xC0) { @@ -441,13 +473,14 @@ static bool try_quantize_rgb_delta_blue_contract( int b0b = b0a & 0xFF; // Quantize, then unquantize in order to get a value that we take differences against. - int r0be = color_quant_tables[quant_level][r0b]; - int g0be = color_quant_tables[quant_level][g0b]; - int b0be = color_quant_tables[quant_level][b0b]; + int r0be = quant_color(quant_level, r0b); + int g0be = quant_color(quant_level, g0b); + int b0be = quant_color(quant_level, b0b); + + r0b = unquant_color(quant_level, r0be); + g0b = unquant_color(quant_level, g0be); + b0b = unquant_color(quant_level, b0be); - r0b = color_unquant_tables[quant_level][r0be]; - g0b = color_unquant_tables[quant_level][g0be]; - b0b = color_unquant_tables[quant_level][b0be]; r0b |= r0a & 0x100; g0b |= g0a & 0x100; b0b |= b0a & 0x100; @@ -484,13 +517,13 @@ static bool try_quantize_rgb_delta_blue_contract( // Then quantize and unquantize; if this causes any of the top two bits to flip, // then encoding fails, since we have then corrupted either the top bit of the base // or the sign bit of the offset. 
- int r1de = color_quant_tables[quant_level][r1d]; - int g1de = color_quant_tables[quant_level][g1d]; - int b1de = color_quant_tables[quant_level][b1d]; + int r1de = quant_color(quant_level, r1d); + int g1de = quant_color(quant_level, g1d); + int b1de = quant_color(quant_level, b1d); - int r1du = color_unquant_tables[quant_level][r1de]; - int g1du = color_unquant_tables[quant_level][g1de]; - int b1du = color_unquant_tables[quant_level][b1de]; + int r1du = unquant_color(quant_level, r1de); + int g1du = unquant_color(quant_level, g1de); + int b1du = unquant_color(quant_level, b1de); if (((r1d ^ r1du) | (g1d ^ g1du) | (b1d ^ b1du)) & 0xC0) { @@ -573,8 +606,8 @@ static bool try_quantize_alpha_delta( int a0a = astc::flt2int_rtn(a0); a0a <<= 1; int a0b = a0a & 0xFF; - int a0be = color_quant_tables[quant_level][a0b]; - a0b = color_unquant_tables[quant_level][a0be]; + int a0be = quant_color(quant_level, a0b); + a0b = unquant_color(quant_level, a0be); a0b |= a0a & 0x100; int a1d = astc::flt2int_rtn(a1); a1d <<= 1; @@ -588,8 +621,8 @@ static bool try_quantize_alpha_delta( a1d &= 0x7F; a1d |= (a0b & 0x100) >> 1; - int a1de = color_quant_tables[quant_level][a1d]; - int a1du = color_unquant_tables[quant_level][a1de]; + int a1de = quant_color(quant_level, a1d); + int a1du = unquant_color(quant_level, a1de); if ((a1d ^ a1du) & 0xC0) { return false; @@ -650,10 +683,10 @@ static bool try_quantize_luminance_alpha_delta( int l0b = l0a & 0xFF; int a0b = a0a & 0xFF; - int l0be = color_quant_tables[quant_level][l0b]; - int a0be = color_quant_tables[quant_level][a0b]; - l0b = color_unquant_tables[quant_level][l0be]; - a0b = color_unquant_tables[quant_level][a0be]; + int l0be = quant_color(quant_level, l0b); + int a0be = quant_color(quant_level, a0b); + l0b = unquant_color(quant_level, l0be); + a0b = unquant_color(quant_level, a0be); l0b |= l0a & 0x100; a0b |= a0a & 0x100; @@ -679,10 +712,10 @@ static bool try_quantize_luminance_alpha_delta( l1d |= (l0b & 0x100) >> 1; a1d |= (a0b & 0x100) 
>> 1; - int l1de = color_quant_tables[quant_level][l1d]; - int a1de = color_quant_tables[quant_level][a1d]; - int l1du = color_unquant_tables[quant_level][l1de]; - int a1du = color_unquant_tables[quant_level][a1de]; + int l1de = quant_color(quant_level, l1d); + int a1de = quant_color(quant_level, a1d); + int l1du = unquant_color(quant_level, l1de); + int a1du = unquant_color(quant_level, a1de); if ((l1d ^ l1du) & 0xC0) { @@ -800,16 +833,16 @@ static void quantize_rgbs( float g = astc::clamp255f(color.lane<1>() * scale); float b = astc::clamp255f(color.lane<2>() * scale); - int ri = color_quant_tables[quant_level][astc::flt2int_rtn(r)]; - int gi = color_quant_tables[quant_level][astc::flt2int_rtn(g)]; - int bi = color_quant_tables[quant_level][astc::flt2int_rtn(b)]; + int ri = quant_color(quant_level, astc::flt2int_rtn(r)); + int gi = quant_color(quant_level, astc::flt2int_rtn(g)); + int bi = quant_color(quant_level, astc::flt2int_rtn(b)); - int ru = color_unquant_tables[quant_level][ri]; - int gu = color_unquant_tables[quant_level][gi]; - int bu = color_unquant_tables[quant_level][bi]; + int ru = unquant_color(quant_level, ri); + int gu = unquant_color(quant_level, gi); + int bu = unquant_color(quant_level, bi); float oldcolorsum = hadd_rgb_s(color) * scale; - float newcolorsum = (float)(ru + gu + bu); + float newcolorsum = static_cast(ru + gu + bu); float scalea = astc::clamp1f(color.lane<3>() * (oldcolorsum + 1e-10f) / (newcolorsum + 1e-10f)); int scale_idx = astc::flt2int_rtn(scalea * 256.0f); @@ -818,7 +851,7 @@ static void quantize_rgbs( output[0] = static_cast(ri); output[1] = static_cast(gi); output[2] = static_cast(bi); - output[3] = color_quant_tables[quant_level][scale_idx]; + output[3] = quant_color(quant_level, scale_idx); } /** @@ -840,8 +873,8 @@ static void quantize_rgbs_alpha( float a0 = astc::clamp255f(color0.lane<3>() * scale); float a1 = astc::clamp255f(color1.lane<3>() * scale); - output[4] = 
color_quant_tables[quant_level][astc::flt2int_rtn(a0)]; - output[5] = color_quant_tables[quant_level][astc::flt2int_rtn(a1)]; + output[4] = quant_color(quant_level, astc::flt2int_rtn(a0)); + output[5] = quant_color(quant_level, astc::flt2int_rtn(a1)); quantize_rgbs(color, output, quant_level); } @@ -875,8 +908,8 @@ static void quantize_luminance( lum1 = avg; } - output[0] = color_quant_tables[quant_level][astc::flt2int_rtn(lum0)]; - output[1] = color_quant_tables[quant_level][astc::flt2int_rtn(lum1)]; + output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0)); + output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1)); } /** @@ -942,10 +975,10 @@ static void quantize_luminance_alpha( } } - output[0] = color_quant_tables[quant_level][astc::flt2int_rtn(lum0)]; - output[1] = color_quant_tables[quant_level][astc::flt2int_rtn(lum1)]; - output[2] = color_quant_tables[quant_level][astc::flt2int_rtn(a0)]; - output[3] = color_quant_tables[quant_level][astc::flt2int_rtn(a1)]; + output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0)); + output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1)); + output[2] = quant_color(quant_level, astc::flt2int_rtn(a0)); + output[3] = quant_color(quant_level, astc::flt2int_rtn(a1)); } /** @@ -968,8 +1001,8 @@ static inline void quantize_and_unquantize_retain_top_two_bits( do { - quantval = color_quant_tables[quant_level][value]; - uquantval = color_unquant_tables[quant_level][quantval]; + quantval = quant_color(quant_level, value); + uquantval = unquant_color(quant_level, quantval); // Perform looping if the top two bits were modified by quant/unquant perform_loop = (value & 0xC0) != (uquantval & 0xC0); @@ -1012,8 +1045,8 @@ static inline void quantize_and_unquantize_retain_top_four_bits( do { - quantval = color_quant_tables[quant_level][value]; - uquantval = color_unquant_tables[quant_level][quantval]; + quantval = quant_color(quant_level, value); + uquantval = unquant_color(quant_level, quantval); // Perform looping if 
the top four bits were modified by quant/unquant perform_loop = (value & 0xF0) != (uquantval & 0xF0); @@ -1501,8 +1534,8 @@ static void quantize_hdr_rgb( int a_intval = astc::flt2int_rtn(a_base * mode_scale); int a_lowbits = a_intval & 0xFF; - int a_quantval = color_quant_tables[quant_level][a_lowbits]; - int a_uquantval = color_unquant_tables[quant_level][a_quantval]; + int a_quantval = quant_color(quant_level, a_lowbits); + int a_uquantval = unquant_color(quant_level, a_quantval); a_intval = (a_intval & ~0xFF) | a_uquantval; float a_fval = static_cast(a_intval) * mode_rscale; @@ -1726,7 +1759,7 @@ static void quantize_hdr_rgb( for (int i = 0; i < 4; i++) { int idx = astc::flt2int_rtn(vals[i] * 1.0f / 256.0f); - output[i] = color_quant_tables[quant_level][idx]; + output[i] = quant_color(quant_level, idx); } for (int i = 4; i < 6; i++) @@ -1759,8 +1792,8 @@ static void quantize_hdr_rgb_ldr_alpha( float a0 = astc::clamp255f(color0.lane<3>() * scale); float a1 = astc::clamp255f(color1.lane<3>() * scale); - output[6] = color_quant_tables[quant_level][astc::flt2int_rtn(a0)]; - output[7] = color_quant_tables[quant_level][astc::flt2int_rtn(a1)]; + output[6] = quant_color(quant_level, astc::flt2int_rtn(a0)); + output[7] = quant_color(quant_level, astc::flt2int_rtn(a1)); quantize_hdr_rgb(color0, color1, output, quant_level); } @@ -1832,9 +1865,9 @@ static void quantize_hdr_luminance_large_range( v1 = lower_v1; } - // OK; encode. 
- output[0] = color_quant_tables[quant_level][v0]; - output[1] = color_quant_tables[quant_level][v1]; + // OK; encode + output[0] = quant_color(quant_level, v0); + output[1] = quant_color(quant_level, v1); } /** @@ -1885,8 +1918,8 @@ static bool try_quantize_hdr_luminance_small_range( highval = astc::clamp(highval, 0, 2047); v0 = lowval & 0x7F; - v0e = color_quant_tables[quant_level][v0]; - v0d = color_unquant_tables[quant_level][v0e]; + v0e = quant_color(quant_level, v0); + v0d = unquant_color(quant_level, v0e); if (v0d < 0x80) { @@ -1895,8 +1928,8 @@ static bool try_quantize_hdr_luminance_small_range( if (diffval >= 0 && diffval <= 15) { v1 = ((lowval >> 3) & 0xF0) | diffval; - v1e = color_quant_tables[quant_level][v1]; - v1d = color_unquant_tables[quant_level][v1e]; + v1e = quant_color(quant_level, v1); + v1d = unquant_color(quant_level, v1e); if ((v1d & 0xF0) == (v1 & 0xF0)) { output[0] = static_cast(v0e); @@ -1914,8 +1947,8 @@ static bool try_quantize_hdr_luminance_small_range( highval = astc::clamp(highval, 0, 1023); v0 = (lowval & 0x7F) | 0x80; - v0e = color_quant_tables[quant_level][v0]; - v0d = color_unquant_tables[quant_level][v0e]; + v0e = quant_color(quant_level, v0); + v0d = unquant_color(quant_level, v0e); if ((v0d & 0x80) == 0) { return false; @@ -1929,8 +1962,8 @@ static bool try_quantize_hdr_luminance_small_range( } v1 = ((lowval >> 2) & 0xE0) | diffval; - v1e = color_quant_tables[quant_level][v1]; - v1d = color_unquant_tables[quant_level][v1e]; + v1e = quant_color(quant_level, v1); + v1d = unquant_color(quant_level, v1e); if ((v1d & 0xE0) != (v1 & 0xE0)) { return false; @@ -1973,8 +2006,8 @@ static void quantize_hdr_alpha( val1 = (ialpha1 + (128 >> i)) >> (8 - i); v6 = (val0 & 0x7F) | ((i & 1) << 7); - v6e = color_quant_tables[quant_level][v6]; - v6d = color_unquant_tables[quant_level][v6e]; + v6e = quant_color(quant_level, v6); + v6d = unquant_color(quant_level, v6e); if ((v6 ^ v6d) & 0x80) { @@ -1992,8 +2025,8 @@ static void quantize_hdr_alpha( 
} v7 = ((i & 2) << 6) | ((val0 >> 7) << (6 - i)) | (diffval & mask); - v7e = color_quant_tables[quant_level][v7]; - v7d = color_unquant_tables[quant_level][v7e]; + v7e = quant_color(quant_level, v7); + v7d = unquant_color(quant_level, v7e); static const int testbits[3] { 0xE0, 0xF0, 0xF8 }; @@ -2013,8 +2046,8 @@ static void quantize_hdr_alpha( v6 = val0 | 0x80; v7 = val1 | 0x80; - output[0] = color_quant_tables[quant_level][v6]; - output[1] = color_quant_tables[quant_level][v7]; + output[0] = quant_color(quant_level, v6); + output[1] = quant_color(quant_level, v7); return; } @@ -2047,7 +2080,7 @@ uint8_t pack_color_endpoints( uint8_t* output, quant_method quant_level ) { - assert(quant_level < 21); + assert(QUANT_6 <= quant_level && quant_level <= QUANT_256); // We do not support negative colors color0 = max(color0, 0.0f); diff --git a/libkram/astc-encoder/astcenc_color_unquantize.cpp b/libkram/astc-encoder/astcenc_color_unquantize.cpp index a1c2eeb2..bb02d81b 100644 --- a/libkram/astc-encoder/astcenc_color_unquantize.cpp +++ b/libkram/astc-encoder/astcenc_color_unquantize.cpp @@ -38,11 +38,28 @@ static ASTCENC_SIMD_INLINE vint4 unquant_color( quant_method quant_level, vint4 inputq ) { - const uint8_t* unq = color_unquant_tables[quant_level]; + const uint8_t* unq = color_unquant_tables[quant_level - QUANT_6]; return vint4(unq[inputq.lane<0>()], unq[inputq.lane<1>()], unq[inputq.lane<2>()], unq[inputq.lane<3>()]); } +/** + * @brief Determine the quantized value given a quantization level. + * + * @param quant_level The quantization level to use. + * @param value The value to convert. This may be outside of the 0-255 range and will be + * clamped before the value is looked up. + * + * @return The encoded quantized value. These are not necessarily in the order; the compressor + * scrambles the values slightly to make hardware implementation easier. 
+ */ +static inline int unquant_color( + quant_method quant_level, + int value +) { + return color_unquant_tables[quant_level - QUANT_6][value]; +} + /** * @brief Un-blue-contract a color. * @@ -204,8 +221,8 @@ static void rgb_scale_alpha_unpack( ) { // Unquantize color endpoints vint4 input = unquant_color(quant_level, input0q); - uint8_t alpha1 = color_unquant_tables[quant_level][alpha1q]; - uint8_t scale = color_unquant_tables[quant_level][scaleq]; + uint8_t alpha1 = unquant_color(quant_level, alpha1q); + uint8_t scale = unquant_color(quant_level, scaleq); output1 = input; output1.set_lane<3>(alpha1); @@ -233,7 +250,7 @@ static void rgb_scale_unpack( vint4& output1 ) { vint4 input = unquant_color(quant_level, input0q); - int scale = color_unquant_tables[quant_level][scaleq]; + int scale = unquant_color(quant_level, scaleq); output1 = input; output1.set_lane<3>(255); @@ -258,8 +275,8 @@ static void luminance_unpack( vint4& output0, vint4& output1 ) { - int lum0 = color_unquant_tables[quant_level][input[0]]; - int lum1 = color_unquant_tables[quant_level][input[1]]; + int lum0 = unquant_color(quant_level, input[0]); + int lum1 = unquant_color(quant_level, input[1]); output0 = vint4(lum0, lum0, lum0, 255); output1 = vint4(lum1, lum1, lum1, 255); } @@ -280,8 +297,8 @@ static void luminance_delta_unpack( vint4& output0, vint4& output1 ) { - int v0 = color_unquant_tables[quant_level][input[0]]; - int v1 = color_unquant_tables[quant_level][input[1]]; + int v0 = unquant_color(quant_level, input[0]); + int v1 = unquant_color(quant_level, input[1]); int l0 = (v0 >> 2) | (v1 & 0xC0); int l1 = l0 + (v1 & 0x3F); @@ -305,10 +322,10 @@ static void luminance_alpha_unpack( vint4& output0, vint4& output1 ) { - int lum0 = color_unquant_tables[quant_level][input[0]]; - int lum1 = color_unquant_tables[quant_level][input[1]]; - int alpha0 = color_unquant_tables[quant_level][input[2]]; - int alpha1 = color_unquant_tables[quant_level][input[3]]; + int lum0 = unquant_color(quant_level, 
input[0]); + int lum1 = unquant_color(quant_level, input[1]); + int alpha0 = unquant_color(quant_level, input[2]); + int alpha1 = unquant_color(quant_level, input[3]); output0 = vint4(lum0, lum0, lum0, alpha0); output1 = vint4(lum1, lum1, lum1, alpha1); } @@ -327,10 +344,10 @@ static void luminance_alpha_delta_unpack( vint4& output0, vint4& output1 ) { - int lum0 = color_unquant_tables[quant_level][input[0]]; - int lum1 = color_unquant_tables[quant_level][input[1]]; - int alpha0 = color_unquant_tables[quant_level][input[2]]; - int alpha1 = color_unquant_tables[quant_level][input[3]]; + int lum0 = unquant_color(quant_level, input[0]); + int lum1 = unquant_color(quant_level, input[1]); + int alpha0 = unquant_color(quant_level, input[2]); + int alpha1 = unquant_color(quant_level, input[3]); lum0 |= (lum1 & 0x80) << 1; alpha0 |= (alpha1 & 0x80) << 1; @@ -369,10 +386,10 @@ static void hdr_rgbo_unpack( vint4& output0, vint4& output1 ) { - int v0 = color_unquant_tables[quant_level][input[0]]; - int v1 = color_unquant_tables[quant_level][input[1]]; - int v2 = color_unquant_tables[quant_level][input[2]]; - int v3 = color_unquant_tables[quant_level][input[3]]; + int v0 = unquant_color(quant_level, input[0]); + int v1 = unquant_color(quant_level, input[1]); + int v2 = unquant_color(quant_level, input[2]); + int v3 = unquant_color(quant_level, input[3]); int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3); @@ -522,12 +539,12 @@ static void hdr_rgb_unpack( vint4& output1 ) { - int v0 = color_unquant_tables[quant_level][input[0]]; - int v1 = color_unquant_tables[quant_level][input[1]]; - int v2 = color_unquant_tables[quant_level][input[2]]; - int v3 = color_unquant_tables[quant_level][input[3]]; - int v4 = color_unquant_tables[quant_level][input[4]]; - int v5 = color_unquant_tables[quant_level][input[5]]; + int v0 = unquant_color(quant_level, input[0]); + int v1 = unquant_color(quant_level, input[1]); + int v2 = unquant_color(quant_level, 
input[2]); + int v3 = unquant_color(quant_level, input[3]); + int v4 = unquant_color(quant_level, input[4]); + int v5 = unquant_color(quant_level, input[5]); // extract all the fixed-placement bitfields int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2); @@ -691,8 +708,8 @@ static void hdr_rgb_ldr_alpha_unpack( ) { hdr_rgb_unpack(input, quant_level, output0, output1); - int v6 = color_unquant_tables[quant_level][input[6]]; - int v7 = color_unquant_tables[quant_level][input[7]]; + int v6 = unquant_color(quant_level, input[6]); + int v7 = unquant_color(quant_level, input[7]); output0.set_lane<3>(v6); output1.set_lane<3>(v7); } @@ -711,8 +728,8 @@ static void hdr_luminance_small_range_unpack( vint4& output0, vint4& output1 ) { - int v0 = color_unquant_tables[quant_level][input[0]]; - int v1 = color_unquant_tables[quant_level][input[1]]; + int v0 = unquant_color(quant_level, input[0]); + int v1 = unquant_color(quant_level, input[1]); int y0, y1; if (v0 & 0x80) @@ -748,8 +765,8 @@ static void hdr_luminance_large_range_unpack( vint4& output0, vint4& output1 ) { - int v0 = color_unquant_tables[quant_level][input[0]]; - int v1 = color_unquant_tables[quant_level][input[1]]; + int v0 = unquant_color(quant_level, input[0]); + int v1 = unquant_color(quant_level, input[1]); int y0, y1; if (v1 >= v0) @@ -782,8 +799,8 @@ static void hdr_alpha_unpack( int& output1 ) { - int v6 = color_unquant_tables[quant_level][input[0]]; - int v7 = color_unquant_tables[quant_level][input[1]]; + int v6 = unquant_color(quant_level, input[0]); + int v7 = unquant_color(quant_level, input[1]); int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2); v6 &= 0x7F; diff --git a/libkram/astc-encoder/astcenc_compress_symbolic.cpp b/libkram/astc-encoder/astcenc_compress_symbolic.cpp index 93d98e29..c7d3a8c7 100644 --- a/libkram/astc-encoder/astcenc_compress_symbolic.cpp +++ b/libkram/astc-encoder/astcenc_compress_symbolic.cpp @@ -61,20 +61,16 @@ static void merge_endpoints( * 
therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation * is needed less often. * - * @param decode_mode The decode mode (LDR, HDR). - * @param bsd The block size information. - * @param blk The image block color data to compress. - * @param[out] scb The symbolic compressed block output. - * @param[out] dec_weights_quant_pvalue_plane1 The weights for plane 1. - * @param[out] dec_weights_quant_pvalue_plane2 The weights for plane 2, or @c nullptr if 1 plane. + * @param decode_mode The decode mode (LDR, HDR). + * @param bsd The block size information. + * @param blk The image block color data to compress. + * @param[out] scb The symbolic compressed block output. */ static bool realign_weights_undecimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, const image_block& blk, - symbolic_compressed_block& scb, - uint8_t* dec_weights_quant_pvalue_plane1, - uint8_t* dec_weights_quant_pvalue_plane2 + symbolic_compressed_block& scb ) { // Get the partition descriptor unsigned int partition_count = scb.partition_count; @@ -110,7 +106,7 @@ static bool realign_weights_undecimated( endpnt1[pa_idx]); } - uint8_t* dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane1; + uint8_t* dec_weights_quant_pvalue = scb.weights; bool adjustments = false; // For each plane and partition ... 
@@ -162,18 +158,18 @@ static bool realign_weights_undecimated( // Check if the prev or next error is better, and if so use it if ((up_error < current_error) && (up_error < down_error)) { - dec_weights_quant_pvalue[texel] = (uint8_t)((prev_and_next >> 24) & 0xFF); + dec_weights_quant_pvalue[texel] = static_cast((prev_and_next >> 24) & 0xFF); adjustments = true; } else if (down_error < current_error) { - dec_weights_quant_pvalue[texel] = (uint8_t)((prev_and_next >> 16) & 0xFF); + dec_weights_quant_pvalue[texel] = static_cast((prev_and_next >> 16) & 0xFF); adjustments = true; } } // Prepare iteration for plane 2 - dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane2; + dec_weights_quant_pvalue += WEIGHTS_PLANE2_OFFSET; plane_mask = ~plane_mask; } @@ -187,20 +183,16 @@ static bool realign_weights_undecimated( * partition and per plane) and attempt to improve image quality by moving each weight up by one or * down by one quantization step. * - * @param decode_mode The decode mode (LDR, HDR). - * @param bsd The block size information. - * @param blk The image block color data to compress. - * @param[out] scb The symbolic compressed block output. - * @param[out] dec_weights_quant_pvalue_plane1 The weights for plane 1. - * @param[out] dec_weights_quant_pvalue_plane2 The weights for plane 2, or @c nullptr if 1 plane. + * @param decode_mode The decode mode (LDR, HDR). + * @param bsd The block size information. + * @param blk The image block color data to compress. + * @param[out] scb The symbolic compressed block output. 
*/ static bool realign_weights_decimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, const image_block& blk, - symbolic_compressed_block& scb, - uint8_t* dec_weights_quant_pvalue_plane1, - uint8_t* dec_weights_quant_pvalue_plane2 + symbolic_compressed_block& scb ) { // Get the partition descriptor unsigned int partition_count = scb.partition_count; @@ -244,7 +236,7 @@ static bool realign_weights_decimated( uint8_t uq_pl_weights[BLOCK_MAX_WEIGHTS]; float uq_pl_weightsf[BLOCK_MAX_WEIGHTS]; - uint8_t* dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane1; + uint8_t* dec_weights_quant_pvalue = scb.weights; bool adjustments = false; // For each plane and partition ... @@ -280,9 +272,9 @@ static bool realign_weights_decimated( float uqw_next_dif = static_cast(next_wt_uq) - uqwf; float uqw_prev_dif = static_cast(prev_wt_uq) - uqwf; - float current_error = 0.0f; - float up_error = 0.0f; - float down_error = 0.0f; + vfloat4 current_errorv = vfloat4::zero(); + vfloat4 up_errorv = vfloat4::zero(); + vfloat4 down_errorv = vfloat4::zero(); // Interpolate the colors to create the diffs unsigned int texels_to_evaluate = di.weight_texel_count[we_idx]; @@ -318,35 +310,40 @@ static bool realign_weights_decimated( vfloat4 color = color_base + color_offset * plane_weight; vfloat4 orig_color = blk.texel(texel); - vfloat4 error_weight = blk.channel_weight; vfloat4 color_diff = color - orig_color; vfloat4 color_up_diff = color_diff + color_offset * plane_up_weight; vfloat4 color_down_diff = color_diff + color_offset * plane_down_weight; - current_error += dot_s(color_diff * color_diff, error_weight); - up_error += dot_s(color_up_diff * color_up_diff, error_weight); - down_error += dot_s(color_down_diff * color_down_diff, error_weight); + + current_errorv += color_diff * color_diff; + up_errorv += color_up_diff * color_up_diff; + down_errorv += color_down_diff * color_down_diff; } + vfloat4 error_weight = blk.channel_weight; + float current_error = 
hadd_s(current_errorv * error_weight); + float up_error = hadd_s(up_errorv * error_weight); + float down_error = hadd_s(down_errorv * error_weight); + // Check if the prev or next error is better, and if so use it if ((up_error < current_error) && (up_error < down_error)) { uq_pl_weights[we_idx] = static_cast(next_wt_uq); uq_pl_weightsf[we_idx] = static_cast(next_wt_uq); - dec_weights_quant_pvalue[we_idx] = (uint8_t)((prev_and_next >> 24) & 0xFF); + dec_weights_quant_pvalue[we_idx] = static_cast((prev_and_next >> 24) & 0xFF); adjustments = true; } else if (down_error < current_error) { uq_pl_weights[we_idx] = static_cast(prev_wt_uq); uq_pl_weightsf[we_idx] = static_cast(prev_wt_uq); - dec_weights_quant_pvalue[we_idx] = (uint8_t)((prev_and_next >> 16) & 0xFF); + dec_weights_quant_pvalue[we_idx] = static_cast((prev_and_next >> 16) & 0xFF); adjustments = true; } } // Prepare iteration for plane 2 - dec_weights_quant_pvalue = dec_weights_quant_pvalue_plane2; + dec_weights_quant_pvalue += WEIGHTS_PLANE2_OFFSET; plane_mask = ~plane_mask; } @@ -380,7 +377,6 @@ static float compress_symbolic_block_for_partition_1plane( promise(partition_count > 0); promise(config.tune_candidate_limit > 0); promise(config.tune_refinement_limit > 0); - promise(bsd.decimation_mode_count > 0); auto compute_difference = &compute_symbolic_block_difference_1plane; if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM)) @@ -401,13 +397,13 @@ static float compress_symbolic_block_for_partition_1plane( uint8_t *dec_weights_quant_pvalue = tmpbuf.dec_weights_quant_pvalue; // For each decimation mode, compute an ideal set of weights with no quantization - unsigned int max_decimation_modes = only_always ? bsd.always_decimation_mode_count - : bsd.decimation_mode_count; + unsigned int max_decimation_modes = only_always ? 
bsd.decimation_mode_count_always + : bsd.decimation_mode_count_selected; promise(max_decimation_modes > 0); for (unsigned int i = 0; i < max_decimation_modes; i++) { const auto& dm = bsd.get_decimation_mode(i); - if (dm.maxprec_1plane < 0 || !dm.percentile_hit) + if (!dm.ref_1_plane) { continue; } @@ -456,14 +452,15 @@ static float compress_symbolic_block_for_partition_1plane( 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS }; - unsigned int max_block_modes = only_always ? bsd.always_block_mode_count - : bsd.block_mode_count; + unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always + : bsd.block_mode_count_1plane_selected; promise(max_block_modes > 0); for (unsigned int i = 0; i < max_block_modes; ++i) { const block_mode& bm = bsd.block_modes[i]; + assert(!bm.is_dual_plane); int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits; - if (bm.is_dual_plane || !bm.percentile_hit || bitcount <= 0) + if (bitcount <= 0) { qwt_errors[i] = 1e38f; continue; @@ -503,8 +500,8 @@ static float compress_symbolic_block_for_partition_1plane( quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; unsigned int candidate_count = compute_ideal_endpoint_formats( - bsd, pi, blk, ei.ep, qwt_bitcounts, qwt_errors, - config.tune_candidate_limit, max_block_modes, + pi, blk, ei.ep, qwt_bitcounts, qwt_errors, + config.tune_candidate_limit, 0, max_block_modes, partition_format_specifiers, block_mode_index, color_quant_level, color_quant_level_mod, tmpbuf); @@ -517,7 +514,7 @@ static float compress_symbolic_block_for_partition_1plane( TRACE_NODE(node0, "candidate"); const int bm_packed_index = block_mode_index[i]; - assert(bm_packed_index >= 0 && bm_packed_index < (int)bsd.block_mode_count); + assert(bm_packed_index >= 0 && bm_packed_index < static_cast(bsd.block_mode_count_1plane_selected)); const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; int decimation_mode = 
qw_bm.decimation_mode; @@ -560,7 +557,7 @@ static float compress_symbolic_block_for_partition_1plane( rgbo_colors[j], partition_format_specifiers[i][j], workscb.color_values[j], - (quant_method)color_quant_level[i]); + color_quant_level[i]); } // If all the color endpoint modes are the same, we get a few more bits to store colors; @@ -584,7 +581,7 @@ static float compress_symbolic_block_for_partition_1plane( rgbo_colors[j], partition_format_specifiers[i][j], colorvals[j], - (quant_method)color_quant_level_mod[i]); + color_quant_level_mod[i]); } if (color_formats_mod[0] == color_formats_mod[1] @@ -612,11 +609,6 @@ static float compress_symbolic_block_for_partition_1plane( workscb.block_mode = qw_bm.mode_index; workscb.block_type = SYM_BTYPE_NONCONST; - if (workscb.quant_mode < QUANT_6) - { - workscb.block_type = SYM_BTYPE_ERROR; - } - // Pre-realign test if (l == 0) { @@ -660,14 +652,12 @@ static float compress_symbolic_block_for_partition_1plane( if (di.weight_count != bsd.texel_count) { adjustments = realign_weights_decimated( - config.profile, bsd, blk, workscb, - workscb.weights, nullptr); + config.profile, bsd, blk, workscb); } else { adjustments = realign_weights_undecimated( - config.profile, bsd, blk, workscb, - workscb.weights, nullptr); + config.profile, bsd, blk, workscb); } // Post-realign test @@ -737,7 +727,7 @@ static float compress_symbolic_block_for_partition_2planes( ) { promise(config.tune_candidate_limit > 0); promise(config.tune_refinement_limit > 0); - promise(bsd.decimation_mode_count > 0); + promise(bsd.decimation_mode_count_selected > 0); // Compute ideal weights and endpoint colors, with no quantization or decimation endpoints_and_weights& ei1 = tmpbuf.ei1; @@ -752,10 +742,10 @@ static float compress_symbolic_block_for_partition_2planes( uint8_t *dec_weights_quant_pvalue = tmpbuf.dec_weights_quant_pvalue; // For each decimation mode, compute an ideal set of weights with no quantization - for (unsigned int i = 0; i < 
bsd.decimation_mode_count; i++) + for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) { const auto& dm = bsd.get_decimation_mode(i); - if (dm.maxprec_2planes < 0 || !dm.percentile_hit) + if (!dm.ref_2_planes) { continue; } @@ -817,17 +807,15 @@ static float compress_symbolic_block_for_partition_2planes( int* qwt_bitcounts = tmpbuf.qwt_bitcounts; float* qwt_errors = tmpbuf.qwt_errors; - for (unsigned int i = 0; i < bsd.block_mode_count; ++i) + unsigned int start_2plane = bsd.block_mode_count_1plane_selected; + unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected; + + for (unsigned int i = start_2plane; i < end_2plane; i++) { const block_mode& bm = bsd.block_modes[i]; - int bitcount = 109 - bm.weight_bits; - if (!bm.is_dual_plane || !bm.percentile_hit || bitcount <= 0) - { - qwt_errors[i] = 1e38f; - continue; - } + assert(bm.is_dual_plane); - qwt_bitcounts[i] = bitcount; + qwt_bitcounts[i] = 109 - bm.weight_bits; if (weight_high_value1[i] > 1.02f * min_wt_cutoff1) { @@ -882,8 +870,9 @@ static float compress_symbolic_block_for_partition_2planes( const auto& pi = bsd.get_partition_info(1, 0); unsigned int candidate_count = compute_ideal_endpoint_formats( - bsd, pi, blk, epm, qwt_bitcounts, qwt_errors, - config.tune_candidate_limit, bsd.block_mode_count, + pi, blk, epm, qwt_bitcounts, qwt_errors, + config.tune_candidate_limit, + bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected, partition_format_specifiers, block_mode_index, color_quant_level, color_quant_level_mod, tmpbuf); @@ -896,7 +885,8 @@ static float compress_symbolic_block_for_partition_2planes( TRACE_NODE(node0, "candidate"); const int bm_packed_index = block_mode_index[i]; - assert(bm_packed_index >= 0 && bm_packed_index < (int)bsd.block_mode_count); + assert(bm_packed_index >= static_cast(bsd.block_mode_count_1plane_selected) && + bm_packed_index < static_cast(bsd.block_mode_count_1plane_2plane_selected)); const block_mode& qw_bm = 
bsd.block_modes[bm_packed_index]; int decimation_mode = qw_bm.decimation_mode; @@ -940,7 +930,7 @@ static float compress_symbolic_block_for_partition_2planes( rgbs_color, rgbo_color, partition_format_specifiers[i][0], workscb.color_values[0], - (quant_method)color_quant_level[i]); + color_quant_level[i]); // Store header fields workscb.partition_count = 1; @@ -951,11 +941,6 @@ static float compress_symbolic_block_for_partition_2planes( workscb.plane2_component = static_cast(plane2_component); workscb.block_type = SYM_BTYPE_NONCONST; - if (workscb.quant_mode < 4) - { - workscb.block_type = SYM_BTYPE_ERROR; - } - // Pre-realign test if (l == 0) { @@ -1000,14 +985,12 @@ static float compress_symbolic_block_for_partition_2planes( if (di.weight_count != bsd.texel_count) { adjustments = realign_weights_decimated( - config.profile, bsd, blk, workscb, - workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET); + config.profile, bsd, blk, workscb); } else { adjustments = realign_weights_undecimated( - config.profile, bsd, blk, workscb, - workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET); + config.profile, bsd, blk, workscb); } // Post-realign test diff --git a/libkram/astc-encoder/astcenc_compute_variance.cpp b/libkram/astc-encoder/astcenc_compute_variance.cpp index 41757fc5..02281a19 100644 --- a/libkram/astc-encoder/astcenc_compute_variance.cpp +++ b/libkram/astc-encoder/astcenc_compute_variance.cpp @@ -63,7 +63,7 @@ static void brent_kung_prefix_sum( size_t iters = items >> log2_stride; vfloat4 *da = d + (start * stride); - ptrdiff_t ofs = -(ptrdiff_t)(step * stride); + ptrdiff_t ofs = -static_cast(step * stride); size_t ofs_stride = stride << log2_stride; while (iters) @@ -87,7 +87,7 @@ static void brent_kung_prefix_sum( size_t iters = (items - step) >> log2_stride; vfloat4 *da = d + (start * stride); - ptrdiff_t ofs = -(ptrdiff_t)(step * stride); + ptrdiff_t ofs = -static_cast(step * stride); size_t ofs_stride = stride << log2_stride; while (iters) @@ -169,18 
+169,18 @@ static void compute_pixel_region_variance( for (int z = zd_start; z < padsize_z; z++) { int z_src = (z - zd_start) + offset_z - kernel_radius_z; - z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1)); + z_src = astc::clamp(z_src, 0, static_cast(img->dim_z - 1)); uint8_t* data8 = static_cast(img->data[z_src]); for (int y = 1; y < padsize_y; y++) { int y_src = (y - 1) + offset_y - kernel_radius_xy; - y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1)); + y_src = astc::clamp(y_src, 0, static_cast(img->dim_y - 1)); for (int x = 1; x < padsize_x; x++) { int x_src = (x - 1) + offset_x - kernel_radius_xy; - x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1)); + x_src = astc::clamp(x_src, 0, static_cast(img->dim_x - 1)); data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )]; data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; @@ -213,18 +213,18 @@ static void compute_pixel_region_variance( for (int z = zd_start; z < padsize_z; z++) { int z_src = (z - zd_start) + offset_z - kernel_radius_z; - z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1)); + z_src = astc::clamp(z_src, 0, static_cast(img->dim_z - 1)); uint16_t* data16 = static_cast(img->data[z_src]); for (int y = 1; y < padsize_y; y++) { int y_src = (y - 1) + offset_y - kernel_radius_xy; - y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1)); + y_src = astc::clamp(y_src, 0, static_cast(img->dim_y - 1)); for (int x = 1; x < padsize_x; x++) { int x_src = (x - 1) + offset_x - kernel_radius_xy; - x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1)); + x_src = astc::clamp(x_src, 0, static_cast(img->dim_x - 1)); data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )]; data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; @@ -252,18 +252,18 @@ static void compute_pixel_region_variance( for (int z = zd_start; z < padsize_z; z++) { int z_src = (z - zd_start) + offset_z - kernel_radius_z; - z_src = astc::clamp(z_src, 0, (int)(img->dim_z - 1)); + z_src = astc::clamp(z_src, 0, 
static_cast(img->dim_z - 1)); float* data32 = static_cast(img->data[z_src]); for (int y = 1; y < padsize_y; y++) { int y_src = (y - 1) + offset_y - kernel_radius_xy; - y_src = astc::clamp(y_src, 0, (int)(img->dim_y - 1)); + y_src = astc::clamp(y_src, 0, static_cast(img->dim_y - 1)); for (int x = 1; x < padsize_x; x++) { int x_src = (x - 1) + offset_x - kernel_radius_xy; - x_src = astc::clamp(x_src, 0, (int)(img->dim_x - 1)); + x_src = astc::clamp(x_src, 0, static_cast(img->dim_x - 1)); data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )]; data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; @@ -345,18 +345,17 @@ static void compute_pixel_region_variance( } } - int alpha_kdim = 2 * alpha_kernel_radius + 1; - // Compute a few constants used in the variance-calculation. + float alpha_kdim = static_cast(2 * alpha_kernel_radius + 1); float alpha_rsamples; if (have_z) { - alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim * alpha_kdim); + alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim); } else { - alpha_rsamples = 1.0f / (float)(alpha_kdim * alpha_kdim); + alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim); } // Use the summed-area tables to compute variance for each neighborhood diff --git a/libkram/astc-encoder/astcenc_decompress_symbolic.cpp b/libkram/astc-encoder/astcenc_decompress_symbolic.cpp index 4fde3d28..6d50d512 100644 --- a/libkram/astc-encoder/astcenc_decompress_symbolic.cpp +++ b/libkram/astc-encoder/astcenc_decompress_symbolic.cpp @@ -351,7 +351,8 @@ float compute_symbolic_block_difference_2plane( vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component); - float summa = 0.0f; + vfloat4 summa = vfloat4::zero(); + // Decode the color endpoints for this partition vint4 ep0; vint4 ep1; @@ -409,11 +410,10 @@ float compute_symbolic_block_difference_2plane( error = min(abs(error), 1e15f); error = error * error; - float metric = dot_s(error, blk.channel_weight); - summa += astc::min(metric, ERROR_CALC_DEFAULT); + summa 
+= min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); } - return summa; + return summa.lane<0>(); } /* See header for documentation. */ @@ -445,7 +445,7 @@ float compute_symbolic_block_difference_1plane( int plane1_weights[BLOCK_MAX_TEXELS]; unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr); - float summa = 0.0f; + vfloat4 summa = vfloat4::zero(); for (unsigned int i = 0; i < partition_count; i++) { // Decode the color endpoints for this partition @@ -506,12 +506,11 @@ float compute_symbolic_block_difference_1plane( error = min(abs(error), 1e15f); error = error * error; - float metric = dot_s(error, blk.channel_weight); - summa += astc::min(metric, ERROR_CALC_DEFAULT); + summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); } } - return summa; + return summa.lane<0>(); } /* See header for documentation. */ @@ -560,7 +559,7 @@ float compute_symbolic_block_difference_1plane_1partition( } // Unpack and compute error for each texel in the partition - vfloat4 summav = vfloat4::zero(); + vfloatacc summav = vfloatacc::zero(); vint lane_id = vint::lane_id(); vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 
257 : 1); @@ -618,9 +617,7 @@ float compute_symbolic_block_difference_1plane_1partition( // Mask off bad lanes vmask mask = lane_id < vint(texel_count); lane_id += vint(ASTCENC_SIMD_WIDTH); - metric = select(vfloat::zero(), metric, mask); - - haccumulate(summav, metric); + haccumulate(summav, metric, mask); } return hadd_s(summav); diff --git a/libkram/astc-encoder/astcenc_entry.cpp b/libkram/astc-encoder/astcenc_entry.cpp index 02597cf0..3c21c55c 100644 --- a/libkram/astc-encoder/astcenc_entry.cpp +++ b/libkram/astc-encoder/astcenc_entry.cpp @@ -62,7 +62,7 @@ struct astcenc_preset_config static const std::array preset_configs_high {{ { ASTCENC_PRE_FASTEST, - 2, 8, 40, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25 + 2, 8, 42, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25 }, { ASTCENC_PRE_FAST, 3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20 @@ -261,7 +261,7 @@ static astcenc_error validate_flags( ) { // Flags field must not contain any unknown flag bits unsigned int exMask = ~ASTCENC_ALL_FLAGS; - if (astc::popcount(flags & exMask) != 0) + if (popcount(flags & exMask) != 0) { return ASTCENC_ERR_BAD_FLAGS; } @@ -270,7 +270,7 @@ static astcenc_error validate_flags( exMask = ASTCENC_FLG_MAP_MASK | ASTCENC_FLG_MAP_NORMAL | ASTCENC_FLG_MAP_RGBM; - if (astc::popcount(flags & exMask) > 1) + if (popcount(flags & exMask) > 1) { return ASTCENC_ERR_BAD_FLAGS; } @@ -423,7 +423,7 @@ static astcenc_error validate_config( config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); - config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, (unsigned int)BLOCK_MAX_PARTITIONINGS); + config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u); config.tune_refinement_limit = 
astc::max(config.tune_refinement_limit, 1u); config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES); @@ -557,9 +557,9 @@ astcenc_error astcenc_config_init( #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b)) #define LERPI(param) astc::flt2int_rtn(\ - (((float)node_a.param) * wt_node_a) + \ - (((float)node_b.param) * wt_node_b)) - #define LERPUI(param) (unsigned int)LERPI(param) + (static_cast(node_a.param) * wt_node_a) + \ + (static_cast(node_b.param) * wt_node_b)) + #define LERPUI(param) static_cast(LERPI(param)) config.tune_partition_count_limit = LERPI(tune_partition_count_limit); config.tune_partition_index_limit = LERPI(tune_partition_index_limit); @@ -832,9 +832,9 @@ static void compress_image( // Populate the block channel weights blk.channel_weight = vfloat4(ctx.config.cw_r_weight, - ctx.config.cw_g_weight, - ctx.config.cw_b_weight, - ctx.config.cw_a_weight); + ctx.config.cw_g_weight, + ctx.config.cw_b_weight, + ctx.config.cw_a_weight); // Use preallocated scratch buffer auto& temp_buffers = ctx.working_buffers[thread_index]; @@ -842,6 +842,23 @@ static void compress_image( // Only the first thread actually runs the initializer ctx.manage_compress.init(block_count); + + // Determine if we can use an optimized load function + bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || + (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A); + + bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) || + (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A); + + bool use_fast_load = !needs_swz && !needs_hdr && + block_z == 1 && image.data_type == ASTCENC_TYPE_U8; + + auto load_func = fetch_image_block; + if (use_fast_load) + { + load_func = fetch_image_block_fast_ldr; + } + // All threads run this processing loop until there is no work remaining while (true) { @@ -877,7 +894,7 @@ static void compress_image( int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1); - 
float footprint = (float)(x_footprint * y_footprint); + float footprint = static_cast(x_footprint * y_footprint); float threshold = 0.9f / (255.0f * footprint); // Do we have any alpha values? @@ -900,7 +917,7 @@ static void compress_image( // Fetch the full block for compression if (use_full_block) { - fetch_image_block(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle); + load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle); } // Apply alpha scale RDO - substitute constant color block else @@ -1116,7 +1133,8 @@ astcenc_error astcenc_decompress_image( unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; const uint8_t* bp = data + offset; - physical_compressed_block pcb = *(const physical_compressed_block*)bp; + + const physical_compressed_block& pcb = *reinterpret_cast(bp); symbolic_compressed_block scb; physical_to_symbolic(*ctx->bsd, pcb, scb); @@ -1156,7 +1174,7 @@ astcenc_error astcenc_get_block_info( return ASTCENC_ERR_BAD_CONTEXT; #else // Decode the compressed data into a symbolic form - physical_compressed_block pcb = *(const physical_compressed_block*)data; + const physical_compressed_block&pcb = *reinterpret_cast(data); symbolic_compressed_block scb; physical_to_symbolic(*ctx->bsd, pcb, scb); @@ -1245,10 +1263,10 @@ astcenc_error astcenc_get_block_info( unpack_weights(bsd, scb, di, bm.is_dual_plane, bm.get_weight_quant_mode(), weight_plane1, weight_plane2); for (unsigned int i = 0; i < bsd.texel_count; i++) { - info->weight_values_plane1[i] = (float)weight_plane1[i] * (1.0f / WEIGHTS_TEXEL_SUM); + info->weight_values_plane1[i] = static_cast(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM); if (info->is_dual_plane_block) { - info->weight_values_plane2[i] = (float)weight_plane2[i] * (1.0f / WEIGHTS_TEXEL_SUM); + info->weight_values_plane2[i] = static_cast(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM); } } diff --git a/libkram/astc-encoder/astcenc_find_best_partitioning.cpp 
b/libkram/astc-encoder/astcenc_find_best_partitioning.cpp index 6a4eff1b..aa9a8ab8 100644 --- a/libkram/astc-encoder/astcenc_find_best_partitioning.cpp +++ b/libkram/astc-encoder/astcenc_find_best_partitioning.cpp @@ -253,8 +253,8 @@ static inline unsigned int partition_mismatch2( const uint64_t a[2], const uint64_t b[2] ) { - int v1 = astc::popcount(a[0] ^ b[0]) + astc::popcount(a[1] ^ b[1]); - int v2 = astc::popcount(a[0] ^ b[1]) + astc::popcount(a[1] ^ b[0]); + int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); + int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); return astc::min(v1, v2); } @@ -270,17 +270,17 @@ static inline unsigned int partition_mismatch3( const uint64_t a[3], const uint64_t b[3] ) { - int p00 = astc::popcount(a[0] ^ b[0]); - int p01 = astc::popcount(a[0] ^ b[1]); - int p02 = astc::popcount(a[0] ^ b[2]); + int p00 = popcount(a[0] ^ b[0]); + int p01 = popcount(a[0] ^ b[1]); + int p02 = popcount(a[0] ^ b[2]); - int p10 = astc::popcount(a[1] ^ b[0]); - int p11 = astc::popcount(a[1] ^ b[1]); - int p12 = astc::popcount(a[1] ^ b[2]); + int p10 = popcount(a[1] ^ b[0]); + int p11 = popcount(a[1] ^ b[1]); + int p12 = popcount(a[1] ^ b[2]); - int p20 = astc::popcount(a[2] ^ b[0]); - int p21 = astc::popcount(a[2] ^ b[1]); - int p22 = astc::popcount(a[2] ^ b[2]); + int p20 = popcount(a[2] ^ b[0]); + int p21 = popcount(a[2] ^ b[1]); + int p22 = popcount(a[2] ^ b[2]); int s0 = p11 + p22; int s1 = p12 + p21; @@ -309,25 +309,25 @@ static inline unsigned int partition_mismatch4( const uint64_t a[4], const uint64_t b[4] ) { - int p00 = astc::popcount(a[0] ^ b[0]); - int p01 = astc::popcount(a[0] ^ b[1]); - int p02 = astc::popcount(a[0] ^ b[2]); - int p03 = astc::popcount(a[0] ^ b[3]); - - int p10 = astc::popcount(a[1] ^ b[0]); - int p11 = astc::popcount(a[1] ^ b[1]); - int p12 = astc::popcount(a[1] ^ b[2]); - int p13 = astc::popcount(a[1] ^ b[3]); - - int p20 = astc::popcount(a[2] ^ b[0]); - int p21 = astc::popcount(a[2] ^ b[1]); - int p22 = 
astc::popcount(a[2] ^ b[2]); - int p23 = astc::popcount(a[2] ^ b[3]); - - int p30 = astc::popcount(a[3] ^ b[0]); - int p31 = astc::popcount(a[3] ^ b[1]); - int p32 = astc::popcount(a[3] ^ b[2]); - int p33 = astc::popcount(a[3] ^ b[3]); + int p00 = popcount(a[0] ^ b[0]); + int p01 = popcount(a[0] ^ b[1]); + int p02 = popcount(a[0] ^ b[2]); + int p03 = popcount(a[0] ^ b[3]); + + int p10 = popcount(a[1] ^ b[0]); + int p11 = popcount(a[1] ^ b[1]); + int p12 = popcount(a[1] ^ b[2]); + int p13 = popcount(a[1] ^ b[3]); + + int p20 = popcount(a[2] ^ b[0]); + int p21 = popcount(a[2] ^ b[1]); + int p22 = popcount(a[2] ^ b[2]); + int p23 = popcount(a[2] ^ b[3]); + + int p30 = popcount(a[3] ^ b[0]); + int p31 = popcount(a[3] ^ b[1]); + int p32 = popcount(a[3] ^ b[2]); + int p33 = popcount(a[3] ^ b[3]); int mx23 = astc::min(p22 + p33, p23 + p32); int mx13 = astc::min(p21 + p33, p23 + p31); @@ -360,7 +360,7 @@ static void count_partition_mismatch_bits( const uint64_t bitmaps[BLOCK_MAX_PARTITIONS], unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS] ) { - unsigned int active_count = bsd.partitioning_count[partition_count - 1]; + unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1]; if (partition_count == 2) { @@ -394,8 +394,10 @@ static void count_partition_mismatch_bits( * @param partitioning_count The number of packed partitionings. * @param mismatch_count Partitioning mismatch counts, in index order. * @param[out] partition_ordering Partition index values, in mismatch order. + * + * @return The number of active partitions in this selection. 
*/ -static void get_partition_ordering_by_mismatch_bits( +static unsigned int get_partition_ordering_by_mismatch_bits( unsigned int partitioning_count, const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS], unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS] @@ -408,6 +410,8 @@ static void get_partition_ordering_by_mismatch_bits( mscount[mismatch_count[i]]++; } + unsigned int active_count = partitioning_count - mscount[255]; + // Create a running sum from the histogram array // Cells store previous values only; i.e. exclude self after sum unsigned int summa = 0; @@ -425,6 +429,8 @@ static void get_partition_ordering_by_mismatch_bits( unsigned int idx = mscount[mismatch_count[i]]++; partition_ordering[idx] = i; } + + return active_count; } /** @@ -434,8 +440,10 @@ static void get_partition_ordering_by_mismatch_bits( * @param blk The image block color data to compress. * @param partition_count The desired number of partitions in the block. * @param[out] partition_ordering The list of recommended partition indices, in priority order. + * + * @return The number of active partitionings in this selection. */ -static void compute_kmeans_partition_ordering( +static unsigned int compute_kmeans_partition_ordering( const block_size_descriptor& bsd, const image_block& blk, unsigned int partition_count, @@ -474,8 +482,9 @@ static void compute_kmeans_partition_ordering( count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts); // Sort the partitions based on the number of mismatched bits - get_partition_ordering_by_mismatch_bits(bsd.partitioning_count[partition_count - 1], - mismatch_counts, partition_ordering); + return get_partition_ordering_by_mismatch_bits( + bsd.partitioning_count_selected[partition_count - 1], + mismatch_counts, partition_ordering); } /* See header for documentation. 
*/ @@ -509,9 +518,8 @@ void find_best_partition_candidates( weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; - compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); - partition_search_limit = astc::min(partition_search_limit, - bsd.partitioning_count[partition_count - 1]); + unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); + partition_search_limit = astc::min(partition_search_limit, sequence_len); bool uses_alpha = !blk.is_constant_channel(3); @@ -531,16 +539,6 @@ void find_best_partition_candidates( unsigned int partition = partition_sequence[i]; const auto& pi = bsd.get_raw_partition_info(partition_count, partition); - // TODO: This escape shouldn't really be needed. We should return - // the number of blocks which have usable (!= 255) mismatch count - // from compute_kmeans_partition_ordering and use that as the upper - // loop limit. - unsigned int bk_partition_count = pi.partition_count; - if (bk_partition_count < partition_count) - { - break; - } - // Compute weighting to give to each component in each partition partition_metrics pms[BLOCK_MAX_PARTITIONS]; @@ -634,16 +632,6 @@ void find_best_partition_candidates( unsigned int partition = partition_sequence[i]; const auto& pi = bsd.get_raw_partition_info(partition_count, partition); - // TODO: This escape shouldn't really be needed. We should return - // the number of blocks which have usable (!= 255) mismatch count - // from compute_kmeans_partition_ordering and use that as the upper - // loop limit. 
- unsigned int bk_partition_count = pi.partition_count; - if (bk_partition_count < partition_count) - { - break; - } - // Compute weighting to give to each component in each partition partition_metrics pms[BLOCK_MAX_PARTITIONS]; compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); @@ -723,7 +711,7 @@ void find_best_partition_candidates( } } - // Same parition is best for both, so use this first unconditionally + // Same partition is best for both, so use this first unconditionally if (uncor_best_partition == samec_best_partitions[0]) { best_partitions[0] = samec_best_partitions[0]; diff --git a/libkram/astc-encoder/astcenc_ideal_endpoints_and_weights.cpp b/libkram/astc-encoder/astcenc_ideal_endpoints_and_weights.cpp index 2b5faa1c..9df44176 100644 --- a/libkram/astc-encoder/astcenc_ideal_endpoints_and_weights.cpp +++ b/libkram/astc-encoder/astcenc_ideal_endpoints_and_weights.cpp @@ -302,21 +302,21 @@ static void compute_ideal_colors_and_weights_3_comp( const float* data_vb = nullptr; if (omitted_component == 0) { - error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); data_vr = blk.data_g; data_vg = blk.data_b; data_vb = blk.data_a; } else if (omitted_component == 1) { - error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()) / 3.0f; + error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); data_vr = blk.data_r; data_vg = blk.data_b; data_vb = blk.data_a; } else if (omitted_component == 2) { - error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()) / 3.0f; + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); data_vr = blk.data_r; data_vg = blk.data_g; data_vb = blk.data_a; @@ -325,13 +325,22 @@ static void compute_ideal_colors_and_weights_3_comp( { assert(omitted_component == 3); - error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) / 3.0f; + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); data_vr = blk.data_r; data_vg = blk.data_g; data_vb = blk.data_b; } - 
compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); + error_weight = error_weight * (1.0f / 3.0f); + + if (omitted_component == 3) + { + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + } + else + { + compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); + } bool is_constant_wes { true }; float partition0_len_sq { 0.0f }; @@ -611,7 +620,7 @@ float compute_error_of_weight_set_1plane( const decimation_info& di, const float* dec_weight_quant_uvalue ) { - vfloat4 error_summav = vfloat4::zero(); + vfloatacc error_summav = vfloatacc::zero(); float error_summa = 0.0f; unsigned int texel_count = di.texel_count; @@ -666,9 +675,7 @@ float compute_error_of_weight_set_1plane( } // Resolve the final scalar accumulator sum - haccumulate(error_summa, error_summav); - - return error_summa; + return error_summa = hadd_s(error_summav); } /* See header for documentation. */ @@ -679,8 +686,7 @@ float compute_error_of_weight_set_2planes( const float* dec_weight_quant_uvalue_plane1, const float* dec_weight_quant_uvalue_plane2 ) { - vfloat4 error_summav = vfloat4::zero(); - float error_summa = 0.0f; + vfloatacc error_summav = vfloatacc::zero(); unsigned int texel_count = di.texel_count; // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized @@ -761,9 +767,7 @@ float compute_error_of_weight_set_2planes( } // Resolve the final scalar accumulator sum - haccumulate(error_summa, error_summav); - - return error_summa; + return hadd_s(error_summav); } /* See header for documentation. 
*/ @@ -795,7 +799,7 @@ void compute_ideal_weights_for_decimation( // Transfer enough to also copy zero initialized SIMD over-fetch region unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); - for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH) + for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH) { vfloat weight(eai_in.weights + i); vfloat weight_error_scale(eai_in.weight_error_scale + i); @@ -1054,7 +1058,6 @@ static inline vfloat4 compute_rgbo_vector( } /* See header for documentation. */ -// TODO: Specialize for 1 partition? void recompute_ideal_colors_1plane( const image_block& blk, const partition_info& pi, @@ -1065,33 +1068,66 @@ void recompute_ideal_colors_1plane( vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS], vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS] ) { - int weight_count = di.weight_count; - int partition_count = pi.partition_count; + unsigned int weight_count = di.weight_count; + unsigned int total_texel_count = blk.texel_count; + unsigned int partition_count = pi.partition_count; promise(weight_count > 0); + promise(total_texel_count > 0); promise(partition_count > 0); const quantization_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_mode]; - float dec_weight_quant_uvalue[BLOCK_MAX_WEIGHTS]; - for (int i = 0; i < weight_count; i++) + float dec_weight[BLOCK_MAX_WEIGHTS]; + for (unsigned int i = 0; i < weight_count; i++) { - dec_weight_quant_uvalue[i] = qat.unquantized_value[dec_weights_quant_pvalue[i]] * (1.0f / 64.0f); + dec_weight[i] = qat.unquantized_value[dec_weights_quant_pvalue[i]] * (1.0f / 64.0f); } - for (int i = 0; i < partition_count; i++) + alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS]; + float* undec_weight_ref; + if (di.max_texel_weight_count == 1) + { + undec_weight_ref = dec_weight; + } + else if (di.max_texel_weight_count <= 2) + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = 
bilinear_infill_vla_2(di, dec_weight, i); + storea(weight, undec_weight + i); + } + + undec_weight_ref = undec_weight; + } + else { - vfloat4 rgba_sum(1e-17f); + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla(di, dec_weight, i); + storea(weight, undec_weight + i); + } + undec_weight_ref = undec_weight; + } + + vfloat4 rgba_sum(blk.data_mean * static_cast(blk.texel_count)); + + for (unsigned int i = 0; i < partition_count; i++) + { unsigned int texel_count = pi.partition_texel_count[i]; const uint8_t *texel_indexes = pi.texels_of_partition[i]; - // TODO: Use gathers? - promise(texel_count > 0); - for (unsigned int j = 0; j < texel_count; j++) + // Only compute a partition mean if more than one partition + if (partition_count > 1) { - unsigned int tix = texel_indexes[j]; - rgba_sum += blk.texel(tix); + rgba_sum = vfloat4(1e-17f); + promise(texel_count > 0); + for (unsigned int j = 0; j < texel_count; j++) + { + unsigned int tix = texel_indexes[j]; + rgba_sum += blk.texel(tix); + } } rgba_sum = rgba_sum * blk.channel_weight; @@ -1124,20 +1160,7 @@ void recompute_ideal_colors_1plane( vfloat4 rgba = blk.texel(tix); - float idx0; - if (di.max_texel_weight_count == 1) - { - assert(tix < BLOCK_MAX_WEIGHTS); - idx0 = dec_weight_quant_uvalue[tix]; - } - else if (di.max_texel_weight_count == 2) - { - idx0 = bilinear_infill_2(di, dec_weight_quant_uvalue, tix); - } - else - { - idx0 = bilinear_infill(di, dec_weight_quant_uvalue, tix); - } + float idx0 = undec_weight_ref[tix]; float om_idx0 = 1.0f - idx0; wmin1 = astc::min(idx0, wmin1); @@ -1270,18 +1293,61 @@ void recompute_ideal_colors_2planes( int plane2_component ) { unsigned int weight_count = di.weight_count; + unsigned int total_texel_count = blk.texel_count; + + promise(total_texel_count > 0); promise(weight_count > 0); const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_mode]); - float 
dec_weights_quant_uvalue_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; - float dec_weights_quant_uvalue_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; + float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; + float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); for (unsigned int i = 0; i < weight_count; i++) { - dec_weights_quant_uvalue_plane1[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane1[i]] * (1.0f / 64.0f); - dec_weights_quant_uvalue_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f); + dec_weight_plane1[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane1[i]] * (1.0f / 64.0f); + dec_weight_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f); + } + + alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS]; + alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS]; + + float* undec_weight_plane1_ref; + float* undec_weight_plane2_ref; + + if (di.max_texel_weight_count == 1) + { + undec_weight_plane1_ref = dec_weight_plane1; + undec_weight_plane2_ref = dec_weight_plane2; + } + else if (di.max_texel_weight_count <= 2) + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i); + storea(weight, undec_weight_plane1 + i); + + weight = bilinear_infill_vla_2(di, dec_weight_plane2, i); + storea(weight, undec_weight_plane2 + i); + } + + undec_weight_plane1_ref = undec_weight_plane1; + undec_weight_plane2_ref = undec_weight_plane2; + } + else + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i); + storea(weight, undec_weight_plane1 + i); + + weight = bilinear_infill_vla(di, dec_weight_plane2, i); + storea(weight, undec_weight_plane2 + i); + } + + undec_weight_plane1_ref = undec_weight_plane1; + undec_weight_plane2_ref = undec_weight_plane2; } unsigned 
int texel_count = bsd.texel_count; @@ -1320,20 +1386,7 @@ void recompute_ideal_colors_2planes( { vfloat4 rgba = blk.texel(j); - float idx0; - if (di.max_texel_weight_count == 1) - { - assert(j < BLOCK_MAX_WEIGHTS_2PLANE); - idx0 = dec_weights_quant_uvalue_plane1[j]; - } - else if (di.max_texel_weight_count == 2) - { - idx0 = bilinear_infill_2(di, dec_weights_quant_uvalue_plane1, j); - } - else - { - idx0 = bilinear_infill(di, dec_weights_quant_uvalue_plane1, j); - } + float idx0 = undec_weight_plane1_ref[j]; float om_idx0 = 1.0f - idx0; wmin1 = astc::min(idx0, wmin1); @@ -1347,20 +1400,7 @@ void recompute_ideal_colors_2planes( middle1_sum_s += om_idx0 * idx0; right1_sum_s += idx0 * idx0; - float idx1; - if (di.max_texel_weight_count == 1) - { - assert(j < BLOCK_MAX_WEIGHTS_2PLANE); - idx1 = dec_weights_quant_uvalue_plane2[j]; - } - else if (di.max_texel_weight_count == 2) - { - idx1 = bilinear_infill_2(di, dec_weights_quant_uvalue_plane2, j); - } - else - { - idx1 = bilinear_infill(di, dec_weights_quant_uvalue_plane2, j); - } + float idx1 = undec_weight_plane2_ref[j]; float om_idx1 = 1.0f - idx1; wmin2 = astc::min(idx1, wmin2); diff --git a/libkram/astc-encoder/astcenc_image.cpp b/libkram/astc-encoder/astcenc_image.cpp index cda80722..ff8c6755 100644 --- a/libkram/astc-encoder/astcenc_image.cpp +++ b/libkram/astc-encoder/astcenc_image.cpp @@ -176,7 +176,7 @@ void fetch_image_block( vfloat4 data_mean(0.0f); vfloat4 data_mean_scale(1.0f / static_cast(bsd.texel_count)); vfloat4 data_max(-1e38f); - bool grayscale = true; + vmask4 grayscalev(true); // This works because we impose the same choice everywhere during encode uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) || @@ -230,10 +230,7 @@ void fetch_image_block( data_mean += datav * data_mean_scale; data_max = max(data_max, datav); - if (grayscale && (datav.lane<0>() != datav.lane<1>() || datav.lane<0>() != datav.lane<2>())) - { - grayscale = false; - } + grayscalev = grayscalev & (datav.swz<0,0,0,0>() == 
datav.swz<1,1,2,2>()); blk.data_r[idx] = datav.lane<0>(); blk.data_g[idx] = datav.lane<1>(); @@ -264,7 +261,74 @@ void fetch_image_block( blk.data_min = data_min; blk.data_mean = data_mean; blk.data_max = data_max; - blk.grayscale = grayscale; + blk.grayscale = all(grayscalev); +} + +/* See header for documentation. */ +void fetch_image_block_fast_ldr( + astcenc_profile decode_mode, + const astcenc_image& img, + image_block& blk, + const block_size_descriptor& bsd, + unsigned int xpos, + unsigned int ypos, + unsigned int zpos, + const astcenc_swizzle& swz +) { + (void)swz; + (void)decode_mode; + + unsigned int xsize = img.dim_x; + unsigned int ysize = img.dim_y; + + blk.xpos = xpos; + blk.ypos = ypos; + blk.zpos = zpos; + + vfloat4 data_min(1e38f); + vfloat4 data_mean = vfloat4::zero(); + vfloat4 data_max(-1e38f); + vmask4 grayscalev(true); + int idx = 0; + + const uint8_t* plane = static_cast(img.data[0]); + for (unsigned int y = ypos; y < ypos + bsd.ydim; y++) + { + unsigned int yi = astc::min(y, ysize - 1); + + for (unsigned int x = xpos; x < xpos + bsd.xdim; x++) + { + unsigned int xi = astc::min(x, xsize - 1); + + vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi)); + vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f); + + // Compute block metadata + data_min = min(data_min, datav); + data_mean += datav; + data_max = max(data_max, datav); + + grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); + + blk.data_r[idx] = datav.lane<0>(); + blk.data_g[idx] = datav.lane<1>(); + blk.data_b[idx] = datav.lane<2>(); + blk.data_a[idx] = datav.lane<3>(); + + idx++; + } + } + + // Reverse the encoding so we store origin block in the original format + blk.origin_texel = blk.texel(0) / 65535.0f; + + // Store block metadata + blk.rgb_lns[0] = 0; + blk.alpha_lns[0] = 0; + blk.data_min = data_min; + blk.data_mean = data_mean / static_cast(bsd.texel_count); + blk.data_max = data_max; + blk.grayscale = all(grayscalev); } /* See header for 
documentation. */ @@ -403,10 +467,10 @@ void write_image_block( color = float_to_float16(colorf); } - data16[(4 * xsize * y) + (4 * x )] = (uint16_t)color.lane<0>(); - data16[(4 * xsize * y) + (4 * x + 1)] = (uint16_t)color.lane<1>(); - data16[(4 * xsize * y) + (4 * x + 2)] = (uint16_t)color.lane<2>(); - data16[(4 * xsize * y) + (4 * x + 3)] = (uint16_t)color.lane<3>(); + data16[(4 * xsize * y) + (4 * x )] = static_cast(color.lane<0>()); + data16[(4 * xsize * y) + (4 * x + 1)] = static_cast(color.lane<1>()); + data16[(4 * xsize * y) + (4 * x + 2)] = static_cast(color.lane<2>()); + data16[(4 * xsize * y) + (4 * x + 3)] = static_cast(color.lane<3>()); idx++; } diff --git a/libkram/astc-encoder/astcenc_internal.h b/libkram/astc-encoder/astcenc_internal.h index 6b711deb..aa7f6001 100644 --- a/libkram/astc-encoder/astcenc_internal.h +++ b/libkram/astc-encoder/astcenc_internal.h @@ -112,10 +112,10 @@ static constexpr unsigned int WEIGHTS_PLANE2_OFFSET { BLOCK_MAX_WEIGHTS_2PLANE } /** @brief The sum of quantized weights for one texel. */ static constexpr float WEIGHTS_TEXEL_SUM { 16.0f }; -/** @brief The number of block modes suported by the ASTC format. */ +/** @brief The number of block modes supported by the ASTC format. */ static constexpr unsigned int WEIGHTS_MAX_BLOCK_MODES { 2048 }; -/** @brief The number of weight grid decimation modes suported by the ASTC format. */ +/** @brief The number of weight grid decimation modes supported by the ASTC format. */ static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES { 87 }; /** @brief The high default error used to initialize error trackers. */ @@ -540,9 +540,6 @@ struct partition_info */ struct decimation_info { - // TODO: These structures are large. Any partitioning opportunities to - // improve caching and reduce miss rates? - /** @brief The total number of texels in the block. */ uint8_t texel_count; @@ -615,9 +612,6 @@ struct block_mode /** @brief Is a dual weight plane used by this block mode? 
*/ uint8_t is_dual_plane : 1; - /** @brief Is this mode enabled in the current search preset? */ - uint8_t percentile_hit : 1; - /** * @brief Get the weight quantization used by this block mode. * @@ -625,7 +619,7 @@ struct block_mode */ inline quant_method get_weight_quant_mode() const { - return (quant_method)this->quant_mode; + return static_cast(this->quant_mode); } }; @@ -640,8 +634,11 @@ struct decimation_mode /** @brief The max weight precision for 2 planes, or -1 if not supported. */ int8_t maxprec_2planes; - /** @brief Is this mode enabled in the current search preset? */ - uint8_t percentile_hit; + /** @brief Was this actually referenced by an active 1 plane mode? */ + uint8_t ref_1_plane; + + /** @brief Was this actually referenced by an active 2 plane mode? */ + uint8_t ref_2_planes; }; /** @@ -677,28 +674,40 @@ struct block_size_descriptor /** @brief The block total texel count. */ uint8_t texel_count; - /** @brief The number of stored decimation modes. */ - unsigned int decimation_mode_count; - /** * @brief The number of stored decimation modes which are "always" modes. * * Always modes are stored at the start of the decimation_modes list. */ - unsigned int always_decimation_mode_count; + unsigned int decimation_mode_count_always; - /** @brief The number of stored block modes. */ - unsigned int block_mode_count; + /** @brief The number of stored decimation modes for selected encodings. */ + unsigned int decimation_mode_count_selected; - /** @brief The number of active partitionings for 1/2/3/4 partitionings. */ - unsigned int partitioning_count[BLOCK_MAX_PARTITIONS]; + /** @brief The number of stored decimation modes for any encoding. */ + unsigned int decimation_mode_count_all; /** * @brief The number of stored block modes which are "always" modes. * * Always modes are stored at the start of the block_modes list. 
*/ - unsigned int always_block_mode_count; + unsigned int block_mode_count_1plane_always; + + /** @brief The number of stored block modes for active 1 plane encodings. */ + unsigned int block_mode_count_1plane_selected; + + /** @brief The number of stored block modes for active 1 and 2 plane encodings. */ + unsigned int block_mode_count_1plane_2plane_selected; + + /** @brief The number of stored block modes for any encoding. */ + unsigned int block_mode_count_all; + + /** @brief The number of selected partitionings for 1/2/3/4 partitionings. */ + unsigned int partitioning_count_selected[BLOCK_MAX_PARTITIONS]; + + /** @brief The number of partitionings for 1/2/3/4 partitionings. */ + unsigned int partitioning_count_all[BLOCK_MAX_PARTITIONS]; /** @brief The active decimation modes, stored in low indices. */ decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES]; @@ -781,7 +790,7 @@ struct block_size_descriptor const block_mode& get_block_mode(unsigned int block_mode) const { unsigned int packed_index = this->block_mode_packed_index[block_mode]; - assert(packed_index != BLOCK_BAD_BLOCK_MODE && packed_index < this->block_mode_count); + assert(packed_index != BLOCK_BAD_BLOCK_MODE && packed_index < this->block_mode_count_all); return this->block_modes[packed_index]; } @@ -850,7 +859,7 @@ struct block_size_descriptor packed_index = this->partitioning_packed_index[partition_count - 2][index]; } - assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count[partition_count - 1]); + assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]); auto& result = get_partition_table(partition_count)[packed_index]; assert(index == result.partition_index); return result; @@ -866,7 +875,7 @@ struct block_size_descriptor */ const partition_info& get_raw_partition_info(unsigned int partition_count, unsigned int packed_index) const { - assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index 
< this->partitioning_count[partition_count - 1]); + assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]); auto& result = get_partition_table(partition_count)[packed_index]; return result; } @@ -970,7 +979,7 @@ struct image_block */ inline float get_default_alpha() const { - return this->alpha_lns[0] ? (float)0x7800 : (float)0xFFFF; + return this->alpha_lns[0] ? static_cast(0x7800) : static_cast(0xFFFF); } /** @@ -1396,9 +1405,6 @@ struct astcenc_context /** @brief The pixel region and variance worker arguments. */ avg_args avg_preprocess_args; - /** @brief The per-texel deblocking weights for the current block size. */ - float deblock_weights[BLOCK_MAX_TEXELS]; - /** @brief The parallel manager for averages computation. */ ParallelManager manage_avg; @@ -1505,16 +1511,20 @@ bool is_legal_3d_block_size( /** * @brief The precomputed table for quantizing color values. * - * Indexed by [quant_mode][data_value]. + * Returned value is in the ASTC BISE scrambled order. + * + * Indexed by [quant_mode - 4][data_value]. */ -extern const uint8_t color_quant_tables[21][256]; +extern const uint8_t color_quant_tables[17][256]; /** * @brief The precomputed table for unquantizing color values. * - * Indexed by [quant_mode][data_value]. + * Returned value is in the ASTC BISE scrambled order. + * + * Indexed by [quant_mode - 4][data_value]. */ -extern const uint8_t color_unquant_tables[21][256]; +extern const uint8_t color_unquant_tables[17][256]; /** * @brief The precomputed quant mode storage table. @@ -1523,7 +1533,7 @@ extern const uint8_t color_unquant_tables[21][256]; * number of compressed storage bits. Returns -1 for cases where the requested integer count cannot * ever fit in the supplied storage size. */ -extern const int8_t quant_mode_table[17][128]; +extern const int8_t quant_mode_table[10][128]; /** * @brief Encode a packed string using BISE. 
@@ -1760,7 +1770,7 @@ void compute_averages( const avg_args& ag); /** - * @brief Fetch a single image block from the input image + * @brief Fetch a single image block from the input image. * * @param decode_mode The compression color profile. * @param img The input image data. @@ -1782,7 +1792,32 @@ void fetch_image_block( const astcenc_swizzle& swz); /** - * @brief Write a single image block from the output image + * @brief Fetch a single image block from the input image. + * + * This specialized variant can be used only if the block is 2D LDR U8 data, + * with no swizzle. + * + * @param decode_mode The compression color profile. + * @param img The input image data. + * @param[out] blk The image block to populate. + * @param bsd The block size information. + * @param xpos The block X coordinate in the input image. + * @param ypos The block Y coordinate in the input image. + * @param zpos The block Z coordinate in the input image. + * @param swz The swizzle to apply on load. + */ +void fetch_image_block_fast_ldr( + astcenc_profile decode_mode, + const astcenc_image& img, + image_block& blk, + const block_size_descriptor& bsd, + unsigned int xpos, + unsigned int ypos, + unsigned int zpos, + const astcenc_swizzle& swz); + +/** + * @brief Write a single image block from the output image. * * @param[out] img The input image data. * @param blk The image block to populate. @@ -2113,14 +2148,14 @@ void unpack_weights( * combination for each. The modified quantization level can be used when all formats are the same, * as this frees up two additional bits of storage. * - * @param bsd The block size information. * @param pi The partition info for the current trial. * @param blk The image block color data to compress. * @param ep The ideal endpoints. * @param qwt_bitcounts Bit counts for different quantization methods. * @param qwt_errors Errors for different quantization methods. * @param tune_candidate_limit The max number of candidates to return, may be less. 
- * @param block_mode_count The number of blocks mofdes candidates to inspect. + * @param start_block_mode The first block mode to inspect. + * @param end_block_mode The last block mode to inspect. * @param[out] partition_format_specifiers The best formats per partition. * @param[out] block_mode The best packed block mode indexes. * @param[out] quant_level The best color quant level. @@ -2130,14 +2165,14 @@ void unpack_weights( * @return The actual number of candidate matches returned. */ unsigned int compute_ideal_endpoint_formats( - const block_size_descriptor& bsd, const partition_info& pi, const image_block& blk, const endpoints& ep, const int* qwt_bitcounts, const float* qwt_errors, unsigned int tune_candidate_limit, - unsigned int block_mode_count, + unsigned int start_block_mode, + unsigned int end_block_mode, int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS], int block_mode[TUNE_MAX_TRIAL_CANDIDATES], quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES], @@ -2209,14 +2244,14 @@ void prepare_angular_tables(); * @param tune_low_weight_limit Weight count cutoff below which we use simpler searches. * @param only_always Only consider block modes that are always enabled. * @param bsd The block size descriptor for the current trial. - * @param dec_weight_quant_uvalue The decimated and quantized weight values. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. * @param[out] tmpbuf Preallocated scratch buffers for the compressor. */ void compute_angular_endpoints_1plane( unsigned int tune_low_weight_limit, bool only_always, const block_size_descriptor& bsd, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, compression_working_buffers& tmpbuf); /** @@ -2224,13 +2259,13 @@ void compute_angular_endpoints_1plane( * * @param tune_low_weight_limit Weight count cutoff below which we use simpler searches. * @param bsd The block size descriptor for the current trial. 
- * @param dec_weight_quant_uvalue The decimated and quantized weight values. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. * @param[out] tmpbuf Preallocated scratch buffers for the compressor. */ void compute_angular_endpoints_2planes( unsigned int tune_low_weight_limit, const block_size_descriptor& bsd, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, compression_working_buffers& tmpbuf); /* ============================================================================ @@ -2433,9 +2468,9 @@ template void aligned_free(T* ptr) { #if defined(_WIN32) - _aligned_free((void*)ptr); + _aligned_free(reinterpret_cast(ptr)); #else - free((void*)ptr); + free(reinterpret_cast(ptr)); #endif } diff --git a/libkram/astc-encoder/astcenc_mathlib.h b/libkram/astc-encoder/astcenc_mathlib.h index 4876749b..67e989e7 100644 --- a/libkram/astc-encoder/astcenc_mathlib.h +++ b/libkram/astc-encoder/astcenc_mathlib.h @@ -308,7 +308,7 @@ static inline float flt_rd(float v) static inline int flt2int_rtn(float v) { - return (int)(v + 0.5f); + return static_cast(v + 0.5f); } /** @@ -320,32 +320,35 @@ static inline int flt2int_rtn(float v) */ static inline int flt2int_rd(float v) { - return (int)(v); + return static_cast(v); } /** - * @brief Population bit count. + * @brief SP float bit-interpreted as an integer. * - * @param v The value to population count. + * @param v The value to bitcast. * - * @return The number of 1 bits. + * @return The converted value. 
*/ -static inline int popcount(uint64_t v) +static inline int float_as_int(float v) { -#if ASTCENC_POPCNT >= 1 - return (int)_mm_popcnt_u64(v); -#else - uint64_t mask1 = 0x5555555555555555ULL; - uint64_t mask2 = 0x3333333333333333ULL; - uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL; - v -= (v >> 1) & mask1; - v = (v & mask2) + ((v >> 2) & mask2); - v += v >> 4; - v &= mask3; - v *= 0x0101010101010101ULL; - v >>= 56; - return (int)v; -#endif + union { int a; float b; } u; + u.b = v; + return u.a; +} + +/** + * @brief Integer bit-interpreted as an SP float. + * + * @param v The value to bitcast. + * + * @return The converted value. + */ +static inline float int_as_float(int v) +{ + union { int a; float b; } u; + u.a = v; + return u.b; } /** diff --git a/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp b/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp index 2665c0d8..d95fb9da 100644 --- a/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp +++ b/libkram/astc-encoder/astcenc_mathlib_softfloat.cpp @@ -283,7 +283,7 @@ static sf16 sf32_to_sf16(sf32 inp, roundmode rmode) -inp will set the MSB if the input number is nonzero. Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise. 
*/ - return (uint32_t) (-(int32_t) inp) >> 31; + return static_cast((-static_cast(inp))) >> 31; /* negative, exponent = , round-mode == DOWN, need to check whether number is diff --git a/libkram/astc-encoder/astcenc_partition_tables.cpp b/libkram/astc-encoder/astcenc_partition_tables.cpp index 8769ec63..6b97e49d 100644 --- a/libkram/astc-encoder/astcenc_partition_tables.cpp +++ b/libkram/astc-encoder/astcenc_partition_tables.cpp @@ -381,45 +381,71 @@ static void build_partition_table_for_one_partition_count( }; unsigned int next_index = 0; - bsd.partitioning_count[partition_count - 1] = 0; + bsd.partitioning_count_selected[partition_count - 1] = 0; + bsd.partitioning_count_all[partition_count - 1] = 0; + // Skip tables larger than config max partition count if we can omit modes if (can_omit_partitionings && (partition_count > partition_count_cutoff)) { return; } - for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++) + // Iterate through twice + // - Pass 0: Keep selected partitionings + // - Pass 1: Keep non-selected partitionings (skip if in omit mode) + unsigned int max_iter = can_omit_partitionings ? 
1 : 2; + + // Tracker for things we built in the first iteration + uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 }; + for (unsigned int x = 0; x < max_iter; x++) { - bool keep = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]); - if (can_omit_partitionings && !keep) + for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++) { - bsd.partitioning_packed_index[partition_count - 2][i] = BLOCK_BAD_PARTITIONING; - continue; - } + // Don't include things we built in the first pass + if ((x == 1) && build[i]) + { + continue; + } - generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * 7); - keep = true; - for (unsigned int j = 0; j < next_index; j++) - { - bool match = compare_canonical_partitionings(canonical_patterns + 7 * next_index, canonical_patterns + 7 * j); - if (match) + bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]); + if ((x == 0) && !keep_useful) { - ptab[next_index].partition_count = 0; - partitioning_valid[partition_count - 2][next_index] = 255; - keep = !can_omit_partitionings; - break; + continue; } - } - if (keep) - { - bsd.partitioning_packed_index[partition_count - 2][i] = next_index; - bsd.partitioning_count[partition_count - 1] = next_index + 1; - next_index++; - } - else - { - bsd.partitioning_packed_index[partition_count - 2][i] = BLOCK_BAD_PARTITIONING; + generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * 7); + bool keep_canonical = true; + for (unsigned int j = 0; j < next_index; j++) + { + bool match = compare_canonical_partitionings(canonical_patterns + 7 * next_index, canonical_patterns + 7 * j); + if (match) + { + keep_canonical = false; + break; + } + } + + if (keep_useful && keep_canonical) + { + if (x == 0) + { + bsd.partitioning_packed_index[partition_count - 2][i] = next_index; + 
bsd.partitioning_count_selected[partition_count - 1]++; + bsd.partitioning_count_all[partition_count - 1]++; + build[i] = 1; + next_index++; + } + } + else + { + if (x == 1) + { + bsd.partitioning_packed_index[partition_count - 2][i] = next_index; + bsd.partitioning_count_all[partition_count - 1]++; + partitioning_valid[partition_count - 2][next_index] = 255; + next_index++; + } + } } } } @@ -436,7 +462,8 @@ void init_partition_tables( partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS; generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1); - bsd.partitioning_count[0] = 1; + bsd.partitioning_count_selected[0] = 1; + bsd.partitioning_count_all[0] = 1; uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * 7]; build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns); diff --git a/libkram/astc-encoder/astcenc_percentile_tables.cpp b/libkram/astc-encoder/astcenc_percentile_tables.cpp index 1744d996..57903050 100644 --- a/libkram/astc-encoder/astcenc_percentile_tables.cpp +++ b/libkram/astc-encoder/astcenc_percentile_tables.cpp @@ -1184,7 +1184,7 @@ const float *get_2d_percentile_table( unsigned int idx = item & 0x7FF; unsigned int weight = (item >> 11) & 0x1F; accum += weight; - unpacked_table[idx] = (float)accum / (float)difscale; + unpacked_table[idx] = static_cast(accum) / static_cast(difscale); } } diff --git a/libkram/astc-encoder/astcenc_pick_best_endpoint_format.cpp b/libkram/astc-encoder/astcenc_pick_best_endpoint_format.cpp index 032a680a..fc00b74e 100644 --- a/libkram/astc-encoder/astcenc_pick_best_endpoint_format.cpp +++ b/libkram/astc-encoder/astcenc_pick_best_endpoint_format.cpp @@ -89,10 +89,10 @@ static void compute_error_squared_rgb_single_partition( const uint8_t* texel_indexes = pi.texels_of_partition[partition_index]; promise(texel_count > 0); - vfloat4 a_drop_errv = vfloat4::zero(); + vfloatacc a_drop_errv = vfloatacc::zero(); vfloat 
default_a(blk.get_default_alpha()); - vfloat4 uncor_errv = vfloat4::zero(); + vfloatacc uncor_errv = vfloatacc::zero(); vfloat uncor_bs0(uncor_pline.bs.lane<0>()); vfloat uncor_bs1(uncor_pline.bs.lane<1>()); vfloat uncor_bs2(uncor_pline.bs.lane<2>()); @@ -101,12 +101,12 @@ static void compute_error_squared_rgb_single_partition( vfloat uncor_amod1(uncor_pline.amod.lane<1>()); vfloat uncor_amod2(uncor_pline.amod.lane<2>()); - vfloat4 samec_errv = vfloat4::zero(); + vfloatacc samec_errv = vfloatacc::zero(); vfloat samec_bs0(samec_pline.bs.lane<0>()); vfloat samec_bs1(samec_pline.bs.lane<1>()); vfloat samec_bs2(samec_pline.bs.lane<2>()); - vfloat4 rgbl_errv = vfloat4::zero(); + vfloatacc rgbl_errv = vfloatacc::zero(); vfloat rgbl_bs0(rgbl_pline.bs.lane<0>()); vfloat rgbl_bs1(rgbl_pline.bs.lane<1>()); vfloat rgbl_bs2(rgbl_pline.bs.lane<2>()); @@ -115,7 +115,7 @@ static void compute_error_squared_rgb_single_partition( vfloat rgbl_amod1(rgbl_pline.amod.lane<1>()); vfloat rgbl_amod2(rgbl_pline.amod.lane<2>()); - vfloat4 l_errv = vfloat4::zero(); + vfloatacc l_errv = vfloatacc::zero(); vfloat l_bs0(l_pline.bs.lane<0>()); vfloat l_bs1(l_pline.bs.lane<1>()); vfloat l_bs2(l_pline.bs.lane<2>()); @@ -132,8 +132,8 @@ static void compute_error_squared_rgb_single_partition( vfloat data_a = gatherf(blk.data_a, tix); vfloat alpha_diff = data_a - default_a; alpha_diff = alpha_diff * alpha_diff; - alpha_diff = select(vfloat::zero(), alpha_diff, mask); - haccumulate(a_drop_errv, alpha_diff); + + haccumulate(a_drop_errv, alpha_diff, mask); vfloat data_r = gatherf(blk.data_r, tix); vfloat data_g = gatherf(blk.data_g, tix); @@ -152,8 +152,7 @@ static void compute_error_squared_rgb_single_partition( + dist1 * dist1 * ews.lane<1>() + dist2 * dist2 * ews.lane<2>(); - error = select(vfloat::zero(), error, mask); - haccumulate(uncor_errv, error); + haccumulate(uncor_errv, error, mask); // Compute same chroma error - no "amod", its always zero param = data_r * samec_bs0 @@ -168,8 +167,7 @@ 
static void compute_error_squared_rgb_single_partition( + dist1 * dist1 * ews.lane<1>() + dist2 * dist2 * ews.lane<2>(); - error = select(vfloat::zero(), error, mask); - haccumulate(samec_errv, error); + haccumulate(samec_errv, error, mask); // Compute rgbl error param = data_r * rgbl_bs0 @@ -184,8 +182,7 @@ static void compute_error_squared_rgb_single_partition( + dist1 * dist1 * ews.lane<1>() + dist2 * dist2 * ews.lane<2>(); - error = select(vfloat::zero(), error, mask); - haccumulate(rgbl_errv, error); + haccumulate(rgbl_errv, error, mask); // Compute luma error - no "amod", its always zero param = data_r * l_bs0 @@ -200,11 +197,10 @@ static void compute_error_squared_rgb_single_partition( + dist1 * dist1 * ews.lane<1>() + dist2 * dist2 * ews.lane<2>(); - error = select(vfloat::zero(), error, mask); - haccumulate(l_errv, error); + haccumulate(l_errv, error, mask); } - a_drop_err = hadd_s(a_drop_errv * ews.lane<3>()); + a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>(); uncor_err = hadd_s(uncor_errv); samec_err = hadd_s(samec_errv); rgbl_err = hadd_s(rgbl_errv); @@ -218,24 +214,19 @@ static void compute_error_squared_rgb_single_partition( * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether * the endpoints are eligible for offset encoding or blue-contraction * - * @param bsd The block size information. * @param blk The image block. * @param pi The partition info data. * @param ep The idealized endpoints. * @param[out] eci The resulting encoding choice error metrics. 
*/ static void compute_encoding_choice_errors( - const block_size_descriptor& bsd, const image_block& blk, const partition_info& pi, const endpoints& ep, encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]) { int partition_count = pi.partition_count; - int texels_per_block = bsd.texel_count; - promise(partition_count > 0); - promise(texels_per_block > 0); partition_metrics pms[BLOCK_MAX_PARTITIONS]; @@ -429,9 +420,9 @@ static void compute_color_error_for_every_integer_count_and_quant_level( // Estimate of color-component spread in low endpoint color float df = hmax_s(abs(pdif)); - int b = (int)bf; - int c = (int)cf; - int d = (int)df; + int b = static_cast(bf); + int c = static_cast(cf); + int d = static_cast(df); // Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode int rgbo_mode = 5; // 7 bits per component @@ -531,7 +522,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level( mode23mult *= 0.0005f; // Empirically determined .... // Pick among the available HDR endpoint modes - for (int i = 0; i < 8; i++) + for (int i = QUANT_2; i < QUANT_16; i++) { best_error[i][3] = ERROR_CALC_DEFAULT; best_error[i][2] = ERROR_CALC_DEFAULT; @@ -544,7 +535,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level( format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; } - for (int i = 8; i < 21; i++) + for (int i = QUANT_16; i <= QUANT_256; i++) { // The base_quant_error should depend on the scale-factor that would be used during // actual encode of the color value @@ -579,7 +570,7 @@ static void compute_color_error_for_every_integer_count_and_quant_level( } else { - for (int i = 0; i < 4; i++) + for (int i = QUANT_2; i < QUANT_6; i++) { best_error[i][3] = ERROR_CALC_DEFAULT; best_error[i][2] = ERROR_CALC_DEFAULT; @@ -603,10 +594,10 @@ static void compute_color_error_for_every_integer_count_and_quant_level( float error_scale_oe_rgb = eci.can_offset_encode ? 
0.25f : 1.0f; // Pick among the available LDR endpoint modes - for (int i = 4; i < 21; i++) + for (int i = QUANT_6; i <= QUANT_256; i++) { // Offset encoding not possible at higher quant levels - if (i == 19) + if (i >= QUANT_192) { error_scale_oe_rgba = 1.0f; error_scale_oe_rgb = 1.0f; @@ -702,7 +693,7 @@ static float one_partition_find_best_combination_for_bitcount( int& best_format ) { int best_integer_count = 0; - float best_integer_count_error = 1e20f; + float best_integer_count_error = ERROR_CALC_DEFAULT; for (int integer_count = 1; integer_count <= 4; integer_count++) { @@ -710,7 +701,7 @@ static float one_partition_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! - if (quant_level < 0) + if (quant_level < QUANT_6) { continue; } @@ -725,10 +716,10 @@ static float one_partition_find_best_combination_for_bitcount( int ql = quant_mode_table[best_integer_count + 1][bits_available]; - best_quant_level = (quant_method)ql; + best_quant_level = static_cast(ql); best_format = FMT_LUMINANCE; - if (ql >= 0) + if (ql >= QUANT_6) { best_format = best_combined_format[ql][best_integer_count]; } @@ -750,7 +741,7 @@ static void two_partitions_find_best_combination_for_every_quantization_and_inte float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2) int best_combined_format[21][7][2] ) { - for (int i = 0; i < 21; i++) + for (int i = QUANT_2; i <= QUANT_256; i++) { for (int j = 0; j < 7; j++) { @@ -758,7 +749,7 @@ static void two_partitions_find_best_combination_for_every_quantization_and_inte } } - for (int quant = 5; quant < 21; quant++) + for (int quant = QUANT_6; quant <= QUANT_256; quant++) { for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair { @@ -805,7 +796,7 @@ static float two_partitions_find_best_combination_for_bitcount( int* best_formats ) { int best_integer_count = 0; - float 
best_integer_count_error = 1e20f; + float best_integer_count_error = ERROR_CALC_DEFAULT; for (int integer_count = 2; integer_count <= 8; integer_count++) { @@ -813,7 +804,7 @@ static float two_partitions_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! - if (quant_level < 0) + if (quant_level < QUANT_6) { break; } @@ -829,10 +820,10 @@ static float two_partitions_find_best_combination_for_bitcount( int ql = quant_mode_table[best_integer_count][bits_available]; int ql_mod = quant_mode_table[best_integer_count][bits_available + 2]; - best_quant_level = (quant_method)ql; - best_quant_level_mod = (quant_method)ql_mod; + best_quant_level = static_cast(ql); + best_quant_level_mod = static_cast(ql_mod); - if (ql >= 0) + if (ql >= QUANT_6) { for (int i = 0; i < 2; i++) { @@ -864,7 +855,7 @@ static void three_partitions_find_best_combination_for_every_quantization_and_in float best_combined_error[21][10], int best_combined_format[21][10][3] ) { - for (int i = 0; i < 21; i++) + for (int i = QUANT_2; i <= QUANT_256; i++) { for (int j = 0; j < 10; j++) { @@ -872,7 +863,7 @@ static void three_partitions_find_best_combination_for_every_quantization_and_in } } - for (int quant = 5; quant < 21; quant++) + for (int quant = QUANT_6; quant <= QUANT_256; quant++) { for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair { @@ -930,7 +921,7 @@ static float three_partitions_find_best_combination_for_bitcount( int* best_formats ) { int best_integer_count = 0; - float best_integer_count_error = 1e20f; + float best_integer_count_error = ERROR_CALC_DEFAULT; for (int integer_count = 3; integer_count <= 9; integer_count++) { @@ -938,7 +929,7 @@ static float three_partitions_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! 
- if (quant_level < 0) + if (quant_level < QUANT_6) { break; } @@ -954,10 +945,10 @@ static float three_partitions_find_best_combination_for_bitcount( int ql = quant_mode_table[best_integer_count][bits_available]; int ql_mod = quant_mode_table[best_integer_count][bits_available + 5]; - best_quant_level = (quant_method)ql; - best_quant_level_mod = (quant_method)ql_mod; + best_quant_level = static_cast(ql); + best_quant_level_mod = static_cast(ql_mod); - if (ql >= 0) + if (ql >= QUANT_6) { for (int i = 0; i < 3; i++) { @@ -989,7 +980,7 @@ static void four_partitions_find_best_combination_for_every_quantization_and_int float best_combined_error[21][13], int best_combined_format[21][13][4] ) { - for (int i = 0; i < 21; i++) + for (int i = QUANT_2; i <= QUANT_256; i++) { for (int j = 0; j < 13; j++) { @@ -997,7 +988,7 @@ static void four_partitions_find_best_combination_for_every_quantization_and_int } } - for (int quant = 5; quant < 21; quant++) + for (int quant = QUANT_6; quant <= QUANT_256; quant++) { for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair { @@ -1066,7 +1057,7 @@ static float four_partitions_find_best_combination_for_bitcount( int* best_formats ) { int best_integer_count = 0; - float best_integer_count_error = 1e20f; + float best_integer_count_error = ERROR_CALC_DEFAULT; for (int integer_count = 4; integer_count <= 9; integer_count++) { @@ -1074,7 +1065,7 @@ static float four_partitions_find_best_combination_for_bitcount( int quant_level = quant_mode_table[integer_count][bits_available]; // Don't have enough bits to represent a given endpoint format at all! 
- if (quant_level < 0) + if (quant_level < QUANT_6) { break; } @@ -1090,10 +1081,10 @@ static float four_partitions_find_best_combination_for_bitcount( int ql = quant_mode_table[best_integer_count][bits_available]; int ql_mod = quant_mode_table[best_integer_count][bits_available + 8]; - best_quant_level = (quant_method)ql; - best_quant_level_mod = (quant_method)ql_mod; + best_quant_level = static_cast(ql); + best_quant_level_mod = static_cast(ql_mod); - if (ql >= 0) + if (ql >= QUANT_6) { for (int i = 0; i < 4; i++) { @@ -1113,7 +1104,6 @@ static float four_partitions_find_best_combination_for_bitcount( /* See header for documentation. */ unsigned int compute_ideal_endpoint_formats( - const block_size_descriptor& bsd, const partition_info& pi, const image_block& blk, const endpoints& ep, @@ -1121,7 +1111,8 @@ unsigned int compute_ideal_endpoint_formats( const int* qwt_bitcounts, const float* qwt_errors, unsigned int tune_candidate_limit, - unsigned int block_mode_count, + unsigned int start_block_mode, + unsigned int end_block_mode, // output data int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS], int block_mode[TUNE_MAX_TRIAL_CANDIDATES], @@ -1132,7 +1123,6 @@ unsigned int compute_ideal_endpoint_formats( int partition_count = pi.partition_count; promise(partition_count > 0); - promise(block_mode_count > 0); int encode_hdr_rgb = blk.rgb_lns[0]; int encode_hdr_alpha = blk.alpha_lns[0]; @@ -1140,7 +1130,7 @@ unsigned int compute_ideal_endpoint_formats( // Compute the errors that result from various encoding choices (such as using luminance instead // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; - compute_encoding_choice_errors(bsd, blk, pi, ep, eci); + compute_encoding_choice_errors(blk, pi, ep, eci); float best_error[BLOCK_MAX_PARTITIONS][21][4]; int format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; @@ -1159,8 +1149,17 @@ unsigned int 
compute_ideal_endpoint_formats( // Ensure that the "overstep" of the last iteration in the vectorized loop will contain data // that will never be picked as best candidate - const int packed_mode_count_simd_up = round_up_to_simd_multiple_vla(block_mode_count); - for (int i = block_mode_count; i < packed_mode_count_simd_up; i++) + const unsigned int packed_end_block_mode = round_up_to_simd_multiple_vla(end_block_mode); + + // TODO: Can we avoid this? + for (unsigned int i = 0; i < start_block_mode; i++) + { + errors_of_best_combination[i] = ERROR_CALC_DEFAULT; + best_quant_levels[i] = QUANT_2; + best_quant_levels_mod[i] = QUANT_2; + } + + for (unsigned int i = end_block_mode; i < packed_end_block_mode; i++) { errors_of_best_combination[i] = ERROR_CALC_DEFAULT; best_quant_levels[i] = QUANT_2; @@ -1174,7 +1173,7 @@ unsigned int compute_ideal_endpoint_formats( // The block contains 1 partition if (partition_count == 1) { - for (unsigned int i = 0; i < block_mode_count; ++i) + for (unsigned int i = start_block_mode; i < end_block_mode; ++i) { if (qwt_errors[i] >= ERROR_CALC_DEFAULT) { @@ -1206,7 +1205,8 @@ unsigned int compute_ideal_endpoint_formats( two_partitions_find_best_combination_for_every_quantization_and_integer_count( best_error, format_of_choice, combined_best_error, formats_of_choice); - for (unsigned int i = 0; i < block_mode_count; ++i) + assert(start_block_mode == 0); + for (unsigned int i = 0; i < end_block_mode; ++i) { if (qwt_errors[i] >= ERROR_CALC_DEFAULT) { @@ -1238,7 +1238,8 @@ unsigned int compute_ideal_endpoint_formats( three_partitions_find_best_combination_for_every_quantization_and_integer_count( best_error, format_of_choice, combined_best_error, formats_of_choice); - for (unsigned int i = 0; i < block_mode_count; ++i) + assert(start_block_mode == 0); + for (unsigned int i = 0; i < end_block_mode; ++i) { if (qwt_errors[i] >= ERROR_CALC_DEFAULT) { @@ -1271,7 +1272,8 @@ unsigned int compute_ideal_endpoint_formats( 
four_partitions_find_best_combination_for_every_quantization_and_integer_count( best_error, format_of_choice, combined_best_error, formats_of_choice); - for (unsigned int i = 0; i < block_mode_count; ++i) + assert(start_block_mode == 0); + for (unsigned int i = 0; i < end_block_mode; ++i) { if (qwt_errors[i] >= ERROR_CALC_DEFAULT) { @@ -1309,12 +1311,14 @@ unsigned int compute_ideal_endpoint_formats( { vint vbest_error_index(-1); vfloat vbest_ep_error(ERROR_CALC_DEFAULT); - vint lane_ids = vint::lane_id(); - for (unsigned int j = 0; j < block_mode_count; j += ASTCENC_SIMD_WIDTH) + + start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); + vint lane_ids = vint::lane_id() + vint(start_block_mode); + for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH) { vfloat err = vfloat(&errors_of_best_combination[j]); vmask mask1 = err < vbest_ep_error; - vmask mask2 = vint((int*)(&best_quant_levels[j])) > vint(4); + vmask mask2 = vint(reinterpret_cast(best_quant_levels + j)) > vint(4); vmask mask = mask1 & mask2; vbest_ep_error = select(vbest_ep_error, err, mask); vbest_error_index = select(vbest_error_index, lane_ids, mask); @@ -1349,9 +1353,13 @@ unsigned int compute_ideal_endpoint_formats( } block_mode[i] = best_error_weights[i]; + quant_level[i] = best_quant_levels[best_error_weights[i]]; - assert(quant_level[i] >= 0 && quant_level[i] < 21); quant_level_mod[i] = best_quant_levels_mod[best_error_weights[i]]; + + assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256); + assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256); + for (int j = 0; j < partition_count; j++) { partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j]; diff --git a/libkram/astc-encoder/astcenc_quantization.cpp b/libkram/astc-encoder/astcenc_quantization.cpp index 2d48abff..233a9338 100644 --- a/libkram/astc-encoder/astcenc_quantization.cpp +++ b/libkram/astc-encoder/astcenc_quantization.cpp @@ -23,79 +23,9 
@@ #if !defined(ASTCENC_DECOMPRESS_ONLY) -const uint8_t color_quant_tables[21][256] { - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 - }, - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 - }, - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 - }, +// Starts from QUANT_6 +// Scrambled +const uint8_t color_quant_tables[17][256] { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, @@ -406,19 +336,9 @@ const uint8_t color_quant_tables[21][256] { #endif -const uint8_t color_unquant_tables[21][256] { - { - 0, 255 - }, - { - 0, 128, 255 - }, - { - 0, 85, 170, 255 - }, - { - 0, 64, 128, 192, 255 - }, +// Starts from QUANT_6 +// Scrambled +const uint8_t color_unquant_tables[17][256] { { 0, 255, 51, 204, 102, 153 }, @@ -535,7 +455,7 @@ const uint8_t color_unquant_tables[21][256] { // The quant_mode_table[integercount/2][bits] gives us the quantization level for a given integer 
// count and number of bits that the integer may fit into. -const int8_t quant_mode_table[17][128] { +const int8_t quant_mode_table[10][128] { { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -635,75 +555,5 @@ const int8_t quant_mode_table[17][128] { 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, - 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, - 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, - 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, - 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 15 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, - 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, - 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, - 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, - 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, - 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8 - }, - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 - }, + } }; diff --git a/libkram/astc-encoder/astcenc_symbolic_physical.cpp b/libkram/astc-encoder/astcenc_symbolic_physical.cpp index af68bd4a..2afd460e 100644 --- a/libkram/astc-encoder/astcenc_symbolic_physical.cpp +++ b/libkram/astc-encoder/astcenc_symbolic_physical.cpp @@ -352,7 +352,7 @@ void physical_to_symbolic( const auto& di = 
bsd.get_decimation_info(bm.decimation_mode); int weight_count = di.weight_count; - quant_method weight_quant_method = (quant_method)bm.quant_mode; + quant_method weight_quant_method = static_cast(bm.quant_mode); int is_dual_plane = bm.is_dual_plane; int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count; @@ -479,9 +479,10 @@ void physical_to_symbolic( } // Unpack the integer color values and assign to endpoints - scb.quant_mode = (quant_method)color_quant_level; + scb.quant_mode = static_cast(color_quant_level); uint8_t values_to_decode[32]; - decode_ise((quant_method)color_quant_level, color_integer_count, pcb.data, values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS)); + decode_ise(static_cast(color_quant_level), color_integer_count, pcb.data, + values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS)); int valuecount_to_decode = 0; for (int i = 0; i < partition_count; i++) diff --git a/libkram/astc-encoder/astcenc_vecmathlib.h b/libkram/astc-encoder/astcenc_vecmathlib.h index 069c03c9..07a2ff3f 100644 --- a/libkram/astc-encoder/astcenc_vecmathlib.h +++ b/libkram/astc-encoder/astcenc_vecmathlib.h @@ -75,6 +75,13 @@ #define ASTCENC_SIMD_WIDTH 8 using vfloat = vfloat8; + + #if defined(ASTCENC_NO_INVARIANCE) + using vfloatacc = vfloat8; + #else + using vfloatacc = vfloat4; + #endif + using vint = vint8; using vmask = vmask8; @@ -89,6 +96,7 @@ #define ASTCENC_SIMD_WIDTH 4 using vfloat = vfloat4; + using vfloatacc = vfloat4; using vint = vint4; using vmask = vmask4; @@ -103,6 +111,7 @@ #define ASTCENC_SIMD_WIDTH 4 using vfloat = vfloat4; + using vfloatacc = vfloat4; using vint = vint4; using vmask = vmask4; @@ -134,6 +143,7 @@ #define ASTCENC_SIMD_WIDTH 4 using vfloat = vfloat4; + using vfloatacc = vfloat4; using vint = vint4; using vmask = vmask4; @@ -201,7 +211,7 @@ ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b) { vint ia = float_as_int(a); vint ib = float_as_int(b); - vint 
sign_mask((int)0x80000000); + vint sign_mask(static_cast(0x80000000)); vint r = ia ^ (ib & sign_mask); return int_as_float(r); } @@ -227,7 +237,7 @@ ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x) { vfloat z = atan(abs(y / x)); vmask xmask = vmask(float_as_int(x).m); - return change_sign(select(z, vfloat(astc::PI) - z, xmask), y); + return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y); } /* diff --git a/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h b/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h index 0b0ea869..fe8a1b16 100755 --- a/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_avx2_8.h @@ -164,7 +164,7 @@ struct vint8 */ ASTCENC_SIMD_INLINE explicit vint8(const int *p) { - m = _mm256_loadu_si256((const __m256i*)p); + m = _mm256_loadu_si256(reinterpret_cast(p)); } /** @@ -173,7 +173,7 @@ struct vint8 ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p) { // _mm_loadu_si64 would be nicer syntax, but missing on older GCC - m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*(const long long*)p)); + m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast(p))); } /** @@ -242,7 +242,7 @@ struct vint8 */ static ASTCENC_SIMD_INLINE vint8 loada(const int* p) { - return vint8(_mm256_load_si256((const __m256i*)p)); + return vint8(_mm256_load_si256(reinterpret_cast(p))); } /** @@ -534,7 +534,7 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a) */ ASTCENC_SIMD_INLINE void storea(vint8 a, int* p) { - _mm256_store_si256((__m256i*)p, a.m); + _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m); } /** @@ -542,7 +542,7 @@ ASTCENC_SIMD_INLINE void storea(vint8 a, int* p) */ ASTCENC_SIMD_INLINE void store(vint8 a, int* p) { - _mm256_storeu_si256((__m256i*)p, a.m); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m); } /** @@ -553,7 +553,7 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p) // This is the most logical implementation, but the convenience intrinsic // is missing on older compilers (supported 
in g++ 9 and clang++ 9). // _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0)) - _mm_storel_epi64((__m128i*)p, _mm256_extracti128_si256(a.m, 0)); + _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0)); } /** @@ -586,27 +586,12 @@ ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v) } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Return lanes from @c b if @c cond is set, else @c a. */ ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond) { - // Don't use _mm256_blendv_epi8 directly, as it doesn't give the select on - // float sign-bit in the mask behavior which is useful. Performance is the - // same, these casts are free. - __m256 av = _mm256_castsi256_ps(a.m); - __m256 bv = _mm256_castsi256_ps(b.m); - return vint8(_mm256_castps_si256(_mm256_blendv_ps(av, bv, cond.m))); -} - -/** - * @brief Debug function to print a vector of ints. - */ -ASTCENC_SIMD_INLINE void print(vint8 a) -{ - alignas(ASTCENC_VECALIGN) int v[8]; - storea(a, v); - printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", - v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + __m256i condi = _mm256_castps_si256(cond.m); + return vint8(_mm256_blendv_epi8(a.m, b.m, condi)); } // ============================================================================ @@ -886,28 +871,28 @@ ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a) } /** - * @brief Accumulate the full horizontal sum of a vector. + * @brief Return lanes from @c b if @c cond is set, else @c a. */ -ASTCENC_SIMD_INLINE void haccumulate(float& accum, vfloat8 a) +ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) { - // Two sequential 4-wide accumulates gives invariance with 4-wide code. - // Note that this approach gives higher error in the sum; adding the two - // smaller numbers together first would be more accurate. 
- vfloat4 lo(_mm256_extractf128_ps(a.m, 0)); - haccumulate(accum, lo); + return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); +} - vfloat4 hi(_mm256_extractf128_ps(a.m, 1)); - haccumulate(accum, hi); +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond) +{ + return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); } /** * @brief Accumulate lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. */ ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) { - // Two sequential 4-wide accumulates gives invariance with 4-wide code. - // Note that this approach gives higher error in the sum; adding the two - // smaller numbers together first would be more accurate. vfloat4 lo(_mm256_extractf128_ps(a.m, 0)); haccumulate(accum, lo); @@ -916,19 +901,43 @@ ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) } /** - * @brief Return the sqrt of the lanes in the vector. + * @brief Accumulate lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. */ -ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a) { - return vfloat8(_mm256_sqrt_ps(a.m)); + accum += a; } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. */ -ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m) { - return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Accumulate masked lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. 
+ */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m) +{ + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Return the sqrt of the lanes in the vector. + */ +ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) +{ + return vfloat8(_mm256_sqrt_ps(a.m)); } /** @@ -995,6 +1004,17 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) return vfloat8(_mm256_castsi256_ps(a.m)); } +/** + * @brief Debug function to print a vector of ints. + */ +ASTCENC_SIMD_INLINE void print(vint8 a) +{ + alignas(ASTCENC_VECALIGN) int v[8]; + storea(a, v); + printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", + v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); +} + /** * @brief Debug function to print a vector of floats. */ @@ -1003,8 +1023,18 @@ ASTCENC_SIMD_INLINE void print(vfloat8 a) alignas(ASTCENC_VECALIGN) float v[8]; storea(a, v); printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", - (double)v[0], (double)v[1], (double)v[2], (double)v[3], - (double)v[4], (double)v[5], (double)v[6], (double)v[7]); + static_cast(v[0]), static_cast(v[1]), + static_cast(v[2]), static_cast(v[3]), + static_cast(v[4]), static_cast(v[5]), + static_cast(v[6]), static_cast(v[7])); +} + +/** + * @brief Debug function to print a vector of masks. + */ +ASTCENC_SIMD_INLINE void print(vmask8 a) +{ + print(select(vint8(0), vint8(1), a)); } #endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED diff --git a/libkram/astc-encoder/astcenc_vecmathlib_common_4.h b/libkram/astc-encoder/astcenc_vecmathlib_common_4.h index 50394052..b7d644d8 100755 --- a/libkram/astc-encoder/astcenc_vecmathlib_common_4.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_common_4.h @@ -129,17 +129,6 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } -/** - * @brief Debug function to print a vector of ints. 
- */ -ASTCENC_SIMD_INLINE void print(vint4 a) -{ - alignas(16) int v[4]; - storea(a, v); - printf("v4_i32:\n %8d %8d %8d %8d\n", - v[0], v[1], v[2], v[3]); -} - // ============================================================================ // vfloat4 operators and functions // ============================================================================ @@ -282,19 +271,20 @@ ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a) } /** - * @brief Accumulate the full horizontal sum of a vector. + * @brief Accumulate lane-wise sums for a vector. */ -ASTCENC_SIMD_INLINE void haccumulate(float& accum, vfloat4 a) +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) { - accum += hadd_s(a); + accum = accum + a; } /** - * @brief Accumulate lane-wise sums for a vector. + * @brief Accumulate lane-wise sums for a masked vector. */ -ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) { - accum = accum + a; + a = select(vfloat4::zero(), a, m); + haccumulate(accum, a); } /** @@ -305,6 +295,8 @@ ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } +#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) + /** * @brief Return the dot product for the full 4 lanes, returning scalar. */ @@ -342,12 +334,42 @@ ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) return vfloat4(d3, d3, d3, 0.0f); } +#endif + +#if !defined(ASTCENC_USE_NATIVE_POPCOUNT) + /** - * @brief Generate a reciprocal of a vector. + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. 
+ */ +static inline int popcount(uint64_t v) +{ + uint64_t mask1 = 0x5555555555555555ULL; + uint64_t mask2 = 0x3333333333333333ULL; + uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL; + v -= (v >> 1) & mask1; + v = (v & mask2) + ((v >> 2) & mask2); + v += v >> 4; + v &= mask3; + v *= 0x0101010101010101ULL; + v >>= 56; + return static_cast<int>(v); +} + +#endif + +/** + * @brief Debug function to print a vector of ints. */ -ASTCENC_SIMD_INLINE vfloat4 recip(vfloat4 b) +ASTCENC_SIMD_INLINE void print(vint4 a) { - return 1.0f / b; + alignas(16) int v[4]; + storea(a, v); + printf("v4_i32:\n %8d %8d %8d %8d\n", + v[0], v[1], v[2], v[3]); } /** @@ -358,7 +380,16 @@ ASTCENC_SIMD_INLINE void print(vfloat4 a) alignas(16) float v[4]; storea(a, v); printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", - (double)v[0], (double)v[1], (double)v[2], (double)v[3]); + static_cast<double>(v[0]), static_cast<double>(v[1]), + static_cast<double>(v[2]), static_cast<double>(v[3])); +} + +/** + * @brief Debug function to print a vector of masks. + */ +ASTCENC_SIMD_INLINE void print(vmask4 a) +{ + print(select(vint4(0), vint4(1), a)); } #endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED diff --git a/libkram/astc-encoder/astcenc_vecmathlib_neon_4.h b/libkram/astc-encoder/astcenc_vecmathlib_neon_4.h index 91cc21f3..7ac9da3f 100755 --- a/libkram/astc-encoder/astcenc_vecmathlib_neon_4.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_neon_4.h @@ -614,13 +614,11 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Return lanes from @c b if @c cond is set, else @c a. 
*/ ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond) { - static const uint32x4_t msb = vdupq_n_u32(0x80000000u); - uint32x4_t mask = vcgeq_u32(cond.m, msb); - return vint4(vbslq_s32(mask, b.m, a.m)); + return vint4(vbslq_s32(cond.m, b.m, a.m)); } // ============================================================================ @@ -783,9 +781,17 @@ ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Return lanes from @c b if @c cond is set, else @c a. */ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) +{ + return vfloat4(vbslq_f32(cond.m, b.m, a.m)); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) { static const uint32x4_t msb = vdupq_n_u32(0x80000000u); uint32x4_t mask = vcgeq_u32(cond.m, msb); @@ -918,4 +924,18 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(vreinterpretq_f32_s32(v.m)); } +#define ASTCENC_USE_NATIVE_POPCOUNT 1 + +/** + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. + */ +ASTCENC_SIMD_INLINE int popcount(uint64_t v) +{ + return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v)))); +} + #endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/libkram/astc-encoder/astcenc_vecmathlib_none_4.h b/libkram/astc-encoder/astcenc_vecmathlib_none_4.h index db489bce..5a399ef5 100644 --- a/libkram/astc-encoder/astcenc_vecmathlib_none_4.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_none_4.h @@ -888,7 +888,7 @@ ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Return lanes from @c b if @c cond is set, else @c a. 
*/ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) { @@ -898,6 +898,17 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) (cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]); } +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) +{ + return vfloat4((cond.m[0] & 0x80000000) ? b.m[0] : a.m[0], + (cond.m[1] & 0x80000000) ? b.m[1] : a.m[1], + (cond.m[2] & 0x80000000) ? b.m[2] : a.m[2], + (cond.m[3] & 0x80000000) ? b.m[3] : a.m[3]); +} + /** * @brief Load a vector of gathered results from an array; */ diff --git a/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h b/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h index 89fc4837..868522dc 100755 --- a/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_sse_4.h @@ -206,7 +206,7 @@ struct vint4 */ ASTCENC_SIMD_INLINE explicit vint4(const int *p) { - m = _mm_loadu_si128((const __m128i*)p); + m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); } /** @@ -215,7 +215,7 @@ struct vint4 ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p) { // _mm_loadu_si32 would be nicer syntax, but missing on older GCC - __m128i t = _mm_cvtsi32_si128(*(const int*)p); + __m128i t = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(p)); #if ASTCENC_SSE >= 41 m = _mm_cvtepu8_epi32(t); @@ -270,9 +270,9 @@ struct vint4 m = _mm_insert_epi32(m, a, l); #else alignas(16) int idx[4]; - _mm_store_si128((__m128i*)idx, m); + _mm_store_si128(reinterpret_cast<__m128i*>(idx), m); idx[l] = a; - m = _mm_load_si128((const __m128i*)idx); + m = _mm_load_si128(reinterpret_cast<const __m128i*>(idx)); #endif } @@ -297,7 +297,7 @@ struct vint4 */ static ASTCENC_SIMD_INLINE vint4 loada(const int* p) { - return vint4(_mm_load_si128((const __m128i*)p)); + return vint4(_mm_load_si128(reinterpret_cast<const __m128i*>(p))); } /** @@ -613,7 +613,7 @@ ASTCENC_SIMD_INLINE int hadd_s(vint4 a) */ 
ASTCENC_SIMD_INLINE void storea(vint4 a, int* p) { - 
_mm_store_si128((__m128i*)p, a.m); + _mm_store_si128(reinterpret_cast<__m128i*>(p), a.m); } /** @@ -622,7 +622,7 @@ ASTCENC_SIMD_INLINE void storea(vint4 a, int* p) ASTCENC_SIMD_INLINE void store(vint4 a, int* p) { // Cast due to missing intrinsics - _mm_storeu_ps((float*)p, _mm_castsi128_ps(a.m)); + _mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m)); } /** @@ -631,7 +631,7 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p) ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) { // Cast due to missing intrinsics - _mm_store_ss((float*)p, _mm_castsi128_ps(a.m)); + _mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m)); } /** @@ -664,20 +664,16 @@ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Return lanes from @c b if @c cond is set, else @c a. */ ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond) { + __m128i condi = _mm_castps_si128(cond.m); + #if ASTCENC_SSE >= 41 - // Don't use _mm_blendv_epi8 directly, as it doesn't give the select on - // float sign-bit in the mask behavior which is useful. Performance is the - // same, these casts are free. - __m128 av = _mm_castsi128_ps(a.m); - __m128 bv = _mm_castsi128_ps(b.m); - return vint4(_mm_castps_si128(_mm_blendv_ps(av, bv, cond.m))); + return vint4(_mm_blendv_epi8(a.m, b.m, condi)); #else - __m128i d = _mm_srai_epi32(_mm_castps_si128(cond.m), 31); - return vint4(_mm_or_si128(_mm_and_si128(d, b.m), _mm_andnot_si128(d, a.m))); + return vint4(_mm_or_si128(_mm_and_si128(condi, b.m), _mm_andnot_si128(condi, a.m))); #endif } @@ -863,10 +859,22 @@ ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) } /** - * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + * @brief Return lanes from @c b if @c cond is set, else @c a. 
*/ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) { +#if ASTCENC_SSE >= 41 + return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); +#else + return vfloat4(_mm_or_ps(_mm_and_ps(cond.m, b.m), _mm_andnot_ps(cond.m, a.m))); +#endif +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) +{ #if ASTCENC_SSE >= 41 return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); #else @@ -955,7 +963,7 @@ static inline uint16_t float_to_float16(float a) { #if ASTCENC_F16C >= 1 __m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0); - return (uint16_t)_mm_cvtsi128_si32(f16); + return static_cast<uint16_t>(_mm_cvtsi128_si32(f16)); #else return float_to_sf16(a); #endif @@ -1017,4 +1025,60 @@ ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) return vfloat4(_mm_castsi128_ps(v.m)); } +#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41) + +#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1 + +/** + * @brief Return the dot product for the full 4 lanes, returning scalar. + */ +ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b) +{ + return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0xFF)); +} + +/** + * @brief Return the dot product for the full 4 lanes, returning vector. + */ +ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_dp_ps(a.m, b.m, 0xFF)); +} + +/** + * @brief Return the dot product for the bottom 3 lanes, returning scalar. + */ +ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) +{ + return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0x77)); +} + +/** + * @brief Return the dot product for the bottom 3 lanes, returning vector. + */ +ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_dp_ps(a.m, b.m, 0x77)); +} + +#endif // #if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41) + +#if ASTCENC_POPCNT >= 1 + +#define ASTCENC_USE_NATIVE_POPCOUNT 1 + +/** + * @brief Population bit count. 
+ * + * @param v The value to population count. 
+ * + * @return The number of 1 bits. + */ +ASTCENC_SIMD_INLINE int popcount(uint64_t v) +{ + return static_cast<int>(_mm_popcnt_u64(v)); +} + +#endif // ASTCENC_POPCNT >= 1 + #endif // #ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED diff --git a/libkram/astc-encoder/astcenc_weight_align.cpp b/libkram/astc-encoder/astcenc_weight_align.cpp index a5288357..96eb6ae6 100644 --- a/libkram/astc-encoder/astcenc_weight_align.cpp +++ b/libkram/astc-encoder/astcenc_weight_align.cpp @@ -54,11 +54,11 @@ static constexpr unsigned int SINCOS_STEPS { 64 }; static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0, "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH"); -static unsigned int max_angular_steps_needed_for_quant_level[13]; +static uint8_t max_angular_steps_needed_for_quant_level[13]; // The next-to-last entry is supposed to have the value 33. This because the 32-weight mode leaves a // double-sized hole in the middle of the weight space, so we are better off matching 33 weights. -static const unsigned int quantization_steps_for_level[13] { +static const uint8_t quantization_steps_for_level[13] { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 33, 36 }; @@ -75,7 +75,7 @@ void prepare_angular_tables() unsigned int max_angular_steps_needed_for_quant_steps[ANGULAR_STEPS + 1]; for (unsigned int i = 0; i < ANGULAR_STEPS; i++) { - float angle_step = (float)(i + 1); + float angle_step = static_cast<float>(i + 1); for (unsigned int j = 0; j < SINCOS_STEPS; j++) { @@ -96,13 +96,13 @@ void prepare_angular_tables() * @brief Compute the angular alignment factors and offsets. * * @param weight_count The number of (decimated) weights. - * @param dec_weight_quant_uvalue The decimated and quantized weight values. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. * @param max_angular_steps The maximum number of steps to be tested. * @param[out] offsets The output angular offsets array. 
*/ static void compute_angular_offsets( unsigned int weight_count, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, unsigned int max_angular_steps, float* offsets ) { @@ -115,7 +115,7 @@ static void compute_angular_offsets( for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) { // Add 2^23 and interpreting bits extracts round-to-nearest int - vfloat sample = loada(dec_weight_quant_uvalue + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f); + vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f); vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1)); storea(isample, isamplev + i); } @@ -149,7 +149,7 @@ static void compute_angular_offsets( * forcing samples that should have had one weight value one step up or down. * * @param weight_count The number of (decimated) weights. - * @param dec_weight_quant_uvalue The decimated and quantized weight values. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. * @param max_angular_steps The maximum number of steps to be tested. * @param max_quant_steps The maximum quantization level to be tested. * @param offsets The angular offsets array. @@ -161,7 +161,7 @@ static void compute_angular_offsets( */ static void compute_lowest_and_highest_weight( unsigned int weight_count, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, unsigned int max_angular_steps, unsigned int max_quant_steps, const float* offsets, @@ -188,7 +188,7 @@ static void compute_lowest_and_highest_weight( for (unsigned int j = 0; j < weight_count; ++j) { - vfloat sval = load1(&dec_weight_quant_uvalue[j]) * rcp_stepsize - offset; + vfloat sval = load1(&dec_weight_ideal_value[j]) * rcp_stepsize - offset; vfloat svalrte = round(sval); vfloat diff = sval - svalrte; errval += diff * diff; @@ -237,14 +237,14 @@ static void compute_lowest_and_highest_weight( * @brief The main function for the angular algorithm. 
* * @param weight_count The number of (decimated) weights. - * @param dec_weight_quant_uvalue The decimated and quantized weight value. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. * @param max_quant_level The maximum quantization level to be tested. * @param[out] low_value Per angular step, the lowest weight value. * @param[out] high_value Per angular step, the highest weight value. */ static void compute_angular_endpoints_for_quant_levels( unsigned int weight_count, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, unsigned int max_quant_level, float low_value[12], float high_value[12] @@ -253,7 +253,7 @@ static void compute_angular_endpoints_for_quant_levels( alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; unsigned int max_angular_steps = max_angular_steps_needed_for_quant_level[max_quant_level]; - compute_angular_offsets(weight_count, dec_weight_quant_uvalue, + compute_angular_offsets(weight_count, dec_weight_ideal_value, max_angular_steps, angular_offsets); alignas(ASTCENC_VECALIGN) int32_t lowest_weight[ANGULAR_STEPS]; @@ -262,7 +262,7 @@ static void compute_angular_endpoints_for_quant_levels( alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS]; alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS]; - compute_lowest_and_highest_weight(weight_count, dec_weight_quant_uvalue, + compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, max_angular_steps, max_quant_steps, angular_offsets, lowest_weight, weight_span, error, cut_low_weight_error, cut_high_weight_error); @@ -285,31 +285,34 @@ static void compute_angular_endpoints_for_quant_levels( promise(max_angular_steps > 0); for (unsigned int i = 0; i < max_angular_steps; i++) { + float i_flt = static_cast<float>(i); + int idx_span = weight_span[i]; + float error_cut_low = error[i] + cut_low_weight_error[i]; float error_cut_high = error[i] + cut_high_weight_error[i]; float error_cut_low_high = 
error[i] + cut_low_weight_error[i] + cut_high_weight_error[i]; // Check best error against record N vfloat4 best_result = best_results[idx_span]; - vfloat4 new_result = vfloat4(error[i], (float)i, 0.0f, 0.0f); + vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f); vmask4 mask1(best_result.lane<0>() > error[i]); best_results[idx_span] = select(best_result, new_result, mask1); // Check best error against record N-1 with either cut low or cut high best_result = best_results[idx_span - 1]; - new_result = vfloat4(error_cut_low, (float)i, 1.0f, 0.0f); + new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f); vmask4 mask2(best_result.lane<0>() > error_cut_low); best_result = select(best_result, new_result, mask2); - new_result = vfloat4(error_cut_high, (float)i, 0.0f, 0.0f); + new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f); vmask4 mask3(best_result.lane<0>() > error_cut_high); best_results[idx_span - 1] = select(best_result, new_result, mask3); // Check best error against record N-2 with both cut low and high best_result = best_results[idx_span - 2]; - new_result = vfloat4(error_cut_low_high, (float)i, 1.0f, 0.0f); + new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f); vmask4 mask4(best_result.lane<0>() > error_cut_low_high); best_results[idx_span - 2] = select(best_result, new_result, mask4); } @@ -317,7 +320,7 @@ static void compute_angular_endpoints_for_quant_levels( for (unsigned int i = 0; i <= max_quant_level; i++) { unsigned int q = quantization_steps_for_level[i]; - int bsi = (int)best_results[q].lane<1>(); + int bsi = static_cast<int>(best_results[q].lane<1>()); // Did we find anything? 
#if defined(ASTCENC_DIAGNOSTICS) @@ -330,8 +333,8 @@ static void compute_angular_endpoints_for_quant_levels( bsi = astc::max(0, bsi); - float stepsize = 1.0f / (1.0f + (float)bsi); - int lwi = lowest_weight[bsi] + (int)best_results[q].lane<2>(); + float stepsize = 1.0f / (1.0f + static_cast<float>(bsi)); + int lwi = lowest_weight[bsi] + static_cast<int>(best_results[q].lane<2>()); int hwi = lwi + q - 1; float offset = angular_offsets[bsi] * stepsize; @@ -416,14 +419,14 @@ static void compute_lowest_and_highest_weight_lwc( * @brief The main function for the angular algorithm, variant for low weight count. * * @param weight_count The number of (decimated) weights. - * @param dec_weight_quant_uvalue The decimated and quantized weight value. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. * @param max_quant_level The maximum quantization level to be tested. * @param[out] low_value Per angular step, the lowest weight value. * @param[out] high_value Per angular step, the highest weight value. */ static void compute_angular_endpoints_for_quant_levels_lwc( unsigned int weight_count, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, unsigned int max_quant_level, float low_value[12], float high_value[12] @@ -436,26 +439,24 @@ static void compute_angular_endpoints_for_quant_levels_lwc( alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; - compute_angular_offsets(weight_count, dec_weight_quant_uvalue, + compute_angular_offsets(weight_count, dec_weight_ideal_value, max_angular_steps, angular_offsets); - compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_quant_uvalue, + compute_lowest_and_highest_weight_lwc(weight_count, dec_weight_ideal_value, max_angular_steps, max_quant_steps, angular_offsets, lowest_weight, weight_span, error); // For each quantization level, find the best error terms. 
Use packed vectors so data-dependent // branches can become selects. This involves some integer to float casts, but the values are // small enough so they never round the wrong way. - float best_error[ANGULAR_STEPS]; - int best_index[ANGULAR_STEPS]; + vfloat4 best_results[ANGULAR_STEPS]; // Initialize the array to some safe defaults promise(max_quant_steps > 0); for (unsigned int i = 0; i < (max_quant_steps + 4); i++) { - best_error[i] = ERROR_CALC_DEFAULT; - best_index[i] = -1; + best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f); } promise(max_angular_steps > 0); @@ -464,18 +465,16 @@ static void compute_angular_endpoints_for_quant_levels_lwc( int idx_span = weight_span[i]; // Check best error against record N - float current_best = best_error[idx_span]; - if (error[i] < current_best) - { - best_error[idx_span] = error[i]; - best_index[idx_span] = i; - } + vfloat4 current_best = best_results[idx_span]; + vfloat4 candidate = vfloat4(error[i], static_cast<float>(i), 0.0f, 0.0f); + vmask4 mask(current_best.lane<0>() > error[i]); + best_results[idx_span] = select(current_best, candidate, mask); } for (unsigned int i = 0; i <= max_quant_level; i++) { unsigned int q = quantization_steps_for_level[i]; - int bsi = best_index[q]; + int bsi = static_cast<int>(best_results[q].lane<1>()); // Did we find anything? 
#if defined(ASTCENC_DIAGNOSTICS) @@ -491,8 +490,8 @@ static void compute_angular_endpoints_for_quant_levels_lwc( int lwi = lowest_weight[bsi]; int hwi = lwi + q - 1; - low_value[i] = (angular_offsets[bsi] + static_cast<float>(lwi)) / (1.0f + (float)bsi); - high_value[i] = (angular_offsets[bsi] + static_cast<float>(hwi)) / (1.0f + (float)bsi); + low_value[i] = (angular_offsets[bsi] + static_cast<float>(lwi)) / (1.0f + static_cast<float>(bsi)); + high_value[i] = (angular_offsets[bsi] + static_cast<float>(hwi)) / (1.0f + static_cast<float>(bsi)); } } @@ -501,7 +500,7 @@ void compute_angular_endpoints_1plane( unsigned int tune_low_weight_limit, bool only_always, const block_size_descriptor& bsd, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, compression_working_buffers& tmpbuf ) { float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1; @@ -510,13 +509,13 @@ void compute_angular_endpoints_1plane( float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1; float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1; - unsigned int max_decimation_modes = only_always ? bsd.always_decimation_mode_count - : bsd.decimation_mode_count; + unsigned int max_decimation_modes = only_always ? 
bsd.decimation_mode_count_always + : bsd.decimation_mode_count_selected; promise(max_decimation_modes > 0); for (unsigned int i = 0; i < max_decimation_modes; i++) { const decimation_mode& dm = bsd.decimation_modes[i]; - if (dm.maxprec_1plane < 0 || !dm.percentile_hit) + if (!dm.ref_1_plane) { continue; } @@ -527,28 +526,25 @@ void compute_angular_endpoints_1plane( { compute_angular_endpoints_for_quant_levels_lwc( weight_count, - dec_weight_quant_uvalue + i * BLOCK_MAX_WEIGHTS, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, dm.maxprec_1plane, low_values[i], high_values[i]); } else { compute_angular_endpoints_for_quant_levels( weight_count, - dec_weight_quant_uvalue + i * BLOCK_MAX_WEIGHTS, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, dm.maxprec_1plane, low_values[i], high_values[i]); } } - unsigned int max_block_modes = only_always ? bsd.always_block_mode_count - : bsd.block_mode_count; + unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always + : bsd.block_mode_count_1plane_selected; promise(max_block_modes > 0); for (unsigned int i = 0; i < max_block_modes; ++i) { const block_mode& bm = bsd.block_modes[i]; - if (bm.is_dual_plane || !bm.percentile_hit) - { - continue; - } + assert(!bm.is_dual_plane); unsigned int quant_mode = bm.quant_mode; unsigned int decim_mode = bm.decimation_mode; @@ -562,7 +558,7 @@ void compute_angular_endpoints_1plane( void compute_angular_endpoints_2planes( unsigned int tune_low_weight_limit, const block_size_descriptor& bsd, - const float* dec_weight_quant_uvalue, + const float* dec_weight_ideal_value, compression_working_buffers& tmpbuf ) { float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1; @@ -575,11 +571,11 @@ void compute_angular_endpoints_2planes( float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values2; float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values2; - promise(bsd.decimation_mode_count > 0); - for (unsigned int i = 
0; i < bsd.decimation_mode_count; i++) + promise(bsd.decimation_mode_count_selected > 0); + for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) { const decimation_mode& dm = bsd.decimation_modes[i]; - if (dm.maxprec_2planes < 0 || !dm.percentile_hit) + if (!dm.ref_2_planes) { continue; } @@ -590,37 +586,33 @@ void compute_angular_endpoints_2planes( { compute_angular_endpoints_for_quant_levels_lwc( weight_count, - dec_weight_quant_uvalue + i * BLOCK_MAX_WEIGHTS, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, dm.maxprec_2planes, low_values1[i], high_values1[i]); compute_angular_endpoints_for_quant_levels_lwc( weight_count, - dec_weight_quant_uvalue + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, dm.maxprec_2planes, low_values2[i], high_values2[i]); } else { compute_angular_endpoints_for_quant_levels( weight_count, - dec_weight_quant_uvalue + i * BLOCK_MAX_WEIGHTS, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, dm.maxprec_2planes, low_values1[i], high_values1[i]); compute_angular_endpoints_for_quant_levels( weight_count, - dec_weight_quant_uvalue + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, dm.maxprec_2planes, low_values2[i], high_values2[i]); } } - promise(bsd.block_mode_count > 0); - for (unsigned int i = 0; i < bsd.block_mode_count; ++i) + unsigned int start = bsd.block_mode_count_1plane_selected; + unsigned int end = bsd.block_mode_count_1plane_2plane_selected; + for (unsigned int i = start; i < end; i++) { const block_mode& bm = bsd.block_modes[i]; - if (!bm.is_dual_plane || !bm.percentile_hit) - { - continue; - } - unsigned int quant_mode = bm.quant_mode; unsigned int decim_mode = bm.decimation_mode; From 768d9439cd15f3cd835f4bf97a0e75a655ab3702 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Apr 2022 22:18:32 -0700 Subject: [PATCH 008/615] kramv - fix the size of the scroll 
view, and stop hiding the hud Can now see the hud response for actions, and load/save failure messages. Suppress the eyedropper text during next/prevItem since the eyedropper text is stale and overlaps. --- kramv/Base.lproj/Main.storyboard | 10 ++-- kramv/KramViewerMain.mm | 79 ++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/kramv/Base.lproj/Main.storyboard b/kramv/Base.lproj/Main.storyboard index fa4d3926..1d6caae7 100644 --- a/kramv/Base.lproj/Main.storyboard +++ b/kramv/Base.lproj/Main.storyboard @@ -174,15 +174,15 @@ - - - + + + - + - + diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index b372e5b6..522d0424 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -39,6 +39,19 @@ using namespace kram; using namespace NAMESPACE_STL; +bool isSupportedModelFilename(const char* filename) { +#if USE_GLTF + return endsWithExtension(filename, ".gltf") || + endsWithExtension(filename, ".glb"); +#else + return false; +#endif +} + +bool isSupportedArchiveFilename(const char* filename) { + return endsWithExtension(filename, ".zip"); +} + struct MouseData { NSPoint originPoint; @@ -533,6 +546,13 @@ - (void)awakeFromNib { [super awakeFromNib]; + // vertical offset of table down so hud can display info + NSScrollView* scrollView = [_tableView enclosingScrollView]; + CGRect rect = scrollView.frame; + rect.origin.y += 50; + scrollView.frame = rect; + + // TODO: see if can only open this // NSLog(@"AwakeFromNIB"); } @@ -597,8 +617,6 @@ - (instancetype)initWithCoder:(NSCoder *)coder _hudLabel = [self _addHud:NO]; [self setHudText:""]; - - return self; } @@ -2030,6 +2048,10 @@ - (void)updateHudVisibility { _hudLabel2.hidden = _hudHidden || !_showSettings->isHudShown; } +- (void)clearHud { + +} + - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyDown { // Some data depends on the texture data (isSigned, isNormal, ..) 
@@ -2389,8 +2411,9 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD if (_showSettings->isArchive) { if ([self advanceFileFromAchive:!isShiftKeyDown]) { - _hudHidden = true; - [self updateHudVisibility]; + //_hudHidden = true; + //[self updateHudVisibility]; + [self setEyedropperText:""]; isChanged = true; text = "Loaded " + _showSettings->lastFilename; @@ -2398,9 +2421,10 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD } else if (_showSettings->isFolder) { if ([self advanceFileFromFolder:!isShiftKeyDown]) { - _hudHidden = true; - [self updateHudVisibility]; - + //_hudHidden = true; + //[self updateHudVisibility]; + [self setEyedropperText:""]; + isChanged = true; text = "Loaded " + _showSettings->lastFilename; } @@ -2677,8 +2701,8 @@ - (BOOL)advanceFileFromAchive:(BOOL)increment [self showFileTable]; // also have to hide hud or it will obscure the visible table - _hudHidden = true; - [self updateHudVisibility]; + //_hudHidden = true; + //[self updateHudVisibility]; return [self loadFileFromArchive]; } @@ -2708,8 +2732,8 @@ - (BOOL)advanceFileFromFolder:(BOOL)increment // show the files table [self showFileTable]; - _hudHidden = true; - [self updateHudVisibility]; + //_hudHidden = true; + //[self updateHudVisibility]; return [self loadFileFromFolder]; } @@ -2795,9 +2819,7 @@ - (BOOL)loadFileFromFolder string fullFilename = filename; auto timestamp = FileHelper::modificationTimestamp(filename); - bool isModel = - endsWithExtension(filename, ".gltf") || - endsWithExtension(filename, ".gtb"); + bool isModel = isSupportedModelFilename(filename); if (isModel) return [self loadModelFile:nil filename:filename]; @@ -2901,9 +2923,7 @@ - (BOOL)loadFileFromArchive string fullFilename = filename; double timestamp = (double)entry.modificationDate; - bool isModel = - endsWithExtension(filename, ".gltf") || - endsWithExtension(filename, ".gtb"); + bool isModel = isSupportedModelFilename(filename); if (isModel) 
return [self loadModelFile:nil filename:filename]; @@ -3059,9 +3079,8 @@ - (BOOL)loadTextureFromURL:(NSURL *)url while (NSURL* fileOrDirectoryURL = [directoryEnumerator nextObject]) { const char* name = fileOrDirectoryURL.fileSystemRepresentation; - bool isGLTF = endsWithExtension(name, ".gltf"); - bool isGLB = endsWithExtension(name, ".glb"); - if (isGLTF || isGLB) + bool isModel = isSupportedModelFilename(filename); + if (isModel) { files.push_back(name); } @@ -3201,16 +3220,9 @@ - (BOOL)loadTextureFromURL:(NSURL *)url } // file is not a supported extension - if (!( - // archive - endsWithExtension(filename, ".zip") || - - // images + if (!(isSupportedArchiveFilename(filename) || isSupportedFilename(filename) || - - // models - endsWithExtension(filename, ".gltf") || - endsWithExtension(filename, ".glb") + isSupportedModelFilename(filename) )) { string errorText = @@ -3225,8 +3237,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url return NO; } - if (endsWithExtension(filename, ".gltf") || - endsWithExtension(filename, ".glb")) + if (isSupportedModelFilename(filename)) { return [self loadModelFile:url filename:nullptr]; } @@ -3237,7 +3248,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url //------------------- - if (endsWithExtension(filename, ".zip")) { + if (isSupportedArchiveFilename(filename)) { auto archiveTimestamp = FileHelper::modificationTimestamp(filename); if (!self.imageURL || (!([self.imageURL isEqualTo:url])) || @@ -3326,7 +3337,7 @@ -(BOOL)loadModelFile:(NSURL*)url filename:(const char*)filename // save out a scene with all of them in a single scene. But that should // probably reference original content in case it's updated. 
- Renderer *renderer = (Renderer *)self.delegate; + Renderer* renderer = (Renderer *)self.delegate; [renderer releaseAllPendingTextures]; setErrorLogCapture(true); @@ -3599,7 +3610,7 @@ - (void)viewDidLoad [_view addNotifications]; [_view setupUI]; - + // original sample code was sending down _view.bounds.size, but need // drawableSize this was causing all sorts of inconsistencies [_renderer mtkView:_view drawableSizeWillChange:_view.drawableSize]; From ad8a608a14c999322e7072fcc44af22425aa94ad Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Apr 2022 23:12:01 -0700 Subject: [PATCH 009/615] kramv - more hud reduction --- kramv/KramViewerMain.mm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 522d0424..9db27f37 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2048,10 +2048,6 @@ - (void)updateHudVisibility { _hudLabel2.hidden = _hudHidden || !_showSettings->isHudShown; } -- (void)clearHud { - -} - - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyDown { // Some data depends on the texture data (isSigned, isNormal, ..) @@ -2703,6 +2699,7 @@ - (BOOL)advanceFileFromAchive:(BOOL)increment // also have to hide hud or it will obscure the visible table //_hudHidden = true; //[self updateHudVisibility]; + [self setEyedropperText:""]; return [self loadFileFromArchive]; } @@ -2734,6 +2731,7 @@ - (BOOL)advanceFileFromFolder:(BOOL)increment //_hudHidden = true; //[self updateHudVisibility]; + [self setEyedropperText:""]; return [self loadFileFromFolder]; } From b3a62fcfde9d78fb09e0d284c95a802534a6d572 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 1 May 2022 09:25:25 -0700 Subject: [PATCH 010/615] kram - reorder hud vs. 
eydropper text, hide eyedropper text when files view is up --- kramv/KramViewerMain.mm | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 9db27f37..e7f0d9c2 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -1668,16 +1668,22 @@ - (void)showEyedropperData:(float2)uv // TODO: Stuff these on clipboard with a click, or use cmd+C? } +enum TextSlot +{ + kTextSlotHud, + kTextSlotEyedropper +}; + - (void)setEyedropperText:(const char *)text { - _textSlots[0] = text; + _textSlots[kTextSlotEyedropper] = text; [self updateHudText]; } - (void)setHudText:(const char *)text { - _textSlots[1] = text; + _textSlots[kTextSlotHud] = text; [self updateHudText]; } @@ -1685,7 +1691,11 @@ - (void)setHudText:(const char *)text - (void)updateHudText { // combine textSlots - string text = _textSlots[0] + _textSlots[1]; + string text = _textSlots[kTextSlotHud]; + + // don't show eyedropper text with table up, it's many lines and overlaps + if (!_tableView.hidden) + text += _textSlots[kTextSlotEyedropper]; NSString *textNS = [NSString stringWithUTF8String:text.c_str()]; _hudLabel2.stringValue = textNS; From 131aeeb54b1aa1cd291b448e94cad3afd1c84269 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 1 May 2022 23:38:09 -0700 Subject: [PATCH 011/615] kramv - fix newlines on hud --- kramv/KramViewerBase.cpp | 3 ++- kramv/KramViewerMain.mm | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/kramv/KramViewerBase.cpp b/kramv/KramViewerBase.cpp index e283be66..bd8b2935 100644 --- a/kramv/KramViewerBase.cpp +++ b/kramv/KramViewerBase.cpp @@ -7,7 +7,8 @@ using namespace NAMESPACE_STL; int32_t ShowSettings::totalChunks() const { int32_t one = 1; - return std::max(one, faceCount) * std::max(one, arrayCount) * + return std::max(one, faceCount) * + std::max(one, arrayCount) * std::max(one, sliceCount); } diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm 
index e7f0d9c2..3380edd8 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -1692,7 +1692,9 @@ - (void)updateHudText { // combine textSlots string text = _textSlots[kTextSlotHud]; - + if (!text.empty() && text.back() != '\n') + text += "\n"; + // don't show eyedropper text with table up, it's many lines and overlaps if (!_tableView.hidden) text += _textSlots[kTextSlotEyedropper]; @@ -3234,7 +3236,11 @@ - (BOOL)loadTextureFromURL:(NSURL *)url )) { string errorText = - "Unsupported file extension, must be .zip, .png, .ktx, .ktx2, .dds, .gltf, .glb\n"; + "Unsupported file extension, must be .zip" +#if USE_GLTF + ", .gltf, .glb" +#endif + ", .png, .ktx, .ktx2, .dds\n"; string finalErrorText; append_sprintf(finalErrorText, "Could not load from file:\n %s\n", From e7c72d9b325dc9e1983b337dd02452e121ae1eea Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 6 May 2022 19:11:22 -0700 Subject: [PATCH 012/615] kramv - add uvPreview feature. This has no UI to enable it, but can unfold the shape UI back the textures they are using. Overlays the lines of the triangle onto them. Added concept of passes, since would like to separate debug overlay from texture render. Could use a lower bit-depth drawable then. 
--- kramv/KramRenderer.mm | 67 +++++++++++++++++++++++++++- kramv/KramViewerBase.h | 3 ++ kramv/Shaders/KramShaders.h | 15 +++++-- kramv/Shaders/KramShaders.metal | 78 +++++++++++++++++++++++++-------- 4 files changed, 140 insertions(+), 23 deletions(-) diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 260cbf6a..75a692c6 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -1545,6 +1545,30 @@ - (void)_updateGameState float4x4 panTransform = matrix4x4_translation(-_showSettings->panX, _showSettings->panY, 0.0); + // interpolate this, also need to draw wireframe + // this is an animated effect, that overlays the shape uv wires over the image + // but it needs to set needsDisplay until animation finishes + static float delta = 1.0 / 60.0; + + // hack to see uvPreview + //_showSettings->isUVPreview = true; + + if (_showSettings->is3DView && _showSettings->isUVPreview) { + uniforms.uvPreview += delta; + + if (uniforms.uvPreview > 1.0) { + delta = -1.0 / 60.0; + uniforms.uvPreview = 1.0; + } + else if (uniforms.uvPreview < 0.0) { + delta = 1.0 / 60.0; + uniforms.uvPreview = 0.0; + } + } + else { + uniforms.uvPreview = 0.0; + } + // scale float zoom = _showSettings->zoom; @@ -1945,7 +1969,8 @@ - (void)drawMain:(id)commandBuffer UniformsLevel uniformsLevel; uniformsLevel.drawOffset = float2m(0.0f); - + uniformsLevel.passNumber = kPassDefault; + if (_showSettings->isPreview) { // upload this on each face drawn, since want to be able to draw all // mips/levels at once @@ -2068,6 +2093,7 @@ - (void)drawMain:(id)commandBuffer // mips on on screen faces and arrays and slices go across in a row, and // mips are displayed down from each of those in a column + for (MTKSubmesh* submesh in _mesh.submeshes) { [renderEncoder drawIndexedPrimitives:submesh.primitiveType indexCount:submesh.indexCount @@ -2075,6 +2101,45 @@ - (void)drawMain:(id)commandBuffer indexBuffer:submesh.indexBuffer.buffer indexBufferOffset:submesh.indexBuffer.offset]; } + + // Draw uv 
wire overlay + if (_showSettings->isUVPreview) { + // need to force color in shader or it's still sampling texture + // also need to add z offset + + [renderEncoder setTriangleFillMode:MTLTriangleFillModeLines]; + + // only applies to tris, not points/lines, pushes depth away (towards 0), after clip + // affects reads/tests and writes. Could also add in vertex shader. + // depthBias * 2^(exp(max abs(z) in primitive) - r) + slopeScale * maxSlope + [renderEncoder setDepthBias:0.015 slopeScale:3.0 clamp: 0.02]; + + uniformsLevel.passNumber = kPassUVPreview; + + [renderEncoder setVertexBytes:&uniformsLevel + length:sizeof(uniformsLevel) + atIndex:BufferIndexUniformsLevel]; + + [renderEncoder setFragmentBytes:&uniformsLevel + length:sizeof(uniformsLevel) + atIndex:BufferIndexUniformsLevel]; + + for (MTKSubmesh* submesh in _mesh.submeshes) { + [renderEncoder drawIndexedPrimitives:submesh.primitiveType + indexCount:submesh.indexCount + indexType:submesh.indexType + indexBuffer:submesh.indexBuffer.buffer + indexBufferOffset:submesh.indexBuffer.offset]; + } + + uniformsLevel.passNumber = kPassDefault; + + // restore state, even though this isn't a true state shadow + [renderEncoder setDepthBias:0.0 slopeScale:0.0 clamp:0.0]; + + [renderEncoder setTriangleFillMode:MTLTriangleFillModeFill]; + + } } } diff --git a/kramv/KramViewerBase.h b/kramv/KramViewerBase.h index a235ba93..9afe084c 100644 --- a/kramv/KramViewerBase.h +++ b/kramv/KramViewerBase.h @@ -131,6 +131,9 @@ class ShowSettings { // this mode shows the content with lighting or with bilinear/mips active bool isPreview = false; + // Can collapse 3d to 2d and overlay the uv + bool isUVPreview = false; + // the 2d view doesn't want to inset pixels for clamp, or point sampling is // thrown off expecially on small 4x4 textures #if USE_PERSPECTIVE diff --git a/kramv/Shaders/KramShaders.h b/kramv/Shaders/KramShaders.h index 39e90a6e..6e337c20 100644 --- a/kramv/Shaders/KramShaders.h +++ b/kramv/Shaders/KramShaders.h @@ 
-113,7 +113,8 @@ struct Uniforms { simd::float4x4 modelMatrix; simd::float4 modelMatrixInvScale2; // to supply inverse, w is determinant simd::float3 cameraPosition; // world-space - + float uvPreview; + bool isSigned; bool isNormal; bool isSwizzleAGToRG; @@ -122,8 +123,9 @@ struct Uniforms { bool isCheckerboardShown; bool isWrap; bool isSDF; - bool isPreview; - + bool isPreview; // render w/lighting, normals, etc + bool isUVPreview; // show uv overlay + bool is3DView; bool isNormalMapPreview; // for isNormal or combined @@ -156,6 +158,12 @@ struct Uniforms { ShaderLightingMode lightingMode; }; +enum PassNumber +{ + kPassDefault = 0, + kPassUVPreview = 1 +}; + // uploaded separately, so multiple mips, faces, array can be drawn to the // screen at one time although modelMatrix offset changes. Could store offset // in here. @@ -165,6 +173,7 @@ struct UniformsLevel { uint32_t arrayOrSlice; simd::float2 drawOffset; // pixel offset to apply simd::float4 textureSize; // width, height, 1/width, 1/height + uint32_t passNumber; // switch to enum }; // This is all tied to a single level sample diff --git a/kramv/Shaders/KramShaders.metal b/kramv/Shaders/KramShaders.metal index 16fc81c9..2e32b748 100644 --- a/kramv/Shaders/KramShaders.metal +++ b/kramv/Shaders/KramShaders.metal @@ -6,6 +6,8 @@ using namespace metal; + + //--------------------------------- // helpers @@ -565,6 +567,25 @@ struct ColorInOut half4 tangent; }; +void doUVPreview( + thread float3& position, + thread float3& normal, + thread float4& tangent, + float2 texCoord, + float uvPreview +) +{ + // convert [0,1] to [-1,1] plane + float3 uv(toSnorm(texCoord), 0.0); + uv.y *= -1; + uv.xy *= 0.5; // shrink it + position.xyz = mix(position.xyz, uv.xyz, uvPreview); + + // interpolate norma and tangent too + normal = mix(normal.xyz, float3(0,0,1), uvPreview); + tangent = mix(tangent, float4(1,0,0,1), uvPreview); +} + ColorInOut DrawImageFunc( Vertex in [[stage_in]], constant Uniforms& uniforms, @@ -573,11 +594,6 @@ 
ColorInOut DrawImageFunc( { ColorInOut out; - float4 position = in.position; - //position.xy += uniformsLevel.drawOffset; - - float4 worldPos = uniforms.modelMatrix * position; - // deal with full basis bool needsWorldBasis = @@ -588,20 +604,34 @@ ColorInOut DrawImageFunc( uniforms.shapeChannel == ShaderShapeChannel::ShShapeChannelNormal || uniforms.shapeChannel == ShaderShapeChannel::ShShapeChannelBitangent; + float4 position = in.position; + //position.xy += uniformsLevel.drawOffset; + float3 normal = in.normal; + float4 tangent = in.tangent; + + // interpolate position to uv plane coordinates (will flatten the shape + if (uniforms.uvPreview > 0.0) { + float3 pos = position.xyz; + doUVPreview(pos, normal, tangent, in.texCoord, uniforms.uvPreview); + position.xyz = pos; + } + + float4 worldPos = uniforms.modelMatrix * position; + if (needsWorldBasis) { - float3 normal = in.normal; - float3 tangent = in.tangent.xyz; - transformBasis(normal, tangent, uniforms.modelMatrix, uniforms.modelMatrixInvScale2.xyz, uniforms.useTangent); + float3 t = tangent.xyz; + transformBasis(normal, t, uniforms.modelMatrix, uniforms.modelMatrixInvScale2.xyz, uniforms.useTangent); + tangent.xyz = t; out.normal = toHalf(normal); // may be invalid if useTangent is false - out.tangent.xyz = toHalf(tangent); - out.tangent.w = toHalf(in.tangent.w); + out.tangent.xyz = toHalf(tangent.xyz); + out.tangent.w = toHalf(tangent.w); } else { - out.normal = toHalf(in.normal); - out.tangent = toHalf(in.tangent); + out.normal = toHalf(normal); + out.tangent = toHalf(tangent); } // try adding pixel offset to pixel values worldPos.xy += uniformsLevel.drawOffset; @@ -889,7 +919,8 @@ float4 DrawPixels( constant Uniforms& uniforms, float4 c, float4 nmap, - float2 textureSize + float2 textureSize, + uint passNumber ) { // auto-swizzle BC4 and EAC_R11 to rrr1 @@ -1256,6 +1287,15 @@ float4 DrawPixels( } } + // draw grayscale at alpha value + if (passNumber == kPassUVPreview) { + // always want to draw lines, 
even in low alpha + if (c.a < 0.1) + c = float4(0.1); + else + c = c.a; + } + return c; } @@ -1277,7 +1317,7 @@ fragment float4 Draw1DArrayPS( // colorMap.get_num_mip_levels(); float4 n = float4(0,0,1,1); - return DrawPixels(in, facing, uniforms, c, n, textureSize); + return DrawPixels(in, facing, uniforms, c, n, textureSize, uniformsLevel.passNumber); } fragment float4 DrawImagePS( @@ -1298,7 +1338,7 @@ fragment float4 DrawImagePS( float2 textureSize = float2(colorMap.get_width(lod), colorMap.get_height(lod)); // colorMap.get_num_mip_levels(); - return DrawPixels(in, facing, uniforms, c, n, textureSize); + return DrawPixels(in, facing, uniforms, c, n, textureSize, uniformsLevel.passNumber); } fragment float4 DrawImageArrayPS( @@ -1319,7 +1359,7 @@ fragment float4 DrawImageArrayPS( float2 textureSize = float2(colorMap.get_width(lod), colorMap.get_height(lod)); // colorMap.get_num_mip_levels(); - return DrawPixels(in, facing, uniforms, c, n, textureSize); + return DrawPixels(in, facing, uniforms, c, n, textureSize, uniformsLevel.passNumber); } @@ -1341,7 +1381,7 @@ fragment float4 DrawCubePS( // colorMap.get_num_mip_levels(); float4 n = float4(0,0,1,1); - return DrawPixels(in, facing, uniforms, c, n, textureSize); + return DrawPixels(in, facing, uniforms, c, n, textureSize, uniformsLevel.passNumber); } fragment float4 DrawCubeArrayPS( @@ -1362,7 +1402,7 @@ fragment float4 DrawCubeArrayPS( // colorMap.get_num_mip_levels(); float4 n = float4(0,0,1,1); - return DrawPixels(in, facing, uniforms, c, n, textureSize); + return DrawPixels(in, facing, uniforms, c, n, textureSize, uniformsLevel.passNumber); } @@ -1394,7 +1434,7 @@ fragment float4 DrawVolumePS( // colorMap.get_num_mip_levels(); float4 n = float4(0,0,1,1); - return DrawPixels(in, facing, uniforms, c, n, textureSize); + return DrawPixels(in, facing, uniforms, c, n, textureSize, uniformsLevel.passNumber); } //-------------------------------------------------- From 7e3c282cd2f065b8a5e514cb65f5df67019a6031 Mon Sep 
17 00:00:00 2001 From: Alec Miller Date: Fri, 6 May 2022 21:01:10 -0700 Subject: [PATCH 013/615] kramv - more fixes to uvPreview --- build2/kramv.xcodeproj/project.pbxproj | 6 +++--- kramv/KramRenderer.mm | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index e98743c5..cda84c46 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -218,12 +218,12 @@ isa = PBXGroup; children = ( 706EF22A26D17A81001C950E /* KramViewerBase.h */, - 706EF22C26D17A81001C950E /* KramViewerMain.mm */, 706EF22F26D17A81001C950E /* KramViewerBase.cpp */, - 706EF23026D17A81001C950E /* KramLoader.mm */, + 706EF22C26D17A81001C950E /* KramViewerMain.mm */, 706EF23226D17A81001C950E /* KramRenderer.h */, - 706EF23526D17A81001C950E /* KramLoader.h */, 706EF23726D17A81001C950E /* KramRenderer.mm */, + 706EF23526D17A81001C950E /* KramLoader.h */, + 706EF23026D17A81001C950E /* KramLoader.mm */, 706EF22B26D17A81001C950E /* kramv.entitlements */, 706EF23126D17A81001C950E /* Assets.xcassets */, 706EF23326D17A81001C950E /* Main.storyboard */, diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 75a692c6..58c920ee 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -1548,26 +1548,30 @@ - (void)_updateGameState // interpolate this, also need to draw wireframe // this is an animated effect, that overlays the shape uv wires over the image // but it needs to set needsDisplay until animation finishes + + // TODO: need to reset these when shape changes static float delta = 1.0 / 60.0; + static float uvPreviewAmount = 0.0; // hack to see uvPreview - //_showSettings->isUVPreview = true; + _showSettings->isUVPreview = true; if (_showSettings->is3DView && _showSettings->isUVPreview) { - uniforms.uvPreview += delta; + uvPreviewAmount += delta; - if (uniforms.uvPreview > 1.0) { + if (uvPreviewAmount > 1.0) { delta = -1.0 / 60.0; 
- uniforms.uvPreview = 1.0; + uvPreviewAmount = 1.0; } - else if (uniforms.uvPreview < 0.0) { + else if (uvPreviewAmount < 0.0) { delta = 1.0 / 60.0; - uniforms.uvPreview = 0.0; + uvPreviewAmount = 0.0; } } else { - uniforms.uvPreview = 0.0; + uvPreviewAmount = 0.0; } + uniforms.uvPreview = uvPreviewAmount; // scale float zoom = _showSettings->zoom; @@ -2103,7 +2107,7 @@ - (void)drawMain:(id)commandBuffer } // Draw uv wire overlay - if (_showSettings->isUVPreview) { + if (_showSettings->is3DView && _showSettings->isUVPreview) { // need to force color in shader or it's still sampling texture // also need to add z offset From 57ed9878f1e4a55ad6f66b22a1a580cffa5ce6cf Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 6 May 2022 21:04:22 -0700 Subject: [PATCH 014/615] kramv - disable uvPreview for now --- kramv/KramRenderer.mm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 58c920ee..4a54446a 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -1554,7 +1554,7 @@ - (void)_updateGameState static float uvPreviewAmount = 0.0; // hack to see uvPreview - _showSettings->isUVPreview = true; + //_showSettings->isUVPreview = true; if (_showSettings->is3DView && _showSettings->isUVPreview) { uvPreviewAmount += delta; From c4b2e4d0be84814880ba7e05ef7271f3e58603d9 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 6 May 2022 22:52:49 -0700 Subject: [PATCH 015/615] kramv - enable uvPreview for shapes of Key::Num6 --- kramv/KramRenderer.h | 3 +++ kramv/KramRenderer.mm | 52 +++++++++++++++++++--------------------- kramv/KramViewerBase.cpp | 25 +++++++++++++++++++ kramv/KramViewerBase.h | 6 +++++ kramv/KramViewerMain.mm | 34 +++++++++++++++++++------- 5 files changed, 83 insertions(+), 37 deletions(-) diff --git a/kramv/KramRenderer.h b/kramv/KramRenderer.h index d2411c8e..517cb2dc 100644 --- a/kramv/KramRenderer.h +++ b/kramv/KramRenderer.h @@ -65,6 +65,9 @@ class KTXImage; // unload gltf model - 
(void)unloadModel; +// called from view and renderer in render loop +- (void)updateAnimationState:(MTKView*)view; + // can play animations in gltf models @property (nonatomic) BOOL playAnimations; diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 4a54446a..b84f4f4f 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -722,6 +722,23 @@ - (void)releaseAllPendingTextures } } +- (void)updateAnimationState:(MTKView*)view +{ + bool animateDisplay = self.playAnimations; + + // animate the uvPreviw until it reaches endPoint, no scrubber yet + _showSettings->updateUVPreviewState(); + + if (_showSettings->uvPreviewFrames > 0) { + _showSettings->uvPreviewFrames--; + animateDisplay = true; + } + + view.enableSetNeedsDisplay = !animateDisplay; + view.paused = !animateDisplay; +} + + - (void)updateModelSettings:(const string &)fullFilename { _showSettings->isModel = true; @@ -1545,33 +1562,9 @@ - (void)_updateGameState float4x4 panTransform = matrix4x4_translation(-_showSettings->panX, _showSettings->panY, 0.0); - // interpolate this, also need to draw wireframe // this is an animated effect, that overlays the shape uv wires over the image - // but it needs to set needsDisplay until animation finishes - - // TODO: need to reset these when shape changes - static float delta = 1.0 / 60.0; - static float uvPreviewAmount = 0.0; - - // hack to see uvPreview - //_showSettings->isUVPreview = true; - - if (_showSettings->is3DView && _showSettings->isUVPreview) { - uvPreviewAmount += delta; - - if (uvPreviewAmount > 1.0) { - delta = -1.0 / 60.0; - uvPreviewAmount = 1.0; - } - else if (uvPreviewAmount < 0.0) { - delta = 1.0 / 60.0; - uvPreviewAmount = 0.0; - } - } - else { - uvPreviewAmount = 0.0; - } - uniforms.uvPreview = uvPreviewAmount; + uniforms.isUVPreview = _showSettings->uvPreview > 0.0; + uniforms.uvPreview = _showSettings->uvPreview; // scale float zoom = _showSettings->zoom; @@ -1669,8 +1662,11 @@ - (void)_setUniformsLevel:(UniformsLevel &)uniforms 
mipLOD:(int32_t)mipLOD - (void)drawInMTKView:(nonnull MTKView *)view { @autoreleasepool { - /// Per frame updates here + // Per frame updates here + // update per frame state + [self updateAnimationState:view]; + // TODO: move this out, needs to get called off mouseMove, but don't want to // call drawMain [self drawSample]; @@ -2107,7 +2103,7 @@ - (void)drawMain:(id)commandBuffer } // Draw uv wire overlay - if (_showSettings->is3DView && _showSettings->isUVPreview) { + if (_showSettings->is3DView && _showSettings->uvPreview > 0.0) { // need to force color in shader or it's still sampling texture // also need to add z offset diff --git a/kramv/KramViewerBase.cpp b/kramv/KramViewerBase.cpp index bd8b2935..042f43da 100644 --- a/kramv/KramViewerBase.cpp +++ b/kramv/KramViewerBase.cpp @@ -275,6 +275,31 @@ void ShowSettings::advanceDebugMode(bool decrement) // clear color of farPlane. } +void ShowSettings::updateUVPreviewState() +{ + if (is3DView) { + if (isUVPreview) { + if (uvPreview < 1.0) + uvPreview += uvPreviewStep; + } + else + { + if (uvPreview > 0.0) + uvPreview -= uvPreviewStep; + } + + uvPreview = saturate(uvPreview); + } + else { + uvPreview = 0.0; + } + + // stop the frame update + if (uvPreview == 0.0f || uvPreview == 1.0f) { + uvPreviewFrames = 0; + } +} + void printChannels(string &tmp, const string &label, float4 c, int32_t numChannels, bool isFloat, bool isSigned) { diff --git a/kramv/KramViewerBase.h b/kramv/KramViewerBase.h index 9afe084c..146c01c0 100644 --- a/kramv/KramViewerBase.h +++ b/kramv/KramViewerBase.h @@ -134,6 +134,10 @@ class ShowSettings { // Can collapse 3d to 2d and overlay the uv bool isUVPreview = false; + uint32_t uvPreviewFrames = 0; + float uvPreviewStep = 1.0f / 10.0f; + float uvPreview = 0.0f; + // the 2d view doesn't want to inset pixels for clamp, or point sampling is // thrown off expecially on small 4x4 textures #if USE_PERSPECTIVE @@ -240,6 +244,8 @@ class ShowSettings { const char *meshNumberName(uint32_t meshNumber) 
const; + void updateUVPreviewState(); + float imageAspectRatio() const { float ar = 1.0f; if (meshNumber == 0 && !isModel && imageBoundsY > 0) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 3380edd8..67815c74 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -503,6 +503,7 @@ @implementation MyMTKView { int32_t _fileFolderIndex; Action* _actionPlay; + Action* _actionShapeUVPreview; Action* _actionHelp; Action* _actionInfo; Action* _actionHud; @@ -687,6 +688,7 @@ - (NSStackView *)_addButtons Action("", "", Key::A), // sep Action(" ", "Play", Key::Space), // TODO: really need icon on this + Action("6", "Shape UVPreview", Key::Num6), Action("S", "Shape", Key::S), Action("C", "Shape Channel", Key::C), Action("L", "Lighting", Key::L), @@ -732,6 +734,7 @@ - (NSStackView *)_addButtons &_actionFit, &_actionPlay, + &_actionShapeUVPreview, &_actionShapeMesh, &_actionShapeChannel, &_actionLighting, @@ -2055,11 +2058,14 @@ - (void)showFileTable scrollView.hidden = NO; } -- (void)updateHudVisibility { +- (void)updateHudVisibility +{ _hudLabel.hidden = _hudHidden || !_showSettings->isHudShown; _hudLabel2.hidden = _hudHidden || !_showSettings->isHudShown; } + + - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyDown { // Some data depends on the texture data (isSigned, isNormal, ..) @@ -2070,6 +2076,8 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD // f.e. clamped values don't need to re-render string text; + Renderer* renderer = (Renderer*)self.delegate; + if (action == _actionVertical) { bool isVertical = _buttonStack.orientation == NSUserInterfaceLayoutOrientationVertical; @@ -2162,23 +2170,31 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD } else if (action == _actionPlay) { if (!action->isHidden) { - Renderer* renderer = (Renderer*)self.delegate; - + renderer.playAnimations = !renderer.playAnimations; text = renderer.playAnimations ? 
"Play" : "Pause"; isChanged = true; - self.enableSetNeedsDisplay = !renderer.playAnimations; - self.paused = !renderer.playAnimations; + [renderer updateAnimationState:self]; } else { - self.enableSetNeedsDisplay = YES; - self.paused = YES; + [renderer updateAnimationState:self]; } - } - + else if (action == _actionShapeUVPreview) { + + // toggle state + _showSettings->isUVPreview = !_showSettings->isUVPreview; + text = _showSettings->isUVPreview ? "Show UVPreview" : "Hide UvPreview"; + isChanged = true; + + _showSettings->uvPreviewFrames = 10; + + // also need to call this in display link, for when it reaches end + [renderer updateAnimationState:self]; + } + else if (action == _actionShapeChannel) { _showSettings->advanceShapeChannel(isShiftKeyDown); From 2cd9ce065e9366a6a8f8a8852507f23c9877baf7 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 7 May 2022 00:32:03 -0700 Subject: [PATCH 016/615] kramv - fix directory listing for gltf If files have gltf, then don't list png's they reference --- kramv/KramViewerMain.mm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 67815c74..2b40d039 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -3105,7 +3105,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url while (NSURL* fileOrDirectoryURL = [directoryEnumerator nextObject]) { const char* name = fileOrDirectoryURL.fileSystemRepresentation; - bool isModel = isSupportedModelFilename(filename); + bool isModel = isSupportedModelFilename(name); if (isModel) { files.push_back(name); From b406e77e0a7d7d35434c8ba0668412dffb9ab730 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 7 May 2022 00:47:51 -0700 Subject: [PATCH 017/615] kramv - show model vs, name, show file --- kramv/KramViewerMain.mm | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 2b40d039..541951b7 100644 --- 
a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2277,7 +2277,11 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD _showSettings->zoom = 1.0f; } - text = "Reload Image"; + // Name change if image + if (_showSettings->isModel) + text = "Reload Model\n"; + else + text = "Reload Image\n"; if (doPrintPanZoom) { string tmp; sprintf(tmp, @@ -3353,6 +3357,20 @@ - (BOOL)loadTextureFromURL:(NSURL *)url return success; } +-(double)getTimestampForFile:(NSURL*)url +{ + // TODO: could just use FileHelper::modificationTimestamp(filename); + + NSDate* fileDate = nil; + NSError* error = nil; + [url getResourceValue:&fileDate + forKey:NSURLContentModificationDateKey + error:&error]; + + double timestamp = fileDate.timeIntervalSince1970; + return timestamp; +} + -(BOOL)loadModelFile:(NSURL*)url filename:(const char*)filename { #if USE_GLTF @@ -3379,7 +3397,8 @@ -(BOOL)loadModelFile:(NSURL*)url filename:(const char*)filename NSURL* gltfFileURL = [NSURL fileURLWithPath:[NSString stringWithUTF8String:filename]]; - + double timestamp = [self getTimestampForFile:gltfFileURL]; + BOOL success = [renderer loadModel:gltfFileURL]; // TODO: split this off to a completion handler, since loadModel is async @@ -3437,6 +3456,10 @@ -(BOOL)loadModelFile:(NSURL*)url filename:(const char*)filename setErrorLogCapture(false); + // store the filename + _showSettings->lastFilename = filename; + _showSettings->lastTimestamp = timestamp; + self.needsDisplay = YES; return success; From 75ac6e38e8575ec3ae74aac6d6f817e1897156fb Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 8 May 2022 20:33:09 -0700 Subject: [PATCH 018/615] kram - enhance task system, only call updateAnimationState from one spot --- kramv/KramLoader.mm | 2 +- kramv/KramRenderer.h | 2 +- kramv/KramRenderer.mm | 2 +- kramv/KramViewerMain.mm | 6 +- kramv/Shaders/KramShaders.metal | 5 +- kramv/Shaders/hdr.metal | 4 + libkram/kram/TaskSystem.cpp | 252 ++++++++++++++++++++++++++++++-- 
libkram/kram/TaskSystem.h | 11 ++ 8 files changed, 263 insertions(+), 21 deletions(-) diff --git a/kramv/KramLoader.mm b/kramv/KramLoader.mm index 9cb54974..97a08efe 100644 --- a/kramv/KramLoader.mm +++ b/kramv/KramLoader.mm @@ -74,7 +74,7 @@ - (instancetype)init // for macOS/win Intel need to decode astc/etc // on macOS/arm, the M1 supports all 3 encode formats -#define DO_DECODE TARGET_CPU_X86_64 +#define DO_DECODE KRAM_SSE #if DO_DECODE diff --git a/kramv/KramRenderer.h b/kramv/KramRenderer.h index 517cb2dc..566d0b26 100644 --- a/kramv/KramRenderer.h +++ b/kramv/KramRenderer.h @@ -66,7 +66,7 @@ class KTXImage; - (void)unloadModel; // called from view and renderer in render loop -- (void)updateAnimationState:(MTKView*)view; +- (void)updateAnimationState:(nonnull MTKView*)view; // can play animations in gltf models @property (nonatomic) BOOL playAnimations; diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index b84f4f4f..f392894d 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -722,7 +722,7 @@ - (void)releaseAllPendingTextures } } -- (void)updateAnimationState:(MTKView*)view +- (void)updateAnimationState:(nonnull MTKView*)view { bool animateDisplay = self.playAnimations; diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 541951b7..65b59c42 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2176,10 +2176,10 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD text = renderer.playAnimations ? 
"Play" : "Pause"; isChanged = true; - [renderer updateAnimationState:self]; + //[renderer updateAnimationState:self]; } else { - [renderer updateAnimationState:self]; + //[renderer updateAnimationState:self]; } } else if (action == _actionShapeUVPreview) { @@ -2192,7 +2192,7 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD _showSettings->uvPreviewFrames = 10; // also need to call this in display link, for when it reaches end - [renderer updateAnimationState:self]; + //[renderer updateAnimationState:self]; } else if (action == _actionShapeChannel) { diff --git a/kramv/Shaders/KramShaders.metal b/kramv/Shaders/KramShaders.metal index 2e32b748..e5301a0c 100644 --- a/kramv/Shaders/KramShaders.metal +++ b/kramv/Shaders/KramShaders.metal @@ -575,13 +575,16 @@ void doUVPreview( float uvPreview ) { + // TODO: should honor aspect ratio of original image + // this will convert to square -1,1 + // convert [0,1] to [-1,1] plane float3 uv(toSnorm(texCoord), 0.0); uv.y *= -1; uv.xy *= 0.5; // shrink it position.xyz = mix(position.xyz, uv.xyz, uvPreview); - // interpolate norma and tangent too + // interpolate normal and tangent too normal = mix(normal.xyz, float3(0,0,1), uvPreview); tangent = mix(tangent, float4(1,0,0,1), uvPreview); } diff --git a/kramv/Shaders/hdr.metal b/kramv/Shaders/hdr.metal index bab4a725..4749ca2c 100644 --- a/kramv/Shaders/hdr.metal +++ b/kramv/Shaders/hdr.metal @@ -67,6 +67,8 @@ fragment half4 blur_horizontal7_fragment_main(FragmentIn in [[stage_in]], float weights[]{ 0.134032, 0.126854, 0.107545, 0.08167, 0.055555, 0.033851, 0.018476, 0.009033 }; float offset = 1.0 / sourceTexture.get_width(); half4 color(0); + + // TODO: do this in half the samples with offsets and linearSampler, a 15x15 px blur w/8 weights color += weights[7] * sourceTexture.sample(nearestSampler, in.texCoords - float2(offset * 7, 0)); color += weights[6] * sourceTexture.sample(nearestSampler, in.texCoords - float2(offset * 6, 0)); color += 
weights[5] * sourceTexture.sample(nearestSampler, in.texCoords - float2(offset * 5, 0)); @@ -91,6 +93,8 @@ fragment half4 blur_vertical7_fragment_main(FragmentIn in [[stage_in]], float weights[]{ 0.134032, 0.126854, 0.107545, 0.08167, 0.055555, 0.033851, 0.018476, 0.009033 }; float offset = 1.0 / sourceTexture.get_height(); half4 color(0); + + // TODO: do this in half the samples with offsets and linearSampler color += weights[7] * sourceTexture.sample(nearestSampler, in.texCoords - float2(0, offset * 7)); color += weights[6] * sourceTexture.sample(nearestSampler, in.texCoords - float2(0, offset * 6)); color += weights[5] * sourceTexture.sample(nearestSampler, in.texCoords - float2(0, offset * 5)); diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index da3a8446..4a6e773f 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -1,9 +1,15 @@ #include "TaskSystem.h" -// TODO: bury in system/cpp file -#if KRAM_MAC || KRAM_IOS +#if KRAM_MAC + // affiniity #include #include + + #include + #include +#elif KRAM_IOS + #include + #include #elif KRAM_WIN #include #else @@ -13,24 +19,78 @@ namespace kram { using namespace NAMESPACE_STL; +void task_system::set_qos(std::thread& thread, ThreadQos level) +{ +#if KRAM_MAC || KRAM_IOS + auto handle = thread.native_handle(); + + // https://abhimuralidharan.medium.com/understanding-threads-in-ios-5b8d7ab16f09 + // user-interative, user-initiated, default, utility, background, unspecified + + qos_class_t qos = QOS_CLASS_UNSPECIFIED; + switch(level) { + case ThreadQos::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; + case ThreadQos::High: qos = QOS_CLASS_USER_INITIATED; break; + case ThreadQos::Default: qos = QOS_CLASS_DEFAULT; break; + case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; + case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; + } + + // note here the priority = 0, but is negative offsets + // note this is a start/end overide call, but can set override on existing 
thread + pthread_override_qos_class_start_np(handle, qos, 0); +#endif +} + void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) { // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ - // TODO: set affinity, but need to create a thread that doesn't launch - // so can set this up, and then run it. auto handle = thread.native_handle(); - uint64_t affinityMask = ((uint64_t)1) << threadIndex; // for now only allow single thread mask - -#if KRAM_MAC || KRAM_IOS - thread_affinity_policy_data_t policy = { (int)affinityMask }; + + // for now only allow single core mask + uint64_t affinityMask = ((uint64_t)1) << threadIndex; + + // These are used in most of the paths + macroUnusedVar(handle); + macroUnusedVar(affinityMask); + +#if KRAM_MAC + #if KRAM_SSE + if (!coreInfo.isTranslated) { + thread_affinity_policy_data_t policy = { (int)affinityMask }; - // TODO: check return - thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); + // TODO: consider skipping affinity on macOS altogether + // this is just a hint on x64-based macOS + int returnVal = thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); + + if (returnVal != 0) { + // TODO: unsupported on M1, only have QoS + } + } + #endif + +#elif KRAM_IOS + // no support + +#elif KRAM_ANDROID + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(threadIndex, &cpuset); + + // convert pthread to pid + pid_t pid; + pthread_getunique_np(handle, &pid); + if (!sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset)) { + // TODO: this can fail on some/all cores + } #elif KRAM_WIN // each processor group only has 64 bits - SetThreadAffinityMask(handle, (DWORD_PTR)&affinityMask); + DWORD_PTR mask = SetThreadAffinityMask(handle, *(const DWORD_PTR*)&affinityMask); + if (mask == 0) { + // TODO: failure case + } #else // most systems are pthread-based, this is represented with array of bits cpu_set_t 
cpuset; @@ -38,7 +98,10 @@ void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) CPU_SET(threadIndex, &cpuset); // TODO: check return - /*int rc = */ pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); + int returnVal = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); + if (returnVal != 0) { + // TODO: linux pthread failure case + } #endif } @@ -89,17 +152,178 @@ void task_system::run(int32_t threadIndex) } } +enum class CoreType +{ + Little, + // Medium, + Big, +}; + +struct CoreInfo +{ + // hyperthreading can result in logical = 2x physical cores (1.5x on Alderlake) + uint32_t logicalCoreCount; + uint32_t physicalCoreCount; + + // ARM is has big-little and big-medium-little, no HT, 2/4, 4/4, 6/2, 8/2. + // Intel x64 AlderLake has big-little. 24 threads (8x2HT/8) + uint32_t bigCoreCount; + uint32_t littleCoreCount; + + // x64 under Rosetta2 on M1 Arm chip, no AVX only SSE 4.2 + uint32_t isTranslated; + uint32_t isHyperthreaded; + + // TODO: this needs coreIndex, and then sort big to little + vector typeTable; + vector remapTable; +}; + +static const CoreInfo& GetCoreInfo() +{ + static CoreInfo coreInfo = {}; + if (coreInfo.logicalCoreCount != 0) + return coreInfo; + + // this includes hyperthreads + coreInfo.logicalCoreCount = std::thread::hardware_concurrency(); + coreInfo.physicalCoreCount = coreInfo.logicalCoreCount; + + #if KRAM_IOS || KRAM_MAC + // get big/little core counts + // use sysctl -a from command line to see all + size_t size = sizeof(coreInfo.bigCoreCount); + sysctlbyname("hw.perflevel0.physicalcpu", &coreInfo.bigCoreCount, &size, nullptr, 0); + sysctlbyname("hw.perflevel1.physicalcpu", &coreInfo.littleCoreCount, &size, nullptr, 0); + + // may not work on A10 2/2 exclusive + coreInfo.physicalCoreCount = std::min(coreInfo.bigCoreCount + coreInfo.littleCoreCount, coreInfo.physicalCoreCount); + + // no affinity, so core order here doesn't really matter. 
+ for (uint32_t i = 0; i < coreInfo.bigCoreCount; ++i) { + coreInfo.typeTable.push_back(CoreType::Big); + coreInfo.remapTable.push_back(i); + } + for (uint32_t i = 0; i < coreInfo.littleCoreCount; ++i) { + coreInfo.typeTable.push_back(CoreType::Little); + coreInfo.remapTable.push_back(i + coreInfo.bigCoreCount); + } + + coreInfo.isHyperthreaded = coreInfo.logicalCoreCount != coreInfo.physicalCoreCount; + + #if KRAM_MAC + // Call the sysctl and if successful return the result + sysctlbyname("sysctl.proc_translated", &coreInfo.isTranslated, &size, NULL, 0); + #endif + + #elif KRAM_WIN + + // have to walk array of data, and assemble this info, ugh + // https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformation + + DWORD logicalCoreCount = 0; + DWORD physicalCoreCount = 0; + + DWORD returnLength = 0; + DWORD rc = GetLogicalProcessorInformation(buffer, &returnLength); + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = nullptr; + DWORD byteOffset = 0; + + // walk the array + bool isHyperthreaded = false; + ptr = buffer; + byteOffset = 0; + while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { + switch (ptr->Relationship) { + case RelationProcessorCore: { + uint32_t logicalCores = CountSetBits(ptr->ProcessorMask); + if (logicalCores > 1) { + isHyperthreaded = true; + } + break; + } + } + + if (isHyperthreaded) + break; + + byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + ptr = buffer; + byteOffset = 0; + uint32_t coreNumber = 0; + while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { + switch (ptr->Relationship) { + case RelationProcessorCore: { + physicalCoreCount++; + + // A hyperthreaded core supplies more than one logical processor. + // Can identify AlderLake big vs. 
little off this + uint32_t logicalCores = CountSetBits(ptr->ProcessorMask); + if (logicalCores > 1 || !isHyperthreaded) { + coreInfo.bigCoreCount++; + coreInfo.typeTable.push_back(CoreType::Big); + coreInfo.remapTable.push_back(coreNumber++); + } + else { + coreInfo.littleCoreCount++; + coreInfo.typeTable.push_back(CoreType::Little); + coreInfo.remapTable.push_back(coreNumber++); + } + + logicalCoreCount += logicalCores; + break; + } + } + byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + coreInfo.isHyperthreaded = isHyperthreaded; + coreInfo.physicalCoreCount = physicalCoreCount; + + #elif KRAM_ANDROID + + // TODO: have to walk array of proc/cpuinfo, and assemble this info, ugh + // then build a core remap table since big core are typically last, little early + // https://stackoverflow.com/questions/26239956/how-to-get-specific-information-of-an-android-device-from-proc-cpuinfo-file + + // JDK and NDK version of library with workarounds + // https://github.com/google/cpu_features + + // hack - assume all big cores, typical 1/3/4 or 2/2/4 + coreInfo.bigCoreCount = coreInfo.physicalCoreCount; + + for (int32_t i = coreInfo.bigCoreCount-1; i >= 0; --i) { + coreInfo.typeTable.push_back(CoreType::Big); + coreInfo.remapTable.push_back(i); + } + + #endif + + return coreInfo; +} -// TODO: don't want hyperthreads from hardware_concurrency task_system::task_system(int32_t count) : - _count(std::min(count, (int32_t)std::thread::hardware_concurrency())), + _count(std::min(count, (int32_t)GetCoreInfo().physicalCoreCount)), _q{(size_t)_count}, _index(0) { // start up the threads for (int32_t threadIndex = 0; threadIndex != _count; ++threadIndex) { _threads.emplace_back([&, threadIndex] { run(threadIndex); }); + +#if KRAM_IOS || KRAM_MAC + // No exposed affinity on Apple platforms, just this lame QoS setting + // which acts more like thread-priority. Good luck monitoring + // work on specific threads in profile captures. 
Even swift + // now doesn't allocate more threads than cores to avoid thread explosion. + set_qos(_threads.back(), ThreadQos::High); +#else set_affinity(_threads.back(), threadIndex); +#endif } } diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index b1b35262..12d26fef 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -122,6 +122,16 @@ class notification_queue { type(const type&) = delete; \ void operator=(const type&) = delete +// only for ioS/macOS +enum class ThreadQos +{ + Low = 1, + Medium = 2, + Default = 3, + High = 4, + Interactive = 5, +}; + class task_system { NOT_COPYABLE(task_system); @@ -135,6 +145,7 @@ class task_system { void run(int32_t threadIndex); void set_affinity(std::thread& thread, uint32_t threadIndex); + void set_qos(std::thread& thread, ThreadQos level); public: task_system(int32_t count = 1); From e52859ee9adcbedfd83902db797541ab89927bf1 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 8 May 2022 20:44:13 -0700 Subject: [PATCH 019/615] kramv - don't set uvPreview to 0 on 2d views, it's already disabled there --- kramv/KramViewerBase.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kramv/KramViewerBase.cpp b/kramv/KramViewerBase.cpp index 042f43da..54ae5d75 100644 --- a/kramv/KramViewerBase.cpp +++ b/kramv/KramViewerBase.cpp @@ -278,20 +278,23 @@ void ShowSettings::advanceDebugMode(bool decrement) void ShowSettings::updateUVPreviewState() { if (is3DView) { - if (isUVPreview) { - if (uvPreview < 1.0) - uvPreview += uvPreviewStep; + if (uvPreviewFrames > 0) { + if (isUVPreview) { + if (uvPreview < 1.0) + uvPreview += uvPreviewStep; + } + else + { + if (uvPreview > 0.0) + uvPreview -= uvPreviewStep; + } + + uvPreview = saturate(uvPreview); } - else - { - if (uvPreview > 0.0) - uvPreview -= uvPreviewStep; - } - - uvPreview = saturate(uvPreview); } else { - uvPreview = 0.0; + // This hides the uvView even when switchig back to 3d shape + //uvPreview 
= 0.0; } // stop the frame update From 1447aca7f43a4e2d347e1c7bf8bd48e8fd13c59c Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 8 May 2022 20:53:00 -0700 Subject: [PATCH 020/615] kram - add ODS for Win logs, fix TaskSystem --- libkram/kram/KramLog.cpp | 43 +++++++++++++++++++++++++++++++++++-- libkram/kram/TaskSystem.cpp | 20 +++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index ee439cee..e3a812f3 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -202,7 +202,7 @@ extern int32_t logMessage(const char* group, int32_t logLevel, groupString = group; space = " "; -#if _WIN32 +#if KRAM_WIN const char fileSeparator = '\\'; #else const char fileSeparator = '/'; @@ -238,7 +238,46 @@ extern int32_t logMessage(const char* group, int32_t logLevel, } } - fprintf(fp, "%s%s%s%s%s%s", tag, groupString, space, msg, needsNewline ? "\n" : "", fileLineFunc.c_str()); + // format into a buffer + static string buffer; + sprintf(buffer, "%s%s%s%s%s%s", tag, groupString, space, msg, needsNewline ? "\n" : "", fileLineFunc.c_str()); + +#if KRAM_WIN + // won't this print twice? + //fprintf(fp, "%s", buffer.c_str()); + + if (::IsDebuggerPresent()) { + // TODO: split string up into multiple logs + // this is limited to 32K + OutputDebugString(buffer.c_str()); + } +#elif KRAM_ANDROID + AndroidLogLevel androidLogLevel = ANDROID_LOG_ERROR; + switch (logLevel) { + case LogLevelDebug: + androidLogLevel = ANDROID_LOG_DEBUG; + break; + case LogLevelInfo: + androidLogLevel = ANDROID_LOG_INFO; + break; + + case LogLevelWarning: + androidLogLevel = ANDROID_LOG_WARNING; + break; + case LogLevelError: + androidLogLevel = ANDROID_LOG_ERROR; + break; + } + + // TODO: can also fix printf to work on Android + // but can't set log level like with this call, but no dump buffer limit + + // TODO: split string up into multiple logs + // this can only write 4K - 40? 
chars at time, don't use print it's 1023 + __android_log_write(androidLogLevel, buffer.c_str()); +#else + fprintf(fp, "%s", buffer.c_str()); +#endif return 0; // reserved for later } diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 4a6e773f..e196612f 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -179,6 +179,25 @@ struct CoreInfo vector remapTable; }; +#if KRAM_WIN +// Helper function to count set bits in the processor mask. +DWORD CountSetBits(ULONG_PTR bitMask) +{ + DWORD LSHIFT = sizeof(ULONG_PTR)*8 - 1; + DWORD bitSetCount = 0; + ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT; + DWORD i; + + for (i = 0; i <= LSHIFT; ++i) + { + bitSetCount += ((bitMask & bitTest)?1:0); + bitTest /= 2; + } + + return bitSetCount; +} +#endif + static const CoreInfo& GetCoreInfo() { static CoreInfo coreInfo = {}; @@ -225,6 +244,7 @@ static const CoreInfo& GetCoreInfo() DWORD physicalCoreCount = 0; DWORD returnLength = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; DWORD rc = GetLogicalProcessorInformation(buffer, &returnLength); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = nullptr; DWORD byteOffset = 0; From 94cde87a00447745843f773eb1d672e2bd9648c6 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 8 May 2022 20:58:57 -0700 Subject: [PATCH 021/615] kram - fix log on Win --- libkram/kram/KramLog.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index e3a812f3..3281f841 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -11,6 +11,11 @@ #include +#if KRAM_WIN +#include +#elif KRAM_ANDROID +#include +#endif namespace kram { using mymutex = std::recursive_mutex; From 4768b344c75709cf6a170224e15c1a854b5b58d8 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 8 May 2022 22:42:04 -0700 Subject: [PATCH 022/615] kram - win log --- libkram/kram/KramLog.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index 3281f841..c932cd51 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -248,14 +248,15 @@ extern int32_t logMessage(const char* group, int32_t logLevel, sprintf(buffer, "%s%s%s%s%s%s", tag, groupString, space, msg, needsNewline ? "\n" : "", fileLineFunc.c_str()); #if KRAM_WIN - // won't this print twice? - //fprintf(fp, "%s", buffer.c_str()); - if (::IsDebuggerPresent()) { // TODO: split string up into multiple logs // this is limited to 32K OutputDebugString(buffer.c_str()); } + else { + // avoid double print to debugger + fprintf(fp, "%s", buffer.c_str()); + } #elif KRAM_ANDROID AndroidLogLevel androidLogLevel = ANDROID_LOG_ERROR; switch (logLevel) { From e7d8915f4c9b530d171ff1318265225fdff79109 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 8 May 2022 22:44:55 -0700 Subject: [PATCH 023/615] kram - log fix --- libkram/kram/KramLog.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index c932cd51..8e3fa994 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -280,7 +280,7 @@ extern int32_t logMessage(const char* group, int32_t logLevel, // TODO: split string up into multiple logs // this can only write 4K - 40? 
chars at time, don't use print it's 1023 - __android_log_write(androidLogLevel, buffer.c_str()); + __android_log_write(androidLogLevel, tag, buffer.c_str()); #else fprintf(fp, "%s", buffer.c_str()); #endif From fbd8c5f09adcebc8db94f0992780aa8728db3e43 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Tue, 10 May 2022 09:11:30 -0700 Subject: [PATCH 024/615] tasks - handle coers --- libkram/kram/TaskSystem.cpp | 389 ++++++++++++++++++++++-------------- libkram/kram/TaskSystem.h | 15 ++ 2 files changed, 249 insertions(+), 155 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index e196612f..fe12e50d 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -6,6 +6,7 @@ #include #include + #include #include #elif KRAM_IOS #include @@ -19,146 +20,19 @@ namespace kram { using namespace NAMESPACE_STL; -void task_system::set_qos(std::thread& thread, ThreadQos level) -{ -#if KRAM_MAC || KRAM_IOS - auto handle = thread.native_handle(); - - // https://abhimuralidharan.medium.com/understanding-threads-in-ios-5b8d7ab16f09 - // user-interative, user-initiated, default, utility, background, unspecified - - qos_class_t qos = QOS_CLASS_UNSPECIFIED; - switch(level) { - case ThreadQos::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; - case ThreadQos::High: qos = QOS_CLASS_USER_INITIATED; break; - case ThreadQos::Default: qos = QOS_CLASS_DEFAULT; break; - case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; - case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; - } - - // note here the priority = 0, but is negative offsets - // note this is a start/end overide call, but can set override on existing thread - pthread_override_qos_class_start_np(handle, qos, 0); -#endif -} - -void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) -{ - // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ - - auto handle = thread.native_handle(); - - // for now only allow single core mask - 
uint64_t affinityMask = ((uint64_t)1) << threadIndex; - - // These are used in most of the paths - macroUnusedVar(handle); - macroUnusedVar(affinityMask); - -#if KRAM_MAC - #if KRAM_SSE - if (!coreInfo.isTranslated) { - thread_affinity_policy_data_t policy = { (int)affinityMask }; - - // TODO: consider skipping affinity on macOS altogether - // this is just a hint on x64-based macOS - int returnVal = thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); - - if (returnVal != 0) { - // TODO: unsupported on M1, only have QoS - } - } - #endif - -#elif KRAM_IOS - // no support - -#elif KRAM_ANDROID - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(threadIndex, &cpuset); - - // convert pthread to pid - pid_t pid; - pthread_getunique_np(handle, &pid); - if (!sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset)) { - // TODO: this can fail on some/all cores - } - -#elif KRAM_WIN - // each processor group only has 64 bits - DWORD_PTR mask = SetThreadAffinityMask(handle, *(const DWORD_PTR*)&affinityMask); - if (mask == 0) { - // TODO: failure case - } -#else - // most systems are pthread-based, this is represented with array of bits - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(threadIndex, &cpuset); - - // TODO: check return - int returnVal = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); - if (returnVal != 0) { - // TODO: linux pthread failure case - } -#endif -} - -void task_system::run(int32_t threadIndex) -{ - while (true) { - // pop() wait avoids a spinloop. - - function f; - - // start with ours, but steal from other queues if nothing found - // Note that if threadIndex queue is empty and stays empty - // then pop() below will stop using that thread. But async_ is round-robining - // all work across the available queues. 
- int32_t multiple = 4; // 32; - int32_t numTries = 0; - for (int32_t n = 0, nEnd = _count * multiple; n < nEnd; ++n) { - numTries++; - - // break for loop if work found - if (_q[(threadIndex + n) % _count].try_pop(f)) { - break; - } - } - - // numTries is 64 when queues are empty, and typically 1 when queues are full - //KLOGD("task_system", "thread %d searched %d tries", threadIndex, numTries); - - // if no task, and nothing to steal, pop own queue if possible - // pop blocks until it's queue receives tasks - if (!f && !_q[threadIndex].pop(f)) { - // shutdown if tasks have all been submitted and queue marked as done. - if (_q[threadIndex].is_done()) { - KLOGD("task_system", "thread %d shutting down", threadIndex); - - break; - } - else { - KLOGD("task_system", "no work found for %d in %d tries", threadIndex, numTries); - - // keep searching - continue; - } - } - - // do the work - f(); - } -} - -enum class CoreType +enum class CoreType : uint8_t { Little, // Medium, Big, }; +struct CoreNum +{ + uint8_t index; + CoreType type; +}; + struct CoreInfo { // hyperthreading can result in logical = 2x physical cores (1.5x on Alderlake) @@ -174,14 +48,12 @@ struct CoreInfo uint32_t isTranslated; uint32_t isHyperthreaded; - // TODO: this needs coreIndex, and then sort big to little - vector typeTable; - vector remapTable; + vector remapTable; }; #if KRAM_WIN // Helper function to count set bits in the processor mask. 
-DWORD CountSetBits(ULONG_PTR bitMask) +static DWORD CountSetBits(ULONG_PTR bitMask) { DWORD LSHIFT = sizeof(ULONG_PTR)*8 - 1; DWORD bitSetCount = 0; @@ -212,20 +84,30 @@ static const CoreInfo& GetCoreInfo() // get big/little core counts // use sysctl -a from command line to see all size_t size = sizeof(coreInfo.bigCoreCount); - sysctlbyname("hw.perflevel0.physicalcpu", &coreInfo.bigCoreCount, &size, nullptr, 0); - sysctlbyname("hw.perflevel1.physicalcpu", &coreInfo.littleCoreCount, &size, nullptr, 0); + + uint32_t perfLevelCount = 0; + + // only big-little core counts on macOS12/iOS15 + sysctlbyname("hw.nperflevels", &perfLevelCount, &size, nullptr, 0); + if (perfLevelCount > 0) { + sysctlbyname("hw.perflevel0.physicalcpu", &coreInfo.bigCoreCount, &size, nullptr, 0); + if (perfLevelCount > 1) + sysctlbyname("hw.perflevel1.physicalcpu", &coreInfo.littleCoreCount, &size, nullptr, 0); + } + else { + // can't identify little cores + sysctlbyname("hw.perflevel0.physicalcpu", &coreInfo.bigCoreCount, &size, nullptr, 0); + } // may not work on A10 2/2 exclusive coreInfo.physicalCoreCount = std::min(coreInfo.bigCoreCount + coreInfo.littleCoreCount, coreInfo.physicalCoreCount); // no affinity, so core order here doesn't really matter. 
for (uint32_t i = 0; i < coreInfo.bigCoreCount; ++i) { - coreInfo.typeTable.push_back(CoreType::Big); - coreInfo.remapTable.push_back(i); + coreInfo.remapTable.push_back({(uint8_t)i, CoreType::Big}); } for (uint32_t i = 0; i < coreInfo.littleCoreCount; ++i) { - coreInfo.typeTable.push_back(CoreType::Little); - coreInfo.remapTable.push_back(i + coreInfo.bigCoreCount); + coreInfo.remapTable.push_back({(uint8_t)(i + coreInfo.bigCoreCount), CoreType::Little}); } coreInfo.isHyperthreaded = coreInfo.logicalCoreCount != coreInfo.physicalCoreCount; @@ -284,13 +166,11 @@ static const CoreInfo& GetCoreInfo() uint32_t logicalCores = CountSetBits(ptr->ProcessorMask); if (logicalCores > 1 || !isHyperthreaded) { coreInfo.bigCoreCount++; - coreInfo.typeTable.push_back(CoreType::Big); - coreInfo.remapTable.push_back(coreNumber++); + coreInfo.remapTable.push_back({(uint8_t)coreNumber++, CoreType::Big}); } else { coreInfo.littleCoreCount++; - coreInfo.typeTable.push_back(CoreType::Little); - coreInfo.remapTable.push_back(coreNumber++); + coreInfo.remapTable.push_back({(uint8_t)coreNumber++, CoreType::Little}); } logicalCoreCount += logicalCores; @@ -317,30 +197,229 @@ static const CoreInfo& GetCoreInfo() coreInfo.bigCoreCount = coreInfo.physicalCoreCount; for (int32_t i = coreInfo.bigCoreCount-1; i >= 0; --i) { - coreInfo.typeTable.push_back(CoreType::Big); - coreInfo.remapTable.push_back(i); + coreInfo.remapTable.push_back({(uint8_t)i, CoreType::Big}); } #endif + // sort faster cores first in the remap table + sort(coreInfo.remapTable.begin(), coreInfo.remapTable.end(), [](const CoreNum& lhs, const CoreNum& rhs){ + if (lhs.type == rhs.type) + return lhs.index > rhs.index; + + return lhs.type > rhs.type; + }); + return coreInfo; } + +//------------------ + +#if KRAM_MAC || KRAM_IOS + +void task_system::set_rr_priority(std::thread& thread, uint8_t priority) +{ + auto handle = thread.native_handle(); + + struct sched_param param = { priority }; + pthread_setschedparam(handle, 
SCHED_RR, ¶m); +} + +void task_system::set_main_rr_priority(uint8_t priority) +{ + auto handle = pthread_self(); + + struct sched_param param = { priority }; + pthread_setschedparam(handle, SCHED_RR, ¶m); +} + +void task_system::set_main_qos(ThreadQos level) +{ + set_qos(pthread_self(), level); +} + +void task_system::set_qos(std::thread& thread, ThreadQos level) +{ + auto handle = thread.native_handle(); + set_qos(handle, level); +} + +void task_system::set_qos(std::thread::native_handle_type handle, ThreadQos level) +{ + // https://abhimuralidharan.medium.com/understanding-threads-in-ios-5b8d7ab16f09 + // user-interactive, user-initiated, default, utility, background, unspecified + + qos_class_t qos = QOS_CLASS_UNSPECIFIED; + switch(level) { + case ThreadQos::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; + case ThreadQos::High: qos = QOS_CLASS_USER_INITIATED; break; + case ThreadQos::Default: qos = QOS_CLASS_DEFAULT; break; + case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; + case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; + } + + // qos is transferred to GCD jobs, and can experience thread depriority + // can system can try to adjust priority inversion. 
+ + // note here the priorityOffset = 0, but is negative offsets + // there is a narrow range of offsets + + // note this is a start/end overide call, but can set override on existing thread + pthread_override_qos_class_start_np(handle, qos, 0); +} + +#endif + +void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) +{ + // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ + + auto handle = thread.native_handle(); + + set_affinity(handle, threadIndex); +} + +void task_system::set_main_affinity(uint32_t threadIndex) +{ + set_affinity(pthread_self(), threadIndex); +} + +void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t threadIndex) +{ + const auto& coreInfo = GetCoreInfo(); + + if (threadIndex > coreInfo.remapTable.size()) + threadIndex = coreInfo.remapTable.size() - 1; + + threadIndex = coreInfo.remapTable[threadIndex].index; + + // for now only allow single core mask + uint64_t affinityMask = ((uint64_t)1) << threadIndex; + + // These are used in most of the paths + macroUnusedVar(handle); + macroUnusedVar(affinityMask); + +#if KRAM_MAC + #if KRAM_SSE + if (!coreInfo.isTranslated) { + thread_affinity_policy_data_t policy = { (int)affinityMask }; + + // TODO: consider skipping affinity on macOS altogether + // this is just a hint on x64-based macOS + int returnVal = thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); + + if (returnVal != 0) { + // TODO: unsupported on iOS/M1, only have QoS and priority + // big P cores can also be disabled to resolve thermals + } + } + #endif + +#elif KRAM_IOS + // no support + +#elif KRAM_ANDROID + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(threadIndex, &cpuset); + + // convert pthread to pid + pid_t pid; + pthread_getunique_np(handle, &pid); + if (!sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset)) { + // TODO: this can fail on some/all cores + } + +#elif KRAM_WIN + // each processor group only has 
64 bits + DWORD_PTR mask = SetThreadAffinityMask(handle, *(const DWORD_PTR*)&affinityMask); + if (mask == 0) { + // TODO: failure case + } +#else + // most systems are pthread-based, this is represented with array of bits + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(threadIndex, &cpuset); + + // TODO: check return + int returnVal = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); + if (returnVal != 0) { + // TODO: linux pthread failure case + } +#endif +} + +void task_system::run(int32_t threadIndex) +{ + while (true) { + // pop() wait avoids a spinloop. + + function f; + + // start with ours, but steal from other queues if nothing found + // Note that if threadIndex queue is empty and stays empty + // then pop() below will stop using that thread. But async_ is round-robining + // all work across the available queues. + int32_t multiple = 4; // 32; + int32_t numTries = 0; + for (int32_t n = 0, nEnd = _count * multiple; n < nEnd; ++n) { + numTries++; + + // break for loop if work found + if (_q[(threadIndex + n) % _count].try_pop(f)) { + break; + } + } + + // numTries is 64 when queues are empty, and typically 1 when queues are full + //KLOGD("task_system", "thread %d searched %d tries", threadIndex, numTries); + + // if no task, and nothing to steal, pop own queue if possible + // pop blocks until it's queue receives tasks + if (!f && !_q[threadIndex].pop(f)) { + // shutdown if tasks have all been submitted and queue marked as done. 
+ if (_q[threadIndex].is_done()) { + KLOGD("task_system", "thread %d shutting down", threadIndex); + + break; + } + else { + KLOGD("task_system", "no work found for %d in %d tries", threadIndex, numTries); + + // keep searching + continue; + } + } + + // do the work + f(); + } +} + + + + task_system::task_system(int32_t count) : _count(std::min(count, (int32_t)GetCoreInfo().physicalCoreCount)), _q{(size_t)_count}, _index(0) { +#if KRAM_IOS || KRAM_MAC + set_main_rr_priority(45); +#else + set_main_affinity(0); +#endif + // start up the threads for (int32_t threadIndex = 0; threadIndex != _count; ++threadIndex) { _threads.emplace_back([&, threadIndex] { run(threadIndex); }); #if KRAM_IOS || KRAM_MAC - // No exposed affinity on Apple platforms, just this lame QoS setting - // which acts more like thread-priority. Good luck monitoring - // work on specific threads in profile captures. Even swift - // now doesn't allocate more threads than cores to avoid thread explosion. - set_qos(_threads.back(), ThreadQos::High); + // it's either this or qos + set_rr_priority(_threads.back(), 41); #else set_affinity(_threads.back(), threadIndex); #endif diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index 12d26fef..33beb26c 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -144,9 +144,24 @@ class task_system { void run(int32_t threadIndex); + // affinity isn't really supported on Apple void set_affinity(std::thread& thread, uint32_t threadIndex); + static void set_main_affinity(uint32_t threadIndex); + +#if KRAM_MAC || KRAM_IOS + // these are Apple specific, due to lack of affinity control + // once priority set, can't use qos void set_qos(std::thread& thread, ThreadQos level); + static void set_main_qos(ThreadQos level); + + void set_rr_priority(std::thread& thread, uint8_t priority); + static void set_main_rr_priority(uint8_t priority); +#endif + // impl + static void set_qos(std::thread::native_handle_type handle, ThreadQos level); + 
static void set_affinity(std::thread::native_handle_type handle, uint32_t threadIndex); + public: task_system(int32_t count = 1); ~task_system(); From c38fec01ff6dab3254cebb1b9ea41a2fd1da75a4 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Tue, 10 May 2022 09:13:38 -0700 Subject: [PATCH 025/615] tasks - fix index clamp --- libkram/kram/TaskSystem.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index fe12e50d..7a1dfd54 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -289,8 +289,9 @@ void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t { const auto& coreInfo = GetCoreInfo(); - if (threadIndex > coreInfo.remapTable.size()) - threadIndex = coreInfo.remapTable.size() - 1; + uint32_t maxIndex = coreInfo.remapTable.size() - 1; + if (threadIndex > maxIndex) + threadIndex = maxIndex; threadIndex = coreInfo.remapTable[threadIndex].index; From 5e8c740ca38af23a5a380efc4fdef1e6341f44d9 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Tue, 10 May 2022 23:29:17 -0700 Subject: [PATCH 026/615] Tasks - fix win --- libkram/kram/TaskSystem.cpp | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 7a1dfd54..4b37dc36 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -124,7 +124,8 @@ static const CoreInfo& GetCoreInfo() DWORD logicalCoreCount = 0; DWORD physicalCoreCount = 0; - + bool isHyperthreaded = false; + DWORD returnLength = 0; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; DWORD rc = GetLogicalProcessorInformation(buffer, &returnLength); @@ -132,7 +133,6 @@ static const CoreInfo& GetCoreInfo() DWORD byteOffset = 0; // walk the array - bool isHyperthreaded = false; ptr = buffer; byteOffset = 0; while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) { @@ -204,10 +204,19 @@ 
static const CoreInfo& GetCoreInfo() // sort faster cores first in the remap table sort(coreInfo.remapTable.begin(), coreInfo.remapTable.end(), [](const CoreNum& lhs, const CoreNum& rhs){ +#if KRAM_ANDROID + // sort largest index if (lhs.type == rhs.type) return lhs.index > rhs.index; - return lhs.type > rhs.type; +#else + // sort smallest index + if (lhs.type == rhs.type) + return lhs.index < rhs.index; + return lhs.type > rhs.type; +#endif + + }); return coreInfo; @@ -282,7 +291,11 @@ void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) void task_system::set_main_affinity(uint32_t threadIndex) { +#if KRAM_WIN + set_affinity(::GetCurrentThread(), threadIndex); +#else set_affinity(pthread_self(), threadIndex); +#endif } void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t threadIndex) @@ -303,6 +316,7 @@ void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t macroUnusedVar(affinityMask); #if KRAM_MAC + // don't use this, it's unsupported on ARM chips, and only an affinity hints on x64 #if KRAM_SSE if (!coreInfo.isTranslated) { thread_affinity_policy_data_t policy = { (int)affinityMask }; @@ -408,10 +422,13 @@ task_system::task_system(int32_t count) : _q{(size_t)_count}, _index(0) { + // see WWDC 2021 presentation here + // Tune CPU job scheduling for Apple silicon games + // https://developer.apple.com/videos/play/tech-talks/110147/ #if KRAM_IOS || KRAM_MAC - set_main_rr_priority(45); + set_main_rr_priority(45); #else - set_main_affinity(0); + set_main_affinity(0); #endif // start up the threads From 04ec65152376352616888188db1735ee232d677a Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 15 May 2022 11:06:56 -0700 Subject: [PATCH 027/615] Tasks - add name and logging --- libkram/kram/TaskSystem.cpp | 340 ++++++++++++++++++++++++++++++------ libkram/kram/TaskSystem.h | 34 ++-- 2 files changed, 309 insertions(+), 65 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp 
b/libkram/kram/TaskSystem.cpp index 4b37dc36..44f47b94 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -222,39 +222,123 @@ static const CoreInfo& GetCoreInfo() return coreInfo; } +//---------------------- -//------------------ +// Ugh C++ "portable" thread classes that don't do anything useful +// and make you define all this over and over again in so many apps. +// https://stackoverflow.com/questions/10121560/stdthread-naming-your-thread +// Of course, Windows has to make portability difficult. +// And Mac non-standardly, doesn't even pass thread to call. +// This requires it to be set from thread itself). -#if KRAM_MAC || KRAM_IOS +#if KRAM_WIN -void task_system::set_rr_priority(std::thread& thread, uint8_t priority) +// Isn't this in a header? +#pragma pack(push,8) +typedef struct tagTHREADNAME_INFO { - auto handle = thread.native_handle(); - - struct sched_param param = { priority }; - pthread_setschedparam(handle, SCHED_RR, ¶m); + DWORD dwType; // Must be 0x1000. + LPCSTR szName; // Pointer to name (in user addr space). + DWORD dwThreadID; // Thread ID (-1=caller thread). + DWORD dwFlags; // Reserved for future use, must be zero. +} THREADNAME_INFO; +#pragma pack(pop) + +void setThreadName(std::thread::native_handle_type handle, const char* threadName) +{ + DWORD threadID = ::GetThreadId(handle); + + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = threadName; + info.dwThreadID = threadID; + info.dwFlags = 0; + + __try + { + // Limits to how long this name can be. Also copy into ptr to change name. 
+ RaiseException(0x406D1388, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +} + +void setCurrentThreadName(const char* threadName) +{ + setThreadName(GetCurrentThread(), threadName); } -void task_system::set_main_rr_priority(uint8_t priority) +void setThreadName(std::thread& thread, const char* threadName) +{ + DWORD threadId = ::GetThreadId(thread.native_handle()); + setThreadName(threadId, threadName); +} + +#elif KRAM_MAC || KRAM_IOS + +void setThreadName(std::thread::native_handle_type macroUnusedArg(handle), const char* threadName) +{ + // This can only set on self + int val = pthread_setname_np(threadName); + if (val != 0) + KLOGW("Thread", "Could not set thread name"); +} + +void setCurrentThreadName(const char* threadName) { auto handle = pthread_self(); - - struct sched_param param = { priority }; - pthread_setschedparam(handle, SCHED_RR, ¶m); + setThreadName(handle, threadName); } -void task_system::set_main_qos(ThreadQos level) +// This doesn't exist on macOS. What a pain. Doesn't line up with getter calls. +// Means can't set threadName externally without telling thread to wake and set itself. 
+//void setThreadName(std::thread& thread, const char* threadName) +//{ +// auto handle = thread.native_handle(); +// setThreadName(handle, threadName); +//} + +#else + +void setThreadName(std::thread::native_handle_type handle, const char* threadName) { - set_qos(pthread_self(), level); + // This can only set on self + int val = pthread_setname_np(handle, threadName); + if (val != 0) + KLOGW("Thread", "Could not set thread name"); } -void task_system::set_qos(std::thread& thread, ThreadQos level) +void setCurrentThreadName(const char* threadName) +{ + auto handle = pthread_self(); + setThreadName(handle, threadName); +} + +void setThreadName(std::thread& thread, const char* threadName) { auto handle = thread.native_handle(); - set_qos(handle, level); + setThreadName(handle, threadName); +} + +#endif + +//------------------ + +#if SUPPORT_PRIORITY +#if KRAM_MAC || KRAM_IOS + +static void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) +{ + struct sched_param param = { priority }; + + // this sets policy to round-robin and priority + int val = pthread_setschedparam(handle, SCHED_RR, ¶m); + if (val != 0) + KLOGW("Thread", "Failed to set priority %d", priority); } -void task_system::set_qos(std::thread::native_handle_type handle, ThreadQos level) +static void setThreadQos(std::thread::native_handle_type handle, ThreadQos level) { // https://abhimuralidharan.medium.com/understanding-threads-in-ios-5b8d7ab16f09 // user-interactive, user-initiated, default, utility, background, unspecified @@ -275,30 +359,93 @@ void task_system::set_qos(std::thread::native_handle_type handle, ThreadQos leve // there is a narrow range of offsets // note this is a start/end overide call, but can set override on existing thread - pthread_override_qos_class_start_np(handle, qos, 0); + // TODO: this returns a newly allocated object which isn't released here + // need to release with pthread_override_qos_class_end_np(override); + auto val = 
pthread_override_qos_class_start_np(handle, qos, 0); + if (val != nullptr) + KLOGW("Thread", "Failed to set qos %d", (int)qos); } -#endif +void task_system::set_priority(std::thread& thread, uint8_t priority) +{ + setThreadPriority(thread.native_handle(), priority); +} -void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) +void task_system::set_current_priority(uint8_t priority) +{ + setThreadPriority(pthread_self(), priority); +} + +void task_system::set_current_qos(ThreadQos level) +{ + setThreadQos(pthread_self(), level); +} + +void task_system::set_qos(std::thread& thread, ThreadQos level) { - // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ - auto handle = thread.native_handle(); + setThreadQos(handle, level); +} + + + +#elif KRAM_ANDROID + +void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) +{ + struct sched_param param = { priority }; - set_affinity(handle, threadIndex); + // Android doesn not allow policy change (prob SCHED_OTHER), and only allows setting priority; + // Only from Android 10 (API 28). + int val = pthread_setschedprio(handle, priority); + if (val != 0) + KLOGW("Thread", "Failed to set priority %d", priority); } -void task_system::set_main_affinity(uint32_t threadIndex) + +static uint8_t convertQosToPriority(ThreadQos level) { -#if KRAM_WIN - set_affinity(::GetCurrentThread(), threadIndex); -#else - set_affinity(pthread_self(), threadIndex); -#endif + // TODO: fix these priorities. Linux had 20 to -20 as priorities + // but unclear what Android wants set from the docs. 
+ uint8_t priority = 30; + switch(level) { + case ThreadQos::Interactive: priority = 45; break; + case ThreadQos::High: priority = 41; break; + case ThreadQos::Default: priority = 31; break; + case ThreadQos::Medium: priority = 20; break; + case ThreadQos::Low: priority = 10; break; + } + return priority; +} + +void task_system::set_priority(std::thread& thread, uint8_t priority) +{ + setThreadPriority(thread.native_handle(), priority); +} + +void task_system::set_current_priority(uint8_t priority) +{ + setThreadPriority(pthread_self(), priority); } -void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t threadIndex) + +void task_system::set_main_qos(ThreadQos level) +{ + uint8_t priority = convertQosToPriority(level); + set_current_priority(priority); +} + +void task_system::set_qos(std::thread& thread, ThreadQos level) +{ + uint8_t priority = convertQosToPriority(level); + set_priority(thread, priority); +} +#endif +#endif + +#if SUPPORT_AFFINITY + +static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t threadIndex) { const auto& coreInfo = GetCoreInfo(); @@ -317,20 +464,20 @@ void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t #if KRAM_MAC // don't use this, it's unsupported on ARM chips, and only an affinity hints on x64 - #if KRAM_SSE - if (!coreInfo.isTranslated) { - thread_affinity_policy_data_t policy = { (int)affinityMask }; - - // TODO: consider skipping affinity on macOS altogether - // this is just a hint on x64-based macOS - int returnVal = thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); - - if (returnVal != 0) { - // TODO: unsupported on iOS/M1, only have QoS and priority - // big P cores can also be disabled to resolve thermals - } - } - #endif +// #if KRAM_SSE +// if (!coreInfo.isTranslated) { +// thread_affinity_policy_data_t policy = { (int)affinityMask }; +// +// // TODO: consider skipping affinity on macOS 
altogether +// // this is just a hint on x64-based macOS +// int returnVal = thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); +// +// if (returnVal != 0) { +// // TODO: unsupported on iOS/M1, only have QoS and priority +// // big P cores can also be disabled to resolve thermal throttling +// } +// } +// #endif #elif KRAM_IOS // no support @@ -367,6 +514,25 @@ void task_system::set_affinity(std::thread::native_handle_type handle, uint32_t #endif } +void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) +{ + // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ + auto handle = thread.native_handle(); + setThreadAffinity(handle, threadIndex); +} + +void task_system::set_main_affinity(uint32_t threadIndex) +{ +#if KRAM_WIN + setThreadAffinity(::GetCurrentThread(), threadIndex); +#else + setThreadAffinity(pthread_self(), threadIndex); +#endif +} + + +#endif + void task_system::run(int32_t threadIndex) { while (true) { @@ -425,24 +591,94 @@ task_system::task_system(int32_t count) : // see WWDC 2021 presentation here // Tune CPU job scheduling for Apple silicon games // https://developer.apple.com/videos/play/tech-talks/110147/ -#if KRAM_IOS || KRAM_MAC - set_main_rr_priority(45); -#else - set_main_affinity(0); +#if SUPPORT_PRIORITY + set_current_priority(45); #endif +#if SUPPORT_AFFINITY + set_current_affinity(0); +#endif + + setCurrentThreadName("Main"); + + // Note that running work on core0 when core0 may starve it + // from assigning work to threads. + // start up the threads + string name; for (int32_t threadIndex = 0; threadIndex != _count; ++threadIndex) { - _threads.emplace_back([&, threadIndex] { run(threadIndex); }); -#if KRAM_IOS || KRAM_MAC + // Generate a name, also corresponds to core for affinity + // May want to include priority too. 
+ sprintf(name, "Task%d", threadIndex); + _threadNames.push_back(name); + + _threads.emplace_back([&, threadIndex, name] { + // Have to set name from thread only for Apple. + setCurrentThreadName(name.c_str()); + + run(threadIndex); + }); + +#if SUPPORT_PRIORITY // it's either this or qos - set_rr_priority(_threads.back(), 41); -#else + set_priority(_threads.back(), 41); +#endif + +#if SUPPORT_AFFINITY set_affinity(_threads.back(), threadIndex); #endif } + + // dump out thread data + log_threads(); +} + +struct ThreadInfo { + const char* name; + int policy; + int priority; + int affinity; // single core for now +}; + +static void getThreadInfo(std::thread::native_handle_type handle, int& policy, int& priority) +{ +#if KRAM_MAC || KRAM_IOS || KRAM_ANDROID + struct sched_param priorityVal; + int val = pthread_getschedparam(handle, &policy, &priorityVal); + if (val != 0) + KLOGW("Thread", "failed to retrieve thread data"); + priority = priorityVal.sched_priority; +#endif +} + + +void task_system::log_threads() +{ + ThreadInfo info = {}; + info.name = "Main"; +#if SUPPORT_AFFINITY + info.affinity = 0; +#endif + getThreadInfo(pthread_self(), info.policy, info.priority); + KLOGI("Thread", "Thread:%s (pol:%d pri:%d aff:%d)", + info.name, info.policy, info.priority, info.affinity); + + for (uint32_t i = 0; i < _threads.size(); ++i) + { + info.policy = 0; + info.priority = 0; + info.name = _threadNames[i].c_str(); +#if SUPPORT_AFFINITY + // TODO: if more tasks/threads than cores, then this isn't accurate + // but don't want to write a getter for this right now. 
+ info.affinity = i; +#endif + getThreadInfo(_threads[i].native_handle(), info.policy, info.priority); + KLOGI("Thread", "Thread:%s (pol:%d pri:%d aff:%d)", + info.name, info.policy, info.priority, info.affinity); + } } task_system::~task_system() @@ -451,7 +687,7 @@ task_system::~task_system() for (auto& e : _q) e.set_done(); - // wait until threads are all done, but joining each thread + // wait until threads are all done by joining each thread for (auto& e : _threads) e.join(); } diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index 33beb26c..746d15f6 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -122,6 +122,10 @@ class notification_queue { type(const type&) = delete; \ void operator=(const type&) = delete +#define SUPPORT_AFFINITY (KRAM_ANDROID || KRAM_WIN) +#define SUPPORT_PRIORITY (KRAM_MAC || KRAM_IOS || KRAM_ANDROID) + + // only for ioS/macOS enum class ThreadQos { @@ -137,31 +141,35 @@ class task_system { const int32_t _count; vector _threads; - + + // want to store with thread itself, but no field. Also have affinity, priority data. + vector _threadNames; + // currently one queue to each thread, but can steal from other queues vector _q; std::atomic _index; void run(int32_t threadIndex); +#if SUPPORT_AFFINITY // affinity isn't really supported on Apple void set_affinity(std::thread& thread, uint32_t threadIndex); - static void set_main_affinity(uint32_t threadIndex); - -#if KRAM_MAC || KRAM_IOS + static void set_current_affinity(uint32_t threadIndex); +#endif + +#if SUPPORT_PRIORITY // these are Apple specific, due to lack of affinity control - // once priority set, can't use qos + // once priority set, can't use qos. Also Android can't control + // policy, only the priority in API 28. 
void set_qos(std::thread& thread, ThreadQos level); - static void set_main_qos(ThreadQos level); - - void set_rr_priority(std::thread& thread, uint8_t priority); - static void set_main_rr_priority(uint8_t priority); + static void set_current_qos(ThreadQos level); + + void set_priority(std::thread& thread, uint8_t priority); + static void set_current_priority(uint8_t priority); #endif - // impl - static void set_qos(std::thread::native_handle_type handle, ThreadQos level); - static void set_affinity(std::thread::native_handle_type handle, uint32_t threadIndex); - + void log_threads(); + public: task_system(int32_t count = 1); ~task_system(); From f92b3268346a5e8bc07ac0644326052514eabcb5 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 15 May 2022 11:27:29 -0700 Subject: [PATCH 028/615] kram - fix tasks on Win --- libkram/kram/TaskSystem.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 44f47b94..211439b6 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -266,7 +266,7 @@ void setThreadName(std::thread::native_handle_type handle, const char* threadNam void setCurrentThreadName(const char* threadName) { - setThreadName(GetCurrentThread(), threadName); + setThreadName(::GetCurrentThread(), threadName); } void setThreadName(std::thread& thread, const char* threadName) @@ -429,7 +429,7 @@ void task_system::set_current_priority(uint8_t priority) } -void task_system::set_main_qos(ThreadQos level) +void task_system::set_current_qos(ThreadQos level) { uint8_t priority = convertQosToPriority(level); set_current_priority(priority); @@ -521,7 +521,7 @@ void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) setThreadAffinity(handle, threadIndex); } -void task_system::set_main_affinity(uint32_t threadIndex) +void task_system::set_current_affinity(uint32_t threadIndex) { #if KRAM_WIN setThreadAffinity(::GetCurrentThread(), threadIndex); 
@@ -661,7 +661,11 @@ void task_system::log_threads() info.affinity = 0; #endif +#if KRAM_WIN + getThreadInfo(GetCurrentThread(), info.policy, info.priority); +#else getThreadInfo(pthread_self(), info.policy, info.priority); +#endif KLOGI("Thread", "Thread:%s (pol:%d pri:%d aff:%d)", info.name, info.policy, info.priority, info.affinity); From e9e1b50354674cd643253fe282b7e41c135ad563 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 15 May 2022 11:38:24 -0700 Subject: [PATCH 029/615] kram - fix Win tasks --- libkram/kram/TaskSystem.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 211439b6..855a8018 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -271,8 +271,7 @@ void setCurrentThreadName(const char* threadName) void setThreadName(std::thread& thread, const char* threadName) { - DWORD threadId = ::GetThreadId(thread.native_handle()); - setThreadName(threadId, threadName); + setThreadName(thread.native_handle(), threadName); } #elif KRAM_MAC || KRAM_IOS From f9e2565de12d4050bed7558aeb1524e8edef2451 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 15 May 2022 12:55:53 -0700 Subject: [PATCH 030/615] kram - task priority for Win Commented out until priority remap finished. 
--- libkram/kram/TaskSystem.cpp | 76 ++++++++++++++++++++++++++++++++++--- libkram/kram/TaskSystem.h | 2 +- 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 855a8018..d80449b6 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -166,13 +166,17 @@ static const CoreInfo& GetCoreInfo() uint32_t logicalCores = CountSetBits(ptr->ProcessorMask); if (logicalCores > 1 || !isHyperthreaded) { coreInfo.bigCoreCount++; - coreInfo.remapTable.push_back({(uint8_t)coreNumber++, CoreType::Big}); + coreInfo.remapTable.push_back({(uint8_t)coreNumber, CoreType::Big}); } else { coreInfo.littleCoreCount++; - coreInfo.remapTable.push_back({(uint8_t)coreNumber++, CoreType::Little}); + coreInfo.remapTable.push_back({(uint8_t)coreNumber, CoreType::Little}); } + // Is this the correct index for physical cores? + // Always go through remap table + coreNumber += logicalCores; + logicalCoreCount += logicalCores; break; } @@ -392,13 +396,14 @@ void task_system::set_qos(std::thread& thread, ThreadQos level) void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) { +/* TODO: finish priority remap first struct sched_param param = { priority }; - - // Android doesn not allow policy change (prob SCHED_OTHER), and only allows setting priority; - // Only from Android 10 (API 28). + + // Win has 0 to 15 normal, then 16-31 real time priority int val = pthread_setschedprio(handle, priority); if (val != 0) KLOGW("Thread", "Failed to set priority %d", priority); +*/ } @@ -439,6 +444,56 @@ void task_system::set_qos(std::thread& thread, ThreadQos level) uint8_t priority = convertQosToPriority(level); set_priority(thread, priority); } + +#elif KRAM_WIN + +static uint8_t convertQosToPriority(ThreadQos level) +{ + // TODO: fix these priorities. Linux had 20 to -20 as priorities + // but unclear what Android wants set from the docs. 
+ uint8_t priority = 30; + switch(level) { + case ThreadQos::Interactive: priority = 45; break; + case ThreadQos::High: priority = 41; break; + case ThreadQos::Default: priority = 31; break; + case ThreadQos::Medium: priority = 20; break; + case ThreadQos::Low: priority = 10; break; + } + return priority; +} + +void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) +{ +/* TODO: finish priority remap first + + BOOL success = SetThreadPriority(handle, priority); + if (!success) + LOGW("Thread", "Failed to set priority %d", priority); +*/ +} + +void task_system::set_priority(std::thread& thread, uint8_t priority) +{ + setThreadPriority(thread.native_handle(), priority); +} + +void task_system::set_current_priority(uint8_t priority) +{ + setThreadPriority(pthread_self(), priority); +} + +void task_system::set_current_qos(ThreadQos level) +{ + uint8_t priority = convertQosToPriority(level); + set_current_priority(priority); +} + +void task_system::set_qos(std::thread& thread, ThreadQos level) +{ + uint8_t priority = convertQosToPriority(level); + set_priority(thread, priority); +} + #endif #endif @@ -648,6 +703,17 @@ static void getThreadInfo(std::thread::native_handle_type handle, int& policy, i if (val != 0) KLOGW("Thread", "failed to retrieve thread data"); priority = priorityVal.sched_priority; +#elif KRAM_WIN + // all threads same policy on Win? + // https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Windows%20never%20adjusts%20the%20priority,the%20process%20that%20created%20it. + + // scheduling based on process priority class, thread priority is +/- offset + // DWORD priorityClass = GetPriorityClass(GetCurrentProcess()); + + // The handle must have the THREAD_QUERY_INFORMATION or THREAD_QUERY_LIMITED_INFORMATION access right. 
+ priority = GetThreadPriority(handle); + if (priority == THREAD_PRIORITY_ERROR_RETURN) + priority = 0; #endif } diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index 746d15f6..632d0fc9 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -123,7 +123,7 @@ class notification_queue { void operator=(const type&) = delete #define SUPPORT_AFFINITY (KRAM_ANDROID || KRAM_WIN) -#define SUPPORT_PRIORITY (KRAM_MAC || KRAM_IOS || KRAM_ANDROID) +#define SUPPORT_PRIORITY (KRAM_MAC || KRAM_IOS || KRAM_ANDROID || KRAM_WIN) // only for ioS/macOS From ba528ee88c4ffd6632fa11d0ecc2e11a93138da9 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 15 May 2022 13:05:33 -0700 Subject: [PATCH 031/615] kram - fix Win --- libkram/kram/TaskSystem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index d80449b6..c89c967d 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -479,7 +479,7 @@ void task_system::set_priority(std::thread& thread, uint8_t priority) void task_system::set_current_priority(uint8_t priority) { - setThreadPriority(pthread_self(), priority); + setThreadPriority(::GetCurrentThread(), priority); } void task_system::set_current_qos(ThreadQos level) From 489f94e4b5fe0cf7aab5a5f83839399a4b5929f4 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 15 May 2022 14:58:34 -0700 Subject: [PATCH 032/615] kram - task cleanup --- libkram/kram/TaskSystem.cpp | 97 +++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index c89c967d..a2990c4e 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -228,6 +228,15 @@ static const CoreInfo& GetCoreInfo() //---------------------- +std::thread::native_handle_type getCurrentThread() +{ +#if KRAM_WIN + return ::GetCurrentThread(); +#else + return pthread_self(); +#endif +} + // 
Ugh C++ "portable" thread classes that don't do anything useful // and make you define all this over and over again in so many apps. // https://stackoverflow.com/questions/10121560/stdthread-naming-your-thread @@ -270,7 +279,7 @@ void setThreadName(std::thread::native_handle_type handle, const char* threadNam void setCurrentThreadName(const char* threadName) { - setThreadName(::GetCurrentThread(), threadName); + setThreadName(getCurrentThread(), threadName); } void setThreadName(std::thread& thread, const char* threadName) @@ -290,8 +299,7 @@ void setThreadName(std::thread::native_handle_type macroUnusedArg(handle), const void setCurrentThreadName(const char* threadName) { - auto handle = pthread_self(); - setThreadName(handle, threadName); + setThreadName(getCurrentThread(), threadName); } // This doesn't exist on macOS. What a pain. Doesn't line up with getter calls. @@ -314,14 +322,12 @@ void setThreadName(std::thread::native_handle_type handle, const char* threadNam void setCurrentThreadName(const char* threadName) { - auto handle = pthread_self(); - setThreadName(handle, threadName); + setThreadName(getCurrentThread(), threadName); } void setThreadName(std::thread& thread, const char* threadName) { - auto handle = thread.native_handle(); - setThreadName(handle, threadName); + setThreadName(thread.native_handle(), threadName); } #endif @@ -376,18 +382,17 @@ void task_system::set_priority(std::thread& thread, uint8_t priority) void task_system::set_current_priority(uint8_t priority) { - setThreadPriority(pthread_self(), priority); + setThreadPriority(getCurrentThread(), priority); } void task_system::set_current_qos(ThreadQos level) { - setThreadQos(pthread_self(), level); + setThreadQos(getCurrentThread(), level); } void task_system::set_qos(std::thread& thread, ThreadQos level) { - auto handle = thread.native_handle(); - setThreadQos(handle, level); + setThreadQos(thread.native_handle(), level); } @@ -429,7 +434,7 @@ void task_system::set_priority(std::thread& 
thread, uint8_t priority) void task_system::set_current_priority(uint8_t priority) { - setThreadPriority(pthread_self(), priority); + setThreadPriority(getCurrentThread(), priority); } @@ -479,7 +484,7 @@ void task_system::set_priority(std::thread& thread, uint8_t priority) void task_system::set_current_priority(uint8_t priority) { - setThreadPriority(::GetCurrentThread(), priority); + setThreadPriority(getCurrentThread(), priority); } void task_system::set_current_qos(ThreadQos level) @@ -571,17 +576,12 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) { // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ - auto handle = thread.native_handle(); - setThreadAffinity(handle, threadIndex); + setThreadAffinity(thread.native_handle(), threadIndex); } void task_system::set_current_affinity(uint32_t threadIndex) { -#if KRAM_WIN - setThreadAffinity(::GetCurrentThread(), threadIndex); -#else - setThreadAffinity(pthread_self(), threadIndex); -#endif + setThreadAffinity(getCurrentThread(), threadIndex); } @@ -634,8 +634,25 @@ void task_system::run(int32_t threadIndex) } } +struct ThreadInfo { + const char* name; + int policy; + int priority; + int affinity; // single core for now +}; + +// This only works for current thread, but simplifies setting several thread params. 
+void setThreadInfo(ThreadInfo& info) { + setCurrentThreadName(info.name); + #if SUPPORT_PRIORITY + setThreadPriority(getCurrentThread(), info.priority); + #endif + #if SUPPORT_AFFINITY + setThreadAffinity(getCurrentThread(), info.affinity); + #endif +} task_system::task_system(int32_t count) : _count(std::min(count, (int32_t)GetCoreInfo().physicalCoreCount)), @@ -645,16 +662,9 @@ task_system::task_system(int32_t count) : // see WWDC 2021 presentation here // Tune CPU job scheduling for Apple silicon games // https://developer.apple.com/videos/play/tech-talks/110147/ -#if SUPPORT_PRIORITY - set_current_priority(45); -#endif - -#if SUPPORT_AFFINITY - set_current_affinity(0); -#endif - - setCurrentThreadName("Main"); - + ThreadInfo infoMain = { "Main", 0, 45, 0 }; + setThreadInfo(infoMain); + // Note that running work on core0 when core0 may starve it // from assigning work to threads. @@ -668,32 +678,17 @@ task_system::task_system(int32_t count) : _threadNames.push_back(name); _threads.emplace_back([&, threadIndex, name] { - // Have to set name from thread only for Apple. 
- setCurrentThreadName(name.c_str()); - + ThreadInfo infoTask = { name.c_str(), 0, 41, threadIndex }; + setThreadInfo(infoTask); + run(threadIndex); }); - -#if SUPPORT_PRIORITY - // it's either this or qos - set_priority(_threads.back(), 41); -#endif - -#if SUPPORT_AFFINITY - set_affinity(_threads.back(), threadIndex); -#endif } // dump out thread data log_threads(); } -struct ThreadInfo { - const char* name; - int policy; - int priority; - int affinity; // single core for now -}; static void getThreadInfo(std::thread::native_handle_type handle, int& policy, int& priority) { @@ -726,11 +721,7 @@ void task_system::log_threads() info.affinity = 0; #endif -#if KRAM_WIN - getThreadInfo(GetCurrentThread(), info.policy, info.priority); -#else - getThreadInfo(pthread_self(), info.policy, info.priority); -#endif + getThreadInfo(getCurrentThread(), info.policy, info.priority); KLOGI("Thread", "Thread:%s (pol:%d pri:%d aff:%d)", info.name, info.policy, info.priority, info.affinity); From 4aaac61959348404a524bb6a41006e0581ae081a Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 21 May 2022 10:02:05 -0700 Subject: [PATCH 033/615] kram - switch to C++20, disable clang modules, switch to header includes Required disabling clang modules. These seem to just break all Objective-C++ compilation. May need to use C++20 modules, but those don't work for ObjC/C++ libraries. 
--- build2/kram.xcodeproj/project.pbxproj | 8 ++++---- build2/kramc.xcodeproj/project.pbxproj | 8 ++++---- build2/kramv.xcodeproj/project.pbxproj | 20 ++++++------------- gtlf/GLTF/GLTF.h | 5 +++-- gtlf/GLTF/GLTF.xcodeproj/project.pbxproj | 8 ++++---- .../Headers/Extensions/GLTFExtensionNames.h | 3 ++- gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h | 2 +- gtlf/GLTF/Headers/GLTFAccessor.h | 2 +- gtlf/GLTF/Headers/GLTFAsset.h | 2 +- gtlf/GLTF/Headers/GLTFBinaryChunk.h | 3 ++- gtlf/GLTF/Headers/GLTFBufferAllocator.h | 2 +- gtlf/GLTF/Headers/GLTFCamera.h | 2 +- .../GLTF/Headers/GLTFDefaultBufferAllocator.h | 2 +- gtlf/GLTF/Headers/GLTFEnums.h | 5 ++++- gtlf/GLTF/Headers/GLTFImage.h | 2 +- gtlf/GLTF/Headers/GLTFMaterial.h | 2 +- gtlf/GLTF/Headers/GLTFNode.h | 2 +- gtlf/GLTF/Headers/GLTFNodeVisitor.h | 2 +- gtlf/GLTF/Headers/GLTFObject.h | 4 +++- gtlf/GLTF/Headers/GLTFSkin.h | 2 +- gtlf/GLTF/Headers/GLTFTexture.h | 2 +- gtlf/GLTF/Headers/GLTFUtilities.h | 4 ++-- gtlf/GLTF/Headers/GLTFVertexDescriptor.h | 2 +- gtlf/GLTFMTL/GLTFMTL.h | 4 ++-- .../GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj | 9 +++++---- gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h | 7 +++++-- .../Headers/GLTFMTLLightingEnvironment.h | 7 +++++-- gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h | 4 ++-- gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h | 2 +- gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h | 5 ++++- gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h | 8 ++++++-- kram-thumb/KramThumbnailProvider.mm | 3 ++- kramv/KramRenderer.h | 14 +++++++------ kramv/KramRenderer.mm | 6 +++--- kramv/KramViewerMain.mm | 16 +++++++-------- libkram/kram/TaskSystem.cpp | 4 ++-- 36 files changed, 100 insertions(+), 83 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index 85574ec7..856e7154 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -1619,7 +1619,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; 
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -1676,7 +1676,7 @@ GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = ""; IPHONEOS_DEPLOYMENT_TARGET = 14.1; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; ONLY_ACTIVE_ARCH = YES; @@ -1703,7 +1703,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -1754,7 +1754,7 @@ GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = ""; IPHONEOS_DEPLOYMENT_TARGET = 14.1; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; OTHER_CFLAGS = ( diff --git a/build2/kramc.xcodeproj/project.pbxproj b/build2/kramc.xcodeproj/project.pbxproj index 9def6b8d..0c1c5c27 100644 --- a/build2/kramc.xcodeproj/project.pbxproj +++ b/build2/kramc.xcodeproj/project.pbxproj @@ -152,7 +152,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -197,7 +197,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; ONLY_ACTIVE_ARCH = YES; @@ -211,7 +211,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; 
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -250,7 +250,7 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; SDKROOT = macosx; diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index cda84c46..060736bb 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -506,7 +506,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -553,7 +553,7 @@ GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; MTL_LANGUAGE_REVISION = UseDeploymentTarget; @@ -563,11 +563,7 @@ "-include", KramConfig.h, ); - OTHER_CPLUSPLUSFLAGS = ( - "$(OTHER_CFLAGS)", - "-fcxx-modules", - "-fmodules", - ); + OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; SDKROOT = macosx; }; name = Debug; @@ -578,7 +574,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -619,7 +615,7 @@ GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; HEADER_SEARCH_PATHS = 
"$(PROJECT_DIR)/../libkram/kram"; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; MTL_LANGUAGE_REVISION = UseDeploymentTarget; @@ -628,11 +624,7 @@ "-include", KramConfig.h, ); - OTHER_CPLUSPLUSFLAGS = ( - "$(OTHER_CFLAGS)", - "-fcxx-modules", - "-fmodules", - ); + OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; SDKROOT = macosx; }; name = Release; diff --git a/gtlf/GLTF/GLTF.h b/gtlf/GLTF/GLTF.h index 6d2807d0..9612a95f 100644 --- a/gtlf/GLTF/GLTF.h +++ b/gtlf/GLTF/GLTF.h @@ -17,9 +17,10 @@ #include #if TARGET_OS_OSX -@import Cocoa; +// eliminate Cocoa, can't this use AppKit? +#include #elif TARGET_OS_IOS -@import UIKit; +#include #endif //! Project version number for GLTF. diff --git a/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj b/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj index 06789696..047e484f 100644 --- a/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj +++ b/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj @@ -400,8 +400,8 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 8.0; - MACOSX_DEPLOYMENT_TARGET = 10.12; + IPHONEOS_DEPLOYMENT_TARGET = 14.1; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; SDKROOT = macosx; @@ -454,8 +454,8 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 8.0; - MACOSX_DEPLOYMENT_TARGET = 10.12; + IPHONEOS_DEPLOYMENT_TARGET = 14.1; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = NO; SDKROOT = macosx; VERSIONING_SYSTEM = "apple-generic"; diff --git a/gtlf/GLTF/Headers/Extensions/GLTFExtensionNames.h b/gtlf/GLTF/Headers/Extensions/GLTFExtensionNames.h index 5f77b187..cfb734b0 100644 --- a/gtlf/GLTF/Headers/Extensions/GLTFExtensionNames.h +++ b/gtlf/GLTF/Headers/Extensions/GLTFExtensionNames.h @@ -14,7 +14,8 @@ // OR IN CONNECTION WITH THE USE OR 
PERFORMANCE OF THIS SOFTWARE. // -@import Foundation; +//@import Foundation; +#import extern NSString *const GLTFExtensionKHRMaterialsPBRSpecularGlossiness; extern NSString *const GLTFExtensionKHRLights; diff --git a/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h b/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h index 45859dbe..5044f4e4 100644 --- a/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h +++ b/gtlf/GLTF/Headers/Extensions/GLTFKHRLight.h @@ -16,7 +16,7 @@ #import -@import simd; +//@import simd; typedef NS_ENUM(NSInteger, GLTFKHRLightType) { GLTFKHRLightTypeAmbient, diff --git a/gtlf/GLTF/Headers/GLTFAccessor.h b/gtlf/GLTF/Headers/GLTFAccessor.h index 78f7e1ba..453a1d41 100644 --- a/gtlf/GLTF/Headers/GLTFAccessor.h +++ b/gtlf/GLTF/Headers/GLTFAccessor.h @@ -17,7 +17,7 @@ #import #import -@import simd; +//@import simd; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFAsset.h b/gtlf/GLTF/Headers/GLTFAsset.h index ce6a2eb9..ee203e1a 100644 --- a/gtlf/GLTF/Headers/GLTFAsset.h +++ b/gtlf/GLTF/Headers/GLTFAsset.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -@import Foundation; +//@import Foundation; #import #import diff --git a/gtlf/GLTF/Headers/GLTFBinaryChunk.h b/gtlf/GLTF/Headers/GLTFBinaryChunk.h index 27e821f2..1af848d1 100644 --- a/gtlf/GLTF/Headers/GLTFBinaryChunk.h +++ b/gtlf/GLTF/Headers/GLTFBinaryChunk.h @@ -14,7 +14,8 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -@import Foundation; +//@import Foundation; +#import extern const UInt32 GLTFBinaryMagic; diff --git a/gtlf/GLTF/Headers/GLTFBufferAllocator.h b/gtlf/GLTF/Headers/GLTFBufferAllocator.h index e04119f6..75e4729a 100644 --- a/gtlf/GLTF/Headers/GLTFBufferAllocator.h +++ b/gtlf/GLTF/Headers/GLTFBufferAllocator.h @@ -16,7 +16,7 @@ #import -@import Foundation; +//@import Foundation; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFCamera.h b/gtlf/GLTF/Headers/GLTFCamera.h index 942b845a..348a44ca 100644 --- a/gtlf/GLTF/Headers/GLTFCamera.h +++ b/gtlf/GLTF/Headers/GLTFCamera.h @@ -17,7 +17,7 @@ #import #import -@import simd; +//@import simd; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h b/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h index 18e8a591..20efcc32 100644 --- a/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h +++ b/gtlf/GLTF/Headers/GLTFDefaultBufferAllocator.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -@import Foundation; +//@import Foundation; #import diff --git a/gtlf/GLTF/Headers/GLTFEnums.h b/gtlf/GLTF/Headers/GLTFEnums.h index 1ed2e8d7..25a8d832 100644 --- a/gtlf/GLTF/Headers/GLTFEnums.h +++ b/gtlf/GLTF/Headers/GLTFEnums.h @@ -14,7 +14,10 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -@import Foundation; + +#import +#import +//@import Foundation; typedef NS_ENUM(NSInteger, GLTFDataType) { GLTFBaseTypeUnknown, diff --git a/gtlf/GLTF/Headers/GLTFImage.h b/gtlf/GLTF/Headers/GLTFImage.h index 118e4cd5..266122e0 100644 --- a/gtlf/GLTF/Headers/GLTFImage.h +++ b/gtlf/GLTF/Headers/GLTFImage.h @@ -17,7 +17,7 @@ #import #import -@import Foundation; +//@import Foundation; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFMaterial.h b/gtlf/GLTF/Headers/GLTFMaterial.h index cebe325e..0d80fe10 100644 --- a/gtlf/GLTF/Headers/GLTFMaterial.h +++ b/gtlf/GLTF/Headers/GLTFMaterial.h @@ -17,7 +17,7 @@ #import #import -@import simd; +//@import simd; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFNode.h b/gtlf/GLTF/Headers/GLTFNode.h index 08ddff42..6f80f837 100644 --- a/gtlf/GLTF/Headers/GLTFNode.h +++ b/gtlf/GLTF/Headers/GLTFNode.h @@ -20,7 +20,7 @@ NS_ASSUME_NONNULL_BEGIN -@import simd; +//@import simd; @class GLTFCamera, GLTFSkin, GLTFMesh; @class GLTFKHRLight; diff --git a/gtlf/GLTF/Headers/GLTFNodeVisitor.h b/gtlf/GLTF/Headers/GLTFNodeVisitor.h index 36be13b8..cd6b67fc 100644 --- a/gtlf/GLTF/Headers/GLTFNodeVisitor.h +++ b/gtlf/GLTF/Headers/GLTFNodeVisitor.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -@import Foundation; +//@import Foundation; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFObject.h b/gtlf/GLTF/Headers/GLTFObject.h index 9b7cc8de..9d5c7654 100644 --- a/gtlf/GLTF/Headers/GLTFObject.h +++ b/gtlf/GLTF/Headers/GLTFObject.h @@ -14,7 +14,9 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -@import Foundation; +#import +#import +//@import Foundation; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFSkin.h b/gtlf/GLTF/Headers/GLTFSkin.h index e3c1c60c..869fd76e 100644 --- a/gtlf/GLTF/Headers/GLTFSkin.h +++ b/gtlf/GLTF/Headers/GLTFSkin.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
// -@import Foundation; +//@import Foundation; #import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFTexture.h b/gtlf/GLTF/Headers/GLTFTexture.h index 14ab1a6f..5cf0357c 100644 --- a/gtlf/GLTF/Headers/GLTFTexture.h +++ b/gtlf/GLTF/Headers/GLTFTexture.h @@ -17,7 +17,7 @@ #import #import -@import simd; +//@import simd; typedef struct { simd_float2 offset; diff --git a/gtlf/GLTF/Headers/GLTFUtilities.h b/gtlf/GLTF/Headers/GLTFUtilities.h index ea90a6a8..88d65925 100644 --- a/gtlf/GLTF/Headers/GLTFUtilities.h +++ b/gtlf/GLTF/Headers/GLTFUtilities.h @@ -16,8 +16,8 @@ #import -@import Foundation; -@import simd; +//@import Foundation; +//@import simd; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTF/Headers/GLTFVertexDescriptor.h b/gtlf/GLTF/Headers/GLTFVertexDescriptor.h index 3523d137..8591edde 100644 --- a/gtlf/GLTF/Headers/GLTFVertexDescriptor.h +++ b/gtlf/GLTF/Headers/GLTFVertexDescriptor.h @@ -14,7 +14,7 @@ // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // -@import Foundation; +//@import Foundation; #import NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTFMTL/GLTFMTL.h b/gtlf/GLTFMTL/GLTFMTL.h index 35bc6931..7b32e20d 100644 --- a/gtlf/GLTFMTL/GLTFMTL.h +++ b/gtlf/GLTFMTL/GLTFMTL.h @@ -17,9 +17,9 @@ #import #if TARGET_OS_OSX -@import Cocoa; +#import #elif TARGET_OS_IOS -@import UIKit; +#import #endif //! Project version number for GLTFMTL. 
diff --git a/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj b/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj index ebc83f38..2573207b 100644 --- a/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj +++ b/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj @@ -170,6 +170,7 @@ developmentRegion = English; hasScannedForEncodings = 0; knownRegions = ( + English, en, ); mainGroup = 83D6FFA71F48BCB500F71E0C; @@ -259,8 +260,8 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 10.0; - MACOSX_DEPLOYMENT_TARGET = 10.12; + IPHONEOS_DEPLOYMENT_TARGET = 14.1; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = YES; ONLY_ACTIVE_ARCH = YES; SDKROOT = macosx; @@ -315,8 +316,8 @@ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 10.0; - MACOSX_DEPLOYMENT_TARGET = 10.12; + IPHONEOS_DEPLOYMENT_TARGET = 14.1; + MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = NO; SDKROOT = macosx; SUPPORTED_PLATFORMS = "macosx iphoneos"; diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h b/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h index f62c4025..acd40e02 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLBufferAllocator.h @@ -15,9 +15,12 @@ // #import +#import +#import +#import -@import Foundation; -@import Metal; +//@import Foundation; +//@import Metal; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h b/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h index 6045a5fc..95a99c44 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLLightingEnvironment.h @@ -15,9 +15,12 @@ // #import +#import +#import +#import -@import Foundation; -@import Metal; +//@import Foundation; +//@import Metal; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h 
b/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h index a2a4d04a..0ba84149 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLRenderer.h @@ -17,8 +17,8 @@ #import #import -@import Foundation; -@import Metal; +//@import Foundation; +//@import Metal; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h b/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h index 30733fc5..77ebed2f 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLShaderBuilder.h @@ -17,7 +17,7 @@ #import #import -@import Metal; +//@import Metal; #define GLTFMTLMaximumLightCount 3 diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h b/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h index 2818af75..afc00fdd 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLTextureLoader.h @@ -15,8 +15,11 @@ // #import +#import +#import +#import -@import Metal; +//@import Metal; NS_ASSUME_NONNULL_BEGIN diff --git a/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h b/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h index 3338443e..bc0fa5f8 100644 --- a/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h +++ b/gtlf/GLTFMTL/Headers/GLTFMTLUtilities.h @@ -15,9 +15,13 @@ // #import +#import +#import +#import -@import Metal; -@import simd; + +//@import Metal; +//@import simd; NS_ASSUME_NONNULL_BEGIN diff --git a/kram-thumb/KramThumbnailProvider.mm b/kram-thumb/KramThumbnailProvider.mm index 33b35cf4..d08119fb 100644 --- a/kram-thumb/KramThumbnailProvider.mm +++ b/kram-thumb/KramThumbnailProvider.mm @@ -5,7 +5,8 @@ #import "KramThumbnailProvider.h" #include "KramLib.h" -#include +#import +#import #import // for vImage using namespace kram; diff --git a/kramv/KramRenderer.h b/kramv/KramRenderer.h index 566d0b26..a413b715 100644 --- a/kramv/KramRenderer.h +++ b/kramv/KramRenderer.h @@ -2,10 +2,10 @@ // The license and copyright notice shall be included // in all copies or substantial portions of the Software. 
-@import Foundation; -@import MetalKit; -//#import -//#import +//@import Foundation; +//@import MetalKit; +#import +#import #include "KramLib.h" #import "KramShaders.h" // for TextureChannels @@ -18,8 +18,10 @@ #define USE_PERSPECTIVE 0 #if USE_GLTF -@import GLTF; -@import GLTFMTL; +#import "GLTF/GLTF.h" +#import "GLTFMTL/GLTFMTL.h" +//@import GLTF; +//@import GLTFMTL; #endif diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index f392894d..40cbd404 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -4,9 +4,9 @@ #import "KramRenderer.h" -@import ModelIO; -//#import -//#import +//@import ModelIO; +#import +#import // Include header shared between C code here, which executes Metal API commands, // and .metal files diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 65b59c42..485845e7 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -3,14 +3,14 @@ // in all copies or substantial portions of the Software. // using -fmodules and -fcxx-modules -@import Cocoa; -@import Metal; -@import MetalKit; - -//#import -//#import -//#import -//#import +//@import Cocoa; +//@import Metal; +//@import MetalKit; + +#import +#import +#import +#import #import "KramRenderer.h" #import "KramShaders.h" diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index a2990c4e..159787ec 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -248,13 +248,13 @@ std::thread::native_handle_type getCurrentThread() // Isn't this in a header? #pragma pack(push,8) -typedef struct tagTHREADNAME_INFO +struct THREADNAME_INFO { DWORD dwType; // Must be 0x1000. LPCSTR szName; // Pointer to name (in user addr space). DWORD dwThreadID; // Thread ID (-1=caller thread). DWORD dwFlags; // Reserved for future use, must be zero. 
-} THREADNAME_INFO; +}; #pragma pack(pop) void setThreadName(std::thread::native_handle_type handle, const char* threadName) From ba09bac7df89e635cc291f8125173f2d40f29ee2 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 21 May 2022 10:25:08 -0700 Subject: [PATCH 034/615] kram - cmake, bump win up to C++20, macOS/iOS are 11.0/14.1 minimum now too --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 19a82556..13c6b570 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,7 +44,7 @@ endif() # suppress ZERO_CHECK project set(CMAKE_SUPPRESS_REGENERATION true) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED YES) set(CMAKE_CXX_EXTENSIONS NO) @@ -55,10 +55,10 @@ set(CMAKE_CXX_EXTENSIONS NO) # CMAKE_OSX_DEPLOYMENT_TARGET must be set as a CACHE variable, or it will be stripped if (BUILD_IOS) - set(CMAKE_OSX_DEPLOYMENT_TARGET "14.0" CACHE STRING "Minimum iOS") + set(CMAKE_OSX_DEPLOYMENT_TARGET "14.1" CACHE STRING "Minimum iOS") set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Architecture iOS") elseif (BUILD_MAC) - set(CMAKE_OSX_DEPLOYMENT_TARGET "10.15" CACHE STRING "Minimum macOS") + set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS") set(CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)" CACHE STRING "Architecture macOS") endif() From 4b2b3025b4b7f60e2f70abe5b8cb36b8bf3dece8 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 21 May 2022 20:25:11 -0700 Subject: [PATCH 035/615] kram - fix warnings and cmake min macOS --- CMakeLists.txt | 2 +- kram-preview/KramPreviewViewController.mm | 2 +- kram-thumb/KramThumbnailProvider.mm | 2 +- libkram/kram/KramConfig.h | 12 ++++++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13c6b570..064119e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ if (BUILD_MAC OR BUILD_IOS) endif() # check the SDK - set(XCODE_MIN_SDK_IOS 
14.0) + set(XCODE_MIN_SDK_IOS 14.1) set(XCODE_MIN_SDK_MACOS 11.0) execute_process( diff --git a/kram-preview/KramPreviewViewController.mm b/kram-preview/KramPreviewViewController.mm index b2eaf928..bd0f6b7a 100644 --- a/kram-preview/KramPreviewViewController.mm +++ b/kram-preview/KramPreviewViewController.mm @@ -262,7 +262,7 @@ - (void)preparePreviewOfFileAtURL:(NSURL *)url completionHandler:(void (^)(NSErr .bitsPerPixel = 32, }; - format.bitmapInfo = kCGBitmapByteOrderDefault | (isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); + format.bitmapInfo = kCGBitmapByteOrderDefault | (CGBitmapInfo)(isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); format.colorSpace = isSrgb ? CGColorSpaceCreateWithName(kCGColorSpaceSRGB) : CGColorSpaceCreateDeviceRGB(); // don't need to allocate, can requse memory from mip diff --git a/kram-thumb/KramThumbnailProvider.mm b/kram-thumb/KramThumbnailProvider.mm index d08119fb..e263cde1 100644 --- a/kram-thumb/KramThumbnailProvider.mm +++ b/kram-thumb/KramThumbnailProvider.mm @@ -243,7 +243,7 @@ - (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest *)request complet .bitsPerPixel = 32, }; - format.bitmapInfo = kCGBitmapByteOrderDefault | (isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); + format.bitmapInfo = kCGBitmapByteOrderDefault | (CGBitmapInfo)(isPremul ? kCGImageAlphaPremultipliedLast : kCGImageAlphaLast); format.colorSpace = isSrgb ? 
CGColorSpaceCreateWithName(kCGColorSpaceSRGB) : CGColorSpaceCreateDeviceRGB(); // don't need to allocate, can reuse memory from mip diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index 2a130a0a..fa3ba302 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -191,6 +191,17 @@ #else +/* +// seems that Modules have "partial" support in Xcode, whatever that means +// these imports are taken from MSVC which has a full implementation + +import std.memory; +import std.threading; +import std.core; +import std.filesystem; +import std.regex; +*/ + #include // for max #include @@ -207,6 +218,7 @@ #include #include + #define NAMESPACE_STL std #endif From 335e7ebef7cd28c379ee39664fca39d1e97ca720 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Wed, 25 May 2022 23:02:18 -0700 Subject: [PATCH 036/615] kram - small fixes --- kramv/KramViewerMain.mm | 6 ++++++ libkram/kram/Kram.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 485845e7..c5663eb1 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -1717,6 +1717,12 @@ - (void)scrollWheel:(NSEvent *)event return; } + // From ImGui notes: + // From macOS 12.1, scrolling with two fingers and then decelerating + // by tapping two fingers results in two events appearing. 
+ if (event.phase == NSEventPhaseCancelled) + return; + double wheelX = [event scrollingDeltaX]; double wheelY = [event scrollingDeltaY]; diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index e4b19ab0..69351494 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -2787,7 +2787,7 @@ static int32_t kramAppEncode(vector& args) break; } - // TODO: if args ends with /, then output to that dir + // TODO: if args Ωƒends with /, then output to that dir dstFilename = args[i]; } else if (isStringEqual(word, "-input") || From 93792d82cf9462c61b7f3c28846a4936d6c19e82 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 28 May 2022 22:04:28 -0700 Subject: [PATCH 037/615] kram - fix toHalf4 --- libkram/kram/KramConfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index fa3ba302..9d186d8a 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -438,7 +438,7 @@ inline float4 toFloat4(const half4& vv) } inline half4 toHalf4(const float4& vv) { - return half(vcvt_f16_f32(*(const float32x4_t*)&vv)); + return half4(vcvt_f16_f32(*(const float32x4_t*)&vv)); } #endif From 74cb9119aaaac93166f34f6514535cf799435743 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Mon, 13 Jun 2022 23:57:53 -0700 Subject: [PATCH 038/615] kram - cleanup for Xcode 14 Turn on dead-code stripping. Turn off exceptions/RTTI. Turn off warnings for conversion of 64 to 32-bit. Fix sprintf use to safer alternative. 
--- build2/kram.xcodeproj/project.pbxproj | 12 ++++++--- build2/kramc.xcodeproj/project.pbxproj | 12 +++++++-- build2/kramv.xcodeproj/project.pbxproj | 8 ++++++ gtlf/GLTF/GLTF.xcodeproj/project.pbxproj | 6 +++++ .../GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj | 6 +++++ libkram/ate/ateencoder.mm | 2 +- libkram/kram/KramFileHelper.cpp | 2 +- libkram/kram/KramImage.cpp | 2 +- libkram/kram/KramLog.cpp | 8 +++--- libkram/kram/TaskSystem.cpp | 25 ++++++++++++++++++- libkram/kram/TaskSystem.h | 3 +++ libkram/tmpfileplus/tmpfileplus.cpp | 12 +++------ libkram/transcoder/basisu_containers_impl.h | 13 +++------- 13 files changed, 79 insertions(+), 32 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index 856e7154..e57e4e91 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -866,18 +866,18 @@ 706EEE1926D1583F001C950E /* KramZipHelper.h */, 706EEE1E26D1583F001C950E /* KramZipHelper.cpp */, 706EEE2326D1583F001C950E /* KramConfig.h */, + 706EEE3126D1583F001C950E /* KramImageInfo.h */, 706EEE2526D1583F001C950E /* KramImageInfo.cpp */, - 706EEE2626D1583F001C950E /* KramImage.cpp */, 706EEE2726D1583F001C950E /* KramLib.h */, 706EEE2426D1583F001C950E /* KramLog.h */, 706EEE2826D1583F001C950E /* KramLog.cpp */, 706EEE2926D1583F001C950E /* KramVersion.h */, 706EEE2A26D1583F001C950E /* KramImage.h */, + 706EEE2626D1583F001C950E /* KramImage.cpp */, 706EEE2026D1583F001C950E /* KramSDFMipper.h */, 706EEE2B26D1583F001C950E /* KramSDFMipper.cpp */, 706EEE3026D1583F001C950E /* KTXImage.h */, 706EEE1B26D1583F001C950E /* KTXImage.cpp */, - 706EEE3126D1583F001C950E /* KramImageInfo.h */, 706EEE3226D1583F001C950E /* KramTimer.h */, 706EEE1A26D1583F001C950E /* KramTimer.cpp */, 706EEE3326D1583F001C950E /* KramMmapHelper.h */, @@ -1650,6 +1650,7 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; 
DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; @@ -1663,6 +1664,7 @@ "DEBUG=1", "$(inherited)", ); + "GCC_WARN_64_TO_32_BIT_CONVERSION[arch=*64]" = NO; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES; @@ -1692,6 +1694,7 @@ "-include", KramConfig.h, ); + PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; USER_HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; }; @@ -1734,6 +1737,7 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; @@ -1741,6 +1745,7 @@ GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; + "GCC_WARN_64_TO_32_BIT_CONVERSION[arch=*64]" = NO; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_HIDDEN_VIRTUAL_FUNCTIONS = YES; @@ -1769,6 +1774,7 @@ "-include", KramConfig.h, ); + PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; USER_HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; }; @@ -1797,7 +1803,6 @@ CLANG_WARN_OBJC_REPEATED_USE_OF_WEAK = YES; CLANG_X86_VECTOR_INSTRUCTIONS = avx; CODE_SIGN_STYLE = Automatic; - DEAD_CODE_STRIPPING = NO; EXECUTABLE_PREFIX = lib; GCC_PRECOMPILE_PREFIX_HEADER = NO; GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramConfig.h"; @@ -1833,7 +1838,6 @@ CLANG_WARN_DOCUMENTATION_COMMENTS = NO; CLANG_X86_VECTOR_INSTRUCTIONS = default; CODE_SIGN_STYLE = Automatic; - DEAD_CODE_STRIPPING = NO; EXECUTABLE_PREFIX = lib; GCC_PRECOMPILE_PREFIX_HEADER = NO; GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramConfig.h"; diff --git a/build2/kramc.xcodeproj/project.pbxproj b/build2/kramc.xcodeproj/project.pbxproj index 0c1c5c27..e73338b2 100644 --- a/build2/kramc.xcodeproj/project.pbxproj +++ b/build2/kramc.xcodeproj/project.pbxproj @@ 
-180,18 +180,21 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; GCC_C_LANGUAGE_STANDARD = gnu11; GCC_DYNAMIC_NO_PIC = NO; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( "DEBUG=1", "$(inherited)", ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_UNDECLARED_SELECTOR = YES; GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; @@ -201,6 +204,7 @@ MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; ONLY_ACTIVE_ARCH = YES; + PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; }; name = Debug; @@ -239,12 +243,15 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_UNDECLARED_SELECTOR = YES; GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; @@ -253,6 +260,7 @@ MACOSX_DEPLOYMENT_TARGET = 11.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; + PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; }; name = Release; diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index 060736bb..cad5656d 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -534,12 +534,15 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = 
YES; DEBUG_INFORMATION_FORMAT = dwarf; DONT_GENERATE_INFOPLIST_FILE = YES; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; GCC_C_LANGUAGE_STANDARD = gnu11; GCC_DYNAMIC_NO_PIC = NO; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( @@ -564,6 +567,7 @@ KramConfig.h, ); OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; + PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; }; name = Debug; @@ -602,11 +606,14 @@ CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; COPY_PHASE_STRIP = NO; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; DONT_GENERATE_INFOPLIST_FILE = YES; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; @@ -625,6 +632,7 @@ KramConfig.h, ); OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; + PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; }; name = Release; diff --git a/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj b/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj index 047e484f..c93236fd 100644 --- a/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj +++ b/gtlf/GLTF/GLTF.xcodeproj/project.pbxproj @@ -383,11 +383,14 @@ CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_DYNAMIC_NO_PIC = NO; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( @@ -443,10 +446,13 @@ CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 
ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; diff --git a/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj b/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj index 2573207b..9d51e748 100644 --- a/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj +++ b/gtlf/GLTFMTL/GLTFMTL.xcodeproj/project.pbxproj @@ -243,11 +243,14 @@ CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; ENABLE_TESTABILITY = YES; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_DYNAMIC_NO_PIC = NO; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( @@ -305,10 +308,13 @@ CODE_SIGN_IDENTITY = "-"; COPY_PHASE_STRIP = NO; CURRENT_PROJECT_VERSION = 1; + DEAD_CODE_STRIPPING = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_ENABLE_CPP_EXCEPTIONS = NO; + GCC_ENABLE_CPP_RTTI = NO; GCC_NO_COMMON_BLOCKS = YES; GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; diff --git a/libkram/ate/ateencoder.mm b/libkram/ate/ateencoder.mm index 1836ebda..f3a43b9b 100644 --- a/libkram/ate/ateencoder.mm +++ b/libkram/ate/ateencoder.mm @@ -438,7 +438,7 @@ inline my_at_block_format_t pixelToDecoderFormat(MyMTLPixelFormat format, bool i // decode is leaving a=60 for some bizarro reason, so correct that if (srcAlphaType == at_alpha_opaque) { - for (uint32_t i = 0, iEnd = w*h; i < iEnd; ++i) + for (uint32_t i = 0, iEnd = (uint32_t)w*h; i < iEnd; ++i) { dstData[4*i+3] = 255; } diff --git a/libkram/kram/KramFileHelper.cpp b/libkram/kram/KramFileHelper.cpp index a6c2b873..05996cb8 100644 
--- a/libkram/kram/KramFileHelper.cpp +++ b/libkram/kram/KramFileHelper.cpp @@ -176,7 +176,7 @@ bool FileHelper::copyTemporaryFileTo(const char* dstFilename) size_t bytesRemaining = size_; while (bytesRemaining > 0) { - int bytesToRead = min(bufferSize, bytesRemaining); + size_t bytesToRead = min(bufferSize, bytesRemaining); bytesRemaining -= bytesToRead; if (!read(tmpBuf.data(), bytesToRead) || diff --git a/libkram/kram/KramImage.cpp b/libkram/kram/KramImage.cpp index d6694ac3..80568fdf 100644 --- a/libkram/kram/KramImage.cpp +++ b/libkram/kram/KramImage.cpp @@ -1089,7 +1089,7 @@ bool KramDecoder::decodeImpl(const KTXImage& srcImage, FILE* dstFile, KTXImage& } } - int32_t dstMipOffset = dstMipLevel.offset + chunk * dstMipLevel.length; + uint32_t dstMipOffset = dstMipLevel.offset + chunk * dstMipLevel.length; if (!writeDataAtOffset(outputTexture.data(), dstMipLevel.length, dstMipOffset, dstFile, dstImage)) { return false; diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index 8e3fa994..5a7726b9 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -53,8 +53,8 @@ void getErrorLogCaptureText(string& text) // //} -// Note: careful with stdio sscanf. In clang, this does and initial strlen which for long buffers -// being parsed (f.e. mmapped Json) this can significantly slow a parser down. +// Note: careful with stdio sscanf. In clang, this does an initial strlen which for long buffers +// being parsed (f.e. mmapped Json) can significantly slow a parser down. int32_t append_vsprintf(string& str, const char* format, va_list args) { @@ -62,7 +62,7 @@ int32_t append_vsprintf(string& str, const char* format, va_list args) if (strcmp(format, "%s") == 0) { const char* firstArg = va_arg(args, const char*); str += firstArg; - return strlen(firstArg); + return (int32_t)strlen(firstArg); } // This is important for the case where ##VAR_ARGS only leaves the format. 
@@ -70,7 +70,7 @@ int32_t append_vsprintf(string& str, const char* format, va_list args) // for KLOGE("group", "text") if (strrchr(format, '%') == nullptr) { str += format; - return strlen(format); + return (int32_t)strlen(format); } // format once to get length (without NULL at end) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 159787ec..6a32e791 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -13,6 +13,7 @@ #include #elif KRAM_WIN #include + #include #else #include #endif @@ -246,6 +247,8 @@ std::thread::native_handle_type getCurrentThread() #if KRAM_WIN +/* This is the old way. This name is only available if debugger attached. + // Isn't this in a header? #pragma pack(push,8) struct THREADNAME_INFO @@ -276,6 +279,27 @@ void setThreadName(std::thread::native_handle_type handle, const char* threadNam { } } +*/ + +// TODO: on Win, also need to set the following. Then use Windows Termnial. +// SetConsoleOutputCP(CP_UTF8); + +void setThreadName(std::thread::native_handle_type handle, const char* threadName) +{ + // TODO: use std::wstring_convert(); + // std::codecvt_utf8_utf16 + + // ugh, win still using char16_t. 
TODO: this isn't utf8 to utf16 conversion + uint32_t len = strlen(threadName); + std::wstring str; + str.reserve(len); + for (uint32_t i = 0; i < len; ++i) { + if (threadname[i] <= 127) + str.push_back((char)threadName[i]); + } + + ::SetThreadDescription(handle, str.c_str()); +} void setCurrentThreadName(const char* threadName) { @@ -314,7 +338,6 @@ void setCurrentThreadName(const char* threadName) void setThreadName(std::thread::native_handle_type handle, const char* threadName) { - // This can only set on self int val = pthread_setname_np(handle, threadName); if (val != 0) KLOGW("Thread", "Could not set thread name"); diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index 632d0fc9..fdc26515 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -122,6 +122,9 @@ class notification_queue { type(const type&) = delete; \ void operator=(const type&) = delete +// Note: if running multiple processes on the same cpu, then affinity +// isn't ideal. It will force work onto the same cores. Especially if +// limiting cores to say 4/16, then can run 4 processes faster w/o affinity. 
#define SUPPORT_AFFINITY (KRAM_ANDROID || KRAM_WIN) #define SUPPORT_PRIORITY (KRAM_MAC || KRAM_IOS || KRAM_ANDROID || KRAM_WIN) diff --git a/libkram/tmpfileplus/tmpfileplus.cpp b/libkram/tmpfileplus/tmpfileplus.cpp index da555877..ece6707a 100644 --- a/libkram/tmpfileplus/tmpfileplus.cpp +++ b/libkram/tmpfileplus/tmpfileplus.cpp @@ -109,9 +109,12 @@ */ #define OPEN_ _open #define FDOPEN_ _fdopen +#define FILE_SEPARATOR "\\" +#define snprintf sprintf_s #else #define OPEN_ open #define FDOPEN_ fdopen +#define FILE_SEPARATOR "/" #endif @@ -122,13 +125,6 @@ #define DPRINTF1(s, a1) #endif - -#ifdef _WIN32 -#define FILE_SEPARATOR "\\" -#else -#define FILE_SEPARATOR "/" -#endif - #define RANDCHARS "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" #define NRANDCHARS (sizeof(RANDCHARS) - 1) @@ -243,7 +239,7 @@ static FILE *mktempfile_internal(const char *tmpdir, const char *pfx, const char /* If we don't manage to create a file after 10 goes, there is something wrong... */ for (i = 0; i < 10; i++) { - sprintf(tmpname, "%s%s%s%s%s", tmpdir, separator, pfx, set_randpart(randpart), sfx); + snprintf(tmpname, lentempname + 1, "%s%s%s%s%s", tmpdir, separator, pfx, set_randpart(randpart), sfx); DPRINTF1("[%s]\n", tmpname); fd = OPEN_(tmpname, oflag, pmode); if (fd != -1) break; diff --git a/libkram/transcoder/basisu_containers_impl.h b/libkram/transcoder/basisu_containers_impl.h index 65551714..b233997a 100644 --- a/libkram/transcoder/basisu_containers_impl.h +++ b/libkram/transcoder/basisu_containers_impl.h @@ -3,6 +3,7 @@ #ifdef _MSC_VER #pragma warning (disable:4127) // warning C4127: conditional expression is constant +#define snprintf sprintf_s #endif namespace basisu @@ -46,11 +47,7 @@ namespace basisu return false; char buf[256]; -#ifdef _MSC_VER - sprintf_s(buf, sizeof(buf), "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); -#else - sprintf(buf, "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); -#endif + 
snprintf(buf, sizeof(buf), "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); fprintf(stderr, "%s", buf); abort(); } @@ -73,11 +70,7 @@ namespace basisu return false; char buf[256]; -#ifdef _MSC_VER - sprintf_s(buf, sizeof(buf), "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); -#else - sprintf(buf, "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); -#endif + snprintf(buf, sizeof(buf), "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); fprintf(stderr, "%s", buf); abort(); } From e296c8d693d17a2684dd01154ade44951f5cfc52 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Tue, 14 Jun 2022 00:11:06 -0700 Subject: [PATCH 039/615] kram - fix typo --- libkram/kram/TaskSystem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 6a32e791..512104d0 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -294,7 +294,7 @@ void setThreadName(std::thread::native_handle_type handle, const char* threadNam std::wstring str; str.reserve(len); for (uint32_t i = 0; i < len; ++i) { - if (threadname[i] <= 127) + if (threadName[i] <= 127) str.push_back((char)threadName[i]); } From 36020265aeead5a6fdf90cd765486a7956f3231a Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 24 Jun 2022 23:15:58 -0700 Subject: [PATCH 040/615] kramv - hide file table when info shown Info can be a lot of lines, and the overlap of this and the file table wasn't ideal. 
--- kramv/KramViewerMain.mm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index c5663eb1..2707dce3 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2399,6 +2399,10 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD // that info else if (action == _actionInfo) { if (_showSettings->isHudShown) { + + // also hide the file table, since this can be long + [self hideFileTable]; + sprintf(text, "%s", isShiftKeyDown ? _showSettings->imageInfoVerbose.c_str() : _showSettings->imageInfo.c_str()); From 2ee942e6c6340d87c4fd44f5b12be63d4cd69bf0 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 24 Jun 2022 23:17:29 -0700 Subject: [PATCH 041/615] kram - simplify policyt/priority handling. Some platforms like Android make setting policy/priority difficult. So make each platform handle the nuances internally off a ThreadPriority enum. --- libkram/kram/TaskSystem.cpp | 291 +++++++++++++++--------------------- libkram/kram/TaskSystem.h | 20 +-- 2 files changed, 124 insertions(+), 187 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 512104d0..b54caa26 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -1,7 +1,7 @@ #include "TaskSystem.h" #if KRAM_MAC - // affiniity + // affinity #include #include @@ -14,10 +14,14 @@ #elif KRAM_WIN #include #include +#elif KRAM_ANDROID + #include #else #include #endif +// TODO: look at replacing this with Job Queue from Filament + namespace kram { using namespace NAMESPACE_STL; @@ -243,7 +247,7 @@ std::thread::native_handle_type getCurrentThread() // https://stackoverflow.com/questions/10121560/stdthread-naming-your-thread // Of course, Windows has to make portability difficult. // And Mac non-standardly, doesn't even pass thread to call. -// This requires it to be set from thread itself). +// This requires it to be set from thread itself. 
#if KRAM_WIN @@ -336,6 +340,7 @@ void setCurrentThreadName(const char* threadName) #else +// 15 char name limit on Linux/Android, how modern! void setThreadName(std::thread::native_handle_type handle, const char* threadName) { int val = pthread_setname_np(handle, threadName); @@ -357,174 +362,101 @@ void setThreadName(std::thread& thread, const char* threadName) //------------------ -#if SUPPORT_PRIORITY #if KRAM_MAC || KRAM_IOS -static void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) -{ - struct sched_param param = { priority }; - - // this sets policy to round-robin and priority - int val = pthread_setschedparam(handle, SCHED_RR, &param); - if (val != 0) - KLOGW("Thread", "Failed to set priority %d", priority); -} - -static void setThreadQos(std::thread::native_handle_type handle, ThreadQos level) +static void setThreadPriority(std::thread::native_handle_type handle, ThreadPriority priority) { - // https://abhimuralidharan.medium.com/understanding-threads-in-ios-5b8d7ab16f09 - // user-interactive, user-initiated, default, utility, background, unspecified - - qos_class_t qos = QOS_CLASS_UNSPECIFIED; - switch(level) { - case ThreadQos::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; - case ThreadQos::High: qos = QOS_CLASS_USER_INITIATED; break; - case ThreadQos::Default: qos = QOS_CLASS_DEFAULT; break; - case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; - case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; + if (priority == ThreadPriority::Default) { + + /* samples of qos + qos_class_t qos = QOS_CLASS_UNSPECIFIED; + switch(level) { + case ThreadQos::Interactive: qos = QOS_CLASS_USER_INTERACTIVE; break; + case ThreadQos::High: qos = QOS_CLASS_USER_INITIATED; break; + case ThreadQos::Default: qos = QOS_CLASS_DEFAULT; break; + case ThreadQos::Medium: qos = QOS_CLASS_UTILITY; break; + case ThreadQos::Low: qos = QOS_CLASS_BACKGROUND; break; + } + */ + + // qos is transferred to GCD jobs, and can experience thread depriority +
// can system can try to adjust priority inversion. + + // note here the priorityOffset = 0, but is negative offsets + // there is a narrow range of offsets + + // note this is a start/end overide call, but can set override on existing thread + // TODO: this returns a newly allocated object which isn't released here + // need to release with pthread_override_qos_class_end_np(override); + + qos_class_t qos = QOS_CLASS_DEFAULT; + auto val = pthread_override_qos_class_start_np(handle, qos, 0); + if (val != nullptr) + KLOGW("Thread", "Failed to set qos %d", (int)qos); + } + else { + int prioritySys = 0; + switch(priority) { + case ThreadPriority::Default: prioritySys = 30; break; // skipped above + case ThreadPriority::High: prioritySys = 41; break; + case ThreadPriority::Interactive: prioritySys = 45; break; + } + + struct sched_param param = { prioritySys }; + + // policy choices + // SCHED_RR, SCHED_FIFO, SCHED_OTHER + int policy = SCHED_RR; + + // this sets policy to round-robin and priority + int val = pthread_setschedparam(handle, policy, &param); + if (val != 0) + KLOGW("Thread", "Failed to set policy %d priority %d", policy, prioritySys); } - - // qos is transferred to GCD jobs, and can experience thread depriority - // can system can try to adjust priority inversion.
- - // note here the priorityOffset = 0, but is negative offsets - // there is a narrow range of offsets - - // note this is a start/end overide call, but can set override on existing thread - // TODO: this returns a newly allocated object which isn't released here - // need to release with pthread_override_qos_class_end_np(override); - auto val = pthread_override_qos_class_start_np(handle, qos, 0); - if (val != nullptr) - KLOGW("Thread", "Failed to set qos %d", (int)qos); -} - -void task_system::set_priority(std::thread& thread, uint8_t priority) -{ - setThreadPriority(thread.native_handle(), priority); -} - -void task_system::set_current_priority(uint8_t priority) -{ - setThreadPriority(getCurrentThread(), priority); -} - -void task_system::set_current_qos(ThreadQos level) -{ - setThreadQos(getCurrentThread(), level); -} - -void task_system::set_qos(std::thread& thread, ThreadQos level) -{ - setThreadQos(thread.native_handle(), level); } - - #elif KRAM_ANDROID -void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) -{ -/* TODO: finish priority remap first - struct sched_param param = { priority }; - - // Win has 0 to 15 normal, then 16-31 real time priority - int val = pthread_setschedprio(handle, priority); - if (val != 0) - KLOGW("Thread", "Failed to set priority %d", priority); -*/ -} - - -static uint8_t convertQosToPriority(ThreadQos level) +static void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) { - // TODO: fix these priorities. Linux had 20 to -20 as priorities - // but unclear what Android wants set from the docs. 
- uint8_t priority = 30; - switch(level) { - case ThreadQos::Interactive: priority = 45; break; - case ThreadQos::High: priority = 41; break; - case ThreadQos::Default: priority = 31; break; - case ThreadQos::Medium: priority = 20; break; - case ThreadQos::Low: priority = 10; break; + // Android on -20 to 20, where lower is higher priority + int prioritySys = 0; + switch(priority) { + case ThreadPriority::Default: prioritySys = 0; break; // NORMAL + case ThreadPriority::High: prioritySys = -4; break; // ABOVE NORMAL + case ThreadPriority::Interactive: prioritySys = -8; break; // HIGHEST } - return priority; -} - -void task_system::set_priority(std::thread& thread, uint8_t priority) -{ - setThreadPriority(thread.native_handle(), priority); -} - -void task_system::set_current_priority(uint8_t priority) -{ - setThreadPriority(getCurrentThread(), priority); -} - - -void task_system::set_current_qos(ThreadQos level) -{ - uint8_t priority = convertQosToPriority(level); - set_current_priority(priority); -} - -void task_system::set_qos(std::thread& thread, ThreadQos level) -{ - uint8_t priority = convertQosToPriority(level); - set_priority(thread, priority); + + int val = setpriority(PRIO_PROCESS, 0, prioritySys); + if (val != 0) + KLOGW("Thread", "Failed to set priority %d", prioritySys); } #elif KRAM_WIN -static uint8_t convertQosToPriority(ThreadQos level) +static void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) { - // TODO: fix these priorities. Linux had 20 to -20 as priorities - // but unclear what Android wants set from the docs. 
- uint8_t priority = 30; - switch(level) { - case ThreadQos::Interactive: priority = 45; break; - case ThreadQos::High: priority = 41; break; - case ThreadQos::Default: priority = 31; break; - case ThreadQos::Medium: priority = 20; break; - case ThreadQos::Low: priority = 10; break; + // Win has 0 to 15 normal, then 16-31 real time priority + int prioritySys = 0; + switch(priority) { + case ThreadPriority::Default: prioritySys = 0; break; // NORMAL + case ThreadPriority::High: prioritySys = 1; break; // ABOVE NORMAL + case ThreadPriority::Interactive: prioritySys = 2; break; // HIGHEST } - return priority; -} - -void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) -{ -/* TODO: finish priority remap first - - BOOL success = SetThreadPriority(handle, priority); + + BOOL success = SetThreadPriority(handle, prioritySys); if (!success) - LOGW("Thread", "Failed to set priority %d", priority); -*/ + LOGW("Thread", "Failed to set priority %d", prioritySys); } -void task_system::set_priority(std::thread& thread, uint8_t priority) -{ - setThreadPriority(thread.native_handle(), priority); -} +#endif -void task_system::set_current_priority(uint8_t priority) +void task_system::set_current_priority(ThreadPriority priority) { + // Most systems can set priority from another thread, but Android can't setThreadPriority(getCurrentThread(), priority); } -void task_system::set_current_qos(ThreadQos level) -{ - uint8_t priority = convertQosToPriority(level); - set_current_priority(priority); -} - -void task_system::set_qos(std::thread& thread, ThreadQos level) -{ - uint8_t priority = convertQosToPriority(level); - set_priority(thread, priority); -} - -#endif -#endif - #if SUPPORT_AFFINITY static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t threadIndex) @@ -545,7 +477,7 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t macroUnusedVar(affinityMask); #if KRAM_MAC - // don't use this, it's unsupported 
on ARM chips, and only an affinity hints on x64 + // don't use this, it's unsupported on ARM chips, and only affinity hints on x64 // #if KRAM_SSE // if (!coreInfo.isTranslated) { // thread_affinity_policy_data_t policy = { (int)affinityMask }; @@ -658,20 +590,17 @@ void task_system::run(int32_t threadIndex) } struct ThreadInfo { - const char* name; - int policy; - int priority; - int affinity; // single core for now + const char* name = ""; + ThreadPriority priority = ThreadPriority::Default; + int affinity = 0; // single core for now }; // This only works for current thread, but simplifies setting several thread params. void setThreadInfo(ThreadInfo& info) { setCurrentThreadName(info.name); - #if SUPPORT_PRIORITY setThreadPriority(getCurrentThread(), info.priority); - #endif - + #if SUPPORT_AFFINITY setThreadAffinity(getCurrentThread(), info.affinity); #endif @@ -685,7 +614,7 @@ task_system::task_system(int32_t count) : // see WWDC 2021 presentation here // Tune CPU job scheduling for Apple silicon games // https://developer.apple.com/videos/play/tech-talks/110147/ - ThreadInfo infoMain = { "Main", 0, 45, 0 }; + ThreadInfo infoMain = { "Main", ThreadPriority::Interactive, 0 }; setThreadInfo(infoMain); // Note that running work on core0 when core0 may starve it @@ -701,7 +630,7 @@ task_system::task_system(int32_t count) : _threadNames.push_back(name); _threads.emplace_back([&, threadIndex, name] { - ThreadInfo infoTask = { name.c_str(), 0, 41, threadIndex }; + ThreadInfo infoTask = { name.c_str(), ThreadPriority::High, threadIndex }; setThreadInfo(infoTask); run(threadIndex); @@ -712,15 +641,29 @@ task_system::task_system(int32_t count) : log_threads(); } - -static void getThreadInfo(std::thread::native_handle_type handle, int& policy, int& priority) +// TEODO: rename to getThreadPriority +static void getThreadInfo(std::thread::native_handle_type handle, ThreadPriority& priority) { -#if KRAM_MAC || KRAM_IOS || KRAM_ANDROID +#if KRAM_MAC || KRAM_IOS + int policy = 
0; struct sched_param priorityVal; int val = pthread_getschedparam(handle, &policy, &priorityVal); if (val != 0) KLOGW("Thread", "failed to retrieve thread data"); - priority = priorityVal.sched_priority; + int prioritySys = priorityVal.sched_priority; + + // remap back to enum + switch(prioritySys) { + case 41: priority = ThreadPriority::High; break; + case 45: priority = ThreadPriority::Interactive; break; + default: priority = ThreadPriority::Default; break; + } + +#elif KRAM_ANDROID + // only have getpriority call on current thread + // pthread_getschedparam never returns valid data + priority = ThreadPriority::Default; // TODO: fix + #elif KRAM_WIN // all threads same policy on Win? // https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Windows%20never%20adjusts%20the%20priority,the%20process%20that%20created%20it. @@ -729,9 +672,17 @@ static void getThreadInfo(std::thread::native_handle_type handle, int& policy, i // DWORD priorityClass = GetPriorityClass(GetCurrentProcess()); // The handle must have the THREAD_QUERY_INFORMATION or THREAD_QUERY_LIMITED_INFORMATION access right. 
- priority = GetThreadPriority(handle); - if (priority == THREAD_PRIORITY_ERROR_RETURN) - priority = 0; + int prioritySys = GetThreadPriority(handle); + if (prioritySys == THREAD_PRIORITY_ERROR_RETURN) + prioritySys = 0; + + switch(prioritySys) { + case 1: priority = ThreadPriority::High; break; + case 2: priority = ThreadPriority::Interactive; break; + default: priority = ThreadPriority::Default; break; + } + + // TODO: remap back to enum #endif } @@ -744,23 +695,21 @@ void task_system::log_threads() info.affinity = 0; #endif - getThreadInfo(getCurrentThread(), info.policy, info.priority); - KLOGI("Thread", "Thread:%s (pol:%d pri:%d aff:%d)", - info.name, info.policy, info.priority, info.affinity); + getThreadInfo(getCurrentThread(), info.priority); + KLOGI("Thread", "Thread:%s (pri:%d aff:%d)", + info.name, info.priority, info.affinity); for (uint32_t i = 0; i < _threads.size(); ++i) { - info.policy = 0; - info.priority = 0; info.name = _threadNames[i].c_str(); #if SUPPORT_AFFINITY // TODO: if more tasks/threads than cores, then this isn't accurate // but don't want to write a getter for this right now. info.affinity = i; #endif - getThreadInfo(_threads[i].native_handle(), info.policy, info.priority); - KLOGI("Thread", "Thread:%s (pol:%d pri:%d aff:%d)", - info.name, info.policy, info.priority, info.affinity); + getThreadInfo(_threads[i].native_handle(), info.priority); + KLOGI("Thread", "Thread:%s (pri:%d aff:%d)", + info.name, info.priority, info.affinity); } } diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index fdc26515..ebd86e90 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -126,14 +126,13 @@ class notification_queue { // isn't ideal. It will force work onto the same cores. Especially if // limiting cores to say 4/16, then can run 4 processes faster w/o affinity. 
#define SUPPORT_AFFINITY (KRAM_ANDROID || KRAM_WIN) -#define SUPPORT_PRIORITY (KRAM_MAC || KRAM_IOS || KRAM_ANDROID || KRAM_WIN) // only for ioS/macOS -enum class ThreadQos +enum class ThreadPriority { - Low = 1, - Medium = 2, + //Low = 1, + //Medium = 2, Default = 3, High = 4, Interactive = 5, @@ -155,21 +154,10 @@ class task_system { void run(int32_t threadIndex); #if SUPPORT_AFFINITY - // affinity isn't really supported on Apple - void set_affinity(std::thread& thread, uint32_t threadIndex); static void set_current_affinity(uint32_t threadIndex); #endif -#if SUPPORT_PRIORITY - // these are Apple specific, due to lack of affinity control - // once priority set, can't use qos. Also Android can't control - // policy, only the priority in API 28. - void set_qos(std::thread& thread, ThreadQos level); - static void set_current_qos(ThreadQos level); - - void set_priority(std::thread& thread, uint8_t priority); - static void set_current_priority(uint8_t priority); -#endif + static void set_current_priority(ThreadPriority priority); void log_threads(); From bc562b7fc4e1d916974ce438ce32b8b056d5357d Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 25 Jun 2022 00:08:22 -0700 Subject: [PATCH 042/615] kram - fix win build --- libkram/kram/TaskSystem.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index b54caa26..b784d56b 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -434,7 +434,7 @@ static void setThreadPriority(std::thread::native_handle_type handle, uint8_t pr #elif KRAM_WIN -static void setThreadPriority(std::thread::native_handle_type handle, uint8_t priority) +static void setThreadPriority(std::thread::native_handle_type handle, ThreadPriority priority) { // Win has 0 to 15 normal, then 16-31 real time priority int prioritySys = 0; @@ -461,6 +461,8 @@ void task_system::set_current_priority(ThreadPriority priority) static void 
setThreadAffinity(std::thread::native_handle_type handle, uint32_t threadIndex) { + // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ + // const auto& coreInfo = GetCoreInfo(); uint32_t maxIndex = coreInfo.remapTable.size() - 1; @@ -528,11 +530,10 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t #endif } -void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) -{ - // https://eli.thegreenplace.net/2016/c11-threads-affinity-and-hyperthreading/ - setThreadAffinity(thread.native_handle(), threadIndex); -} +//void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) +//{ +// setThreadAffinity(thread.native_handle(), threadIndex); +//} void task_system::set_current_affinity(uint32_t threadIndex) { From 545b7e0c770fd7ecb7934c1609ebf3b67f396747 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 25 Jun 2022 00:15:04 -0700 Subject: [PATCH 043/615] kram - fix win build --- libkram/kram/TaskSystem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index b784d56b..36e21d41 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -446,7 +446,7 @@ static void setThreadPriority(std::thread::native_handle_type handle, ThreadPrio BOOL success = SetThreadPriority(handle, prioritySys); if (!success) - LOGW("Thread", "Failed to set priority %d", prioritySys); + KLOGW("Thread", "Failed to set priority %d", prioritySys); } #endif From 03fff0a1af5b8624bc11ee8e25ee2d0d85f6360c Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 25 Jun 2022 00:17:31 -0700 Subject: [PATCH 044/615] kram - set bkgd block to black (0,0,0) on png to try to fix Finder's white thumbnails This may not survive image optim, but fixup srgb also now sets the bkgd block so Finder stops generating white thumbnails. 
These make white content and icons impossible to view, and Apple blocks redefining the png thumbnailer with kramv's custom one. --- libkram/kram/Kram.cpp | 68 +++++++++++++++++++++++++++++++++------- libkram/kram/KramImage.h | 9 +++++- 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index 69351494..9880a010 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -701,6 +701,23 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray } } + // because Apple finder thumbnails can't be overridden with custom thumbanailer + // and defaults to white bkgd (making white icons impossible to see). + // track the bkgd block, and set/re-define as all black. Maybe will honor that. + bool hasBackground = false; + bool hasBlackBackground = false; + chunkData = lodepng_chunk_find_const(data, data + dataSize, "bKGD"); + if (chunkData) { + lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + if (state.info_png.background_defined) { + hasBackground = true; + hasBlackBackground = + state.info_png.background_r == 0 && // gray/pallete uses this only + state.info_png.background_g == 0 && + state.info_png.background_b == 0; + } + } + // don't convert png bit depths, but can convert pallete data // if (state.info_png.color.bitdepth != 8) { // return false; @@ -775,6 +792,7 @@ bool LoadPng(const uint8_t* data, size_t dataSize, bool isPremulRgb, bool isGray } sourceImage.setSrgbState(isSrgb, hasSrgbBlock, hasNonSrgbBlocks); + sourceImage.setBackgroundState(hasBlackBackground); return sourceImage.loadImageFromPixels(pixels, width, height, hasColor, hasAlpha); } @@ -792,16 +810,18 @@ bool SavePNG(Image& image, const char* filename) // Then if srgb, see if that matches content type srgb state below. 
TexContentType contentType = findContentTypeFromFilename(filename); bool isSrgb = contentType == TexContentTypeAlbedo; - + // Skip file if it has srgb block, and none of the other block types. // This code will also strip the sRGB block from apps like Figma that always set it. - if (isSrgb == image.isSrgb()) { - if (isSrgb == image.hasSrgbBlock() && !image.hasNonSrgbBlocks()) { - KLOGI("Kram", "skipping srgb correction"); - return true; + if (image.hasBlackBackground()) { + if (isSrgb == image.isSrgb()) { + if (isSrgb == image.hasSrgbBlock() && !image.hasNonSrgbBlocks()) { + KLOGI("Kram", "skipping srgb correction"); + return true; + } } } - + // This is the only block written or not lodepng::State state; if (isSrgb) { @@ -810,6 +830,16 @@ bool SavePNG(Image& image, const char* filename) state.info_png.srgb_intent = 0; } + // always redefine background to black, so Finder thumbnails are not white + // this makes viewing any white icons nearly impossible. Make suer lodepng + // ignores this background on import, want the stored pixels not ones composited. + // Note that _r is only used for grayscale/pallete, and these values are in same + // color depth as pixels. But 0 works for all bit-depths. + state.info_png.background_defined = true; + state.info_png.background_r = 0; + state.info_png.background_g = 0; + state.info_png.background_b = 0; + // TODO: could write other data into Txt block // or try to preserve those @@ -2060,9 +2090,22 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 } } - // TODO: also bkgd blocks. - - + // because Apple finder thumbnails can't be overridden with custom thumbanailer + // and defaults to white bkgd (making white icons impossible to see). + // track the bkgd block, and set/re-define as all black. Maybe will honor that. 
+ bool hasBackground = false; + bool hasBlackBackground = false; + chunkData = lodepng_chunk_find_const(data, data + dataSize, "bKGD"); + if (chunkData) { + lodepng_inspect_chunk(&state, chunkData - data, data, end-data); + if (state.info_png.background_defined) { + hasBackground = true; + hasBlackBackground = + state.info_png.background_r == 0 && // gray/pallete uses this only + state.info_png.background_g == 0 && + state.info_png.background_b == 0; + } + } string info; @@ -2117,7 +2160,8 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 "colr: %s\n" "alph: %s\n" "palt: %s\n" - "srgb: %s\n", + "srgb: %s\n" + "bkgd: %s\n", textureTypeName(MyMTLTextureType2D), width, height, width * height / (1000.0f * 1000.0f), @@ -2125,7 +2169,9 @@ string kramInfoPNGToString(const string& srcFilename, const uint8_t* data, uint6 hasColor ? "y" : "n", hasAlpha ? "y" : "n", hasPalette ? "y" : "n", - isSrgb ? "y" : "n"); + isSrgb ? "y" : "n", + hasBackground ? "y" : "n" + ); info += tmp; // optional block with ppi diff --git a/libkram/kram/KramImage.h b/libkram/kram/KramImage.h index eda911a1..f450bd39 100644 --- a/libkram/kram/KramImage.h +++ b/libkram/kram/KramImage.h @@ -45,7 +45,8 @@ class Image { // set state off png blocks void setSrgbState(bool isSrgb, bool hasSrgbBlock, bool hasNonSrgbBlocks); - + void setBackgroundState(bool hasBlackBackground) { _hasBlackBackground = hasBlackBackground; } + // convert mip level of explicit format to single-image bool loadImageFromKTX(const KTXImage& image, uint32_t mipNumber = 0); @@ -72,6 +73,8 @@ class Image { bool hasSrgbBlock() const { return _hasSrgbBlock; } bool hasNonSrgbBlocks() const { return _hasNonSrgbBlocks; } + bool hasBlackBackground() const { return _hasBlackBackground; } + // if converted a KTX/2 image to Image, then this field will be non-zero uint32_t chunksY() const { return _chunksY; } void setChunksY(uint32_t chunksY) { _chunksY = chunksY; } @@ -93,10 +96,14 @@ class Image { bool 
_hasColor = true; bool _hasAlpha = true; + // track to fix incorrect sRGB state from Figma/Photoshop on PNG files bool _isSrgb = false; bool _hasNonSrgbBlocks = false; bool _hasSrgbBlock = false; + // track to fix Apple Finder previews that are always white background + bool _hasBlackBackground = false; + // this is the entire strip data, float version can be passed for HDR // sources always 4 channels RGBA for 8 and 32f data. 16f promoted to 32f. vector _pixels; From 4c9cd92a93a577128ad1e5bb620ef65b9aebfd69 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 25 Jun 2022 12:07:14 -0700 Subject: [PATCH 045/615] kram - more cleanup on tasks --- libkram/kram/TaskSystem.cpp | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 36e21d41..f709b22b 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -530,11 +530,6 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t #endif } -//void task_system::set_affinity(std::thread& thread, uint32_t threadIndex) -//{ -// setThreadAffinity(thread.native_handle(), threadIndex); -//} - void task_system::set_current_affinity(uint32_t threadIndex) { setThreadAffinity(getCurrentThread(), threadIndex); @@ -642,10 +637,14 @@ task_system::task_system(int32_t count) : log_threads(); } -// TEODO: rename to getThreadPriority -static void getThreadInfo(std::thread::native_handle_type handle, ThreadPriority& priority) +static ThreadPriority getThreadPriority(std::thread::native_handle_type handle) { -#if KRAM_MAC || KRAM_IOS + ThreadPriority priority = ThreadPriority::Default; + +#if KRAM_MAC || KRAM_IOS || KRAM_ANDROID + // Note: this doesn't handle qOS, and returns default priority + // on those threads. 
+ int policy = 0; struct sched_param priorityVal; int val = pthread_getschedparam(handle, &policy, &priorityVal); @@ -660,11 +659,19 @@ static void getThreadInfo(std::thread::native_handle_type handle, ThreadPriority default: priority = ThreadPriority::Default; break; } +/* Using code above since it may work with other threads #elif KRAM_ANDROID + // Note: only for current thread + // only have getpriority call on current thread // pthread_getschedparam never returns valid data - priority = ThreadPriority::Default; // TODO: fix - + int priority = getpriority(PRIO_PROCESS, 0); + switch(prioritySys) { + case 41: priority = ThreadPriority::High; break; + case 45: priority = ThreadPriority::Interactive; break; + default: priority = ThreadPriority::Default; break; + } +*/ #elif KRAM_WIN // all threads same policy on Win? // https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Windows%20never%20adjusts%20the%20priority,the%20process%20that%20created%20it. @@ -682,9 +689,9 @@ static void getThreadInfo(std::thread::native_handle_type handle, ThreadPriority case 2: priority = ThreadPriority::Interactive; break; default: priority = ThreadPriority::Default; break; } - - // TODO: remap back to enum #endif + + return priority; } @@ -696,7 +703,7 @@ void task_system::log_threads() info.affinity = 0; #endif - getThreadInfo(getCurrentThread(), info.priority); + info.priority = getThreadPriority(getCurrentThread()); KLOGI("Thread", "Thread:%s (pri:%d aff:%d)", info.name, info.priority, info.affinity); @@ -708,7 +715,7 @@ void task_system::log_threads() // but don't want to write a getter for this right now. 
info.affinity = i; #endif - getThreadInfo(_threads[i].native_handle(), info.priority); + info.priority = getThreadPriority(_threads[i].native_handle()); KLOGI("Thread", "Thread:%s (pri:%d aff:%d)", info.name, info.priority, info.affinity); } From cc9a579de976c964723af4ed1bd62c20f5caf3c5 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Fri, 15 Jul 2022 09:14:21 -0700 Subject: [PATCH 046/615] kram - update bc7enc to bc7enc_rdo This is the maintained codebase. Still has bug with all alpha = 255 mapped to 254. Will put in patch for that next. https://github.com/richgel999/bc7enc/issues/3 --- build2/kram.xcodeproj/project.pbxproj | 134 +- libkram/bc7enc/LICENSE | 12 +- libkram/bc7enc/README.md | 237 +- libkram/bc7enc/bc7decomp.cpp | 450 ++- libkram/bc7enc/bc7decomp.h | 17 +- libkram/bc7enc/bc7decomp_ref.cpp | 431 +++ libkram/bc7enc/bc7enc.cpp | 1231 +++++--- libkram/bc7enc/bc7enc.h | 103 +- libkram/bc7enc/ert.cpp | 705 +++++ libkram/bc7enc/ert.h | 81 + libkram/bc7enc/rdo_bc_encoder.cpp | 1270 ++++++++ libkram/bc7enc/rdo_bc_encoder.h | 269 ++ libkram/bc7enc/rgbcx.cpp | 3083 +++++++++++++++++++ libkram/bc7enc/rgbcx.h | 4040 +------------------------ libkram/bc7enc/rgbcx_table4_small.h | 969 ++++++ libkram/bc7enc/utils.cpp | 908 ++++++ libkram/bc7enc/utils.h | 2617 ++++++++++++++++ libkram/kram/KramImage.cpp | 10 +- 18 files changed, 11931 insertions(+), 4636 deletions(-) create mode 100644 libkram/bc7enc/bc7decomp_ref.cpp create mode 100644 libkram/bc7enc/ert.cpp create mode 100644 libkram/bc7enc/ert.h create mode 100644 libkram/bc7enc/rdo_bc_encoder.cpp create mode 100644 libkram/bc7enc/rdo_bc_encoder.h create mode 100644 libkram/bc7enc/rgbcx.cpp create mode 100644 libkram/bc7enc/rgbcx_table4_small.h create mode 100644 libkram/bc7enc/utils.cpp create mode 100644 libkram/bc7enc/utils.h diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index e57e4e91..4e3a0c5a 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ 
b/build2/kram.xcodeproj/project.pbxproj @@ -19,8 +19,6 @@ 706EEF8926D1595D001C950E /* EtcBlock4x4Encoding_ETC1.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDBF26D1583E001C950E /* EtcBlock4x4Encoding_ETC1.cpp */; }; 706EEF8A26D1595D001C950E /* EtcBlock4x4Encoding.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDC526D1583E001C950E /* EtcBlock4x4Encoding.cpp */; }; 706EEF8B26D1595D001C950E /* EtcBlock4x4.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDC626D1583E001C950E /* EtcBlock4x4.cpp */; }; - 706EEF8C26D1595D001C950E /* bc7decomp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDCE26D1583E001C950E /* bc7decomp.cpp */; }; - 706EEF8D26D1595D001C950E /* bc7enc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDD026D1583E001C950E /* bc7enc.cpp */; }; 706EEFA726D1595D001C950E /* basisu_transcoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE0426D1583F001C950E /* basisu_transcoder.cpp */; }; 706EEFA826D1595D001C950E /* miniz.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1126D1583F001C950E /* miniz.cpp */; }; 706EEFA926D1595D001C950E /* hedistance.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1426D1583F001C950E /* hedistance.cpp */; }; @@ -68,10 +66,6 @@ 706EEFE026D15984001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC426D1583E001C950E /* EtcBlock4x4Encoding_RGBA8.h */; }; 706EEFE126D15984001C950E /* EtcColorFloatRGBA.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC726D1583E001C950E /* EtcColorFloatRGBA.h */; }; 706EEFE226D15984001C950E /* EtcBlock4x4Encoding.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC826D1583E001C950E /* EtcBlock4x4Encoding.h */; }; - 706EEFE326D15984001C950E /* rgbcx.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDCB26D1583E001C950E /* rgbcx.h */; }; - 706EEFE426D15984001C950E /* bc7enc.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDCC26D1583E001C950E /* bc7enc.h */; }; - 706EEFE526D15984001C950E 
/* bc7decomp.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDCD26D1583E001C950E /* bc7decomp.h */; }; - 706EEFE626D15984001C950E /* rgbcx_table4.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDD126D1583E001C950E /* rgbcx_table4.h */; }; 706EEFF226D15984001C950E /* ateencoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFA26D1583E001C950E /* ateencoder.h */; }; 706EEFF326D15984001C950E /* basisu_transcoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFC26D1583E001C950E /* basisu_transcoder.h */; }; 706EEFF426D15984001C950E /* basisu_containers.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFD26D1583E001C950E /* basisu_containers.h */; }; @@ -134,10 +128,6 @@ 706EF15A26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC426D1583E001C950E /* EtcBlock4x4Encoding_RGBA8.h */; }; 706EF15B26D166C5001C950E /* EtcColorFloatRGBA.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC726D1583E001C950E /* EtcColorFloatRGBA.h */; }; 706EF15C26D166C5001C950E /* EtcBlock4x4Encoding.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDC826D1583E001C950E /* EtcBlock4x4Encoding.h */; }; - 706EF15D26D166C5001C950E /* rgbcx.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDCB26D1583E001C950E /* rgbcx.h */; }; - 706EF15E26D166C5001C950E /* bc7enc.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDCC26D1583E001C950E /* bc7enc.h */; }; - 706EF15F26D166C5001C950E /* bc7decomp.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDCD26D1583E001C950E /* bc7decomp.h */; }; - 706EF16026D166C5001C950E /* rgbcx_table4.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDD126D1583E001C950E /* rgbcx_table4.h */; }; 706EF16C26D166C5001C950E /* ateencoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFA26D1583E001C950E /* ateencoder.h */; }; 706EF16D26D166C5001C950E /* basisu_transcoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFC26D1583E001C950E /* basisu_transcoder.h */; }; 
706EF16E26D166C5001C950E /* basisu_containers.h in Headers */ = {isa = PBXBuildFile; fileRef = 706EEDFD26D1583E001C950E /* basisu_containers.h */; }; @@ -193,8 +183,6 @@ 706EF1A126D166C5001C950E /* EtcBlock4x4Encoding_ETC1.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDBF26D1583E001C950E /* EtcBlock4x4Encoding_ETC1.cpp */; }; 706EF1A226D166C5001C950E /* EtcBlock4x4Encoding.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDC526D1583E001C950E /* EtcBlock4x4Encoding.cpp */; }; 706EF1A326D166C5001C950E /* EtcBlock4x4.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDC626D1583E001C950E /* EtcBlock4x4.cpp */; }; - 706EF1A426D166C5001C950E /* bc7decomp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDCE26D1583E001C950E /* bc7decomp.cpp */; }; - 706EF1A526D166C5001C950E /* bc7enc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDD026D1583E001C950E /* bc7enc.cpp */; }; 706EF1BF26D166C5001C950E /* basisu_transcoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE0426D1583F001C950E /* basisu_transcoder.cpp */; }; 706EF1C026D166C5001C950E /* miniz.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1126D1583F001C950E /* miniz.cpp */; }; 706EF1C126D166C5001C950E /* hedistance.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEE1426D1583F001C950E /* hedistance.cpp */; }; @@ -245,6 +233,36 @@ 706EFF8426D34740001C950E /* red_black_tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5F26D3473F001C950E /* red_black_tree.cpp */; }; 706EFF8526D34740001C950E /* fixed_pool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD6026D3473F001C950E /* fixed_pool.cpp */; }; 706EFF8626D34740001C950E /* fixed_pool.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD6026D3473F001C950E /* fixed_pool.cpp */; }; + 707789D52881BA81008A51BC /* bc7enc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789C62881BA81008A51BC /* bc7enc.cpp */; }; + 707789D62881BA81008A51BC /* bc7enc.cpp in Sources */ = {isa = PBXBuildFile; 
fileRef = 707789C62881BA81008A51BC /* bc7enc.cpp */; }; + 707789D72881BA81008A51BC /* bc7enc.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C72881BA81008A51BC /* bc7enc.h */; }; + 707789D82881BA81008A51BC /* bc7enc.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C72881BA81008A51BC /* bc7enc.h */; }; + 707789D92881BA81008A51BC /* bc7decomp.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C82881BA81008A51BC /* bc7decomp.h */; }; + 707789DA2881BA81008A51BC /* bc7decomp.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C82881BA81008A51BC /* bc7decomp.h */; }; + 707789DB2881BA81008A51BC /* ert.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C92881BA81008A51BC /* ert.h */; }; + 707789DC2881BA81008A51BC /* ert.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789C92881BA81008A51BC /* ert.h */; }; + 707789DD2881BA81008A51BC /* rgbcx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CA2881BA81008A51BC /* rgbcx.cpp */; }; + 707789DE2881BA81008A51BC /* rgbcx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CA2881BA81008A51BC /* rgbcx.cpp */; }; + 707789DF2881BA81008A51BC /* rgbcx_table4.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CB2881BA81008A51BC /* rgbcx_table4.h */; }; + 707789E02881BA81008A51BC /* rgbcx_table4.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CB2881BA81008A51BC /* rgbcx_table4.h */; }; + 707789E12881BA81008A51BC /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CC2881BA81008A51BC /* utils.cpp */; }; + 707789E22881BA81008A51BC /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CC2881BA81008A51BC /* utils.cpp */; }; + 707789E32881BA81008A51BC /* rgbcx_table4_small.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CD2881BA81008A51BC /* rgbcx_table4_small.h */; }; + 707789E42881BA81008A51BC /* rgbcx_table4_small.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CD2881BA81008A51BC /* rgbcx_table4_small.h */; }; + 707789E52881BA81008A51BC /* ert.cpp in 
Sources */ = {isa = PBXBuildFile; fileRef = 707789CE2881BA81008A51BC /* ert.cpp */; }; + 707789E62881BA81008A51BC /* ert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789CE2881BA81008A51BC /* ert.cpp */; }; + 707789E72881BA81008A51BC /* rgbcx.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CF2881BA81008A51BC /* rgbcx.h */; }; + 707789E82881BA81008A51BC /* rgbcx.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789CF2881BA81008A51BC /* rgbcx.h */; }; + 707789E92881BA81008A51BC /* bc7decomp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D02881BA81008A51BC /* bc7decomp.cpp */; }; + 707789EA2881BA81008A51BC /* bc7decomp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D02881BA81008A51BC /* bc7decomp.cpp */; }; + 707789EB2881BA81008A51BC /* utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789D22881BA81008A51BC /* utils.h */; }; + 707789EC2881BA81008A51BC /* utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789D22881BA81008A51BC /* utils.h */; }; + 707789ED2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D32881BA81008A51BC /* bc7decomp_ref.cpp */; }; + 707789EE2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789D32881BA81008A51BC /* bc7decomp_ref.cpp */; }; + 707789F12881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */; }; + 707789F22881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */; }; + 707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */; }; + 707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */; }; 70871DC927DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 
70871DA727DDDBCC00D0B9E1 /* astcenc_vecmathlib_common_4.h */; }; 70871DCA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_common_4.h in Headers */ = {isa = PBXBuildFile; fileRef = 70871DA727DDDBCC00D0B9E1 /* astcenc_vecmathlib_common_4.h */; }; 70871DCB27DDDBCD00D0B9E1 /* astcenc_image.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70871DA827DDDBCC00D0B9E1 /* astcenc_image.cpp */; }; @@ -365,14 +383,6 @@ 706EEDC626D1583E001C950E /* EtcBlock4x4.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EtcBlock4x4.cpp; sourceTree = ""; }; 706EEDC726D1583E001C950E /* EtcColorFloatRGBA.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = EtcColorFloatRGBA.h; sourceTree = ""; }; 706EEDC826D1583E001C950E /* EtcBlock4x4Encoding.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = EtcBlock4x4Encoding.h; sourceTree = ""; }; - 706EEDCA26D1583E001C950E /* LICENSE */ = {isa = PBXFileReference; lastKnownFileType = text; path = LICENSE; sourceTree = ""; }; - 706EEDCB26D1583E001C950E /* rgbcx.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = rgbcx.h; sourceTree = ""; }; - 706EEDCC26D1583E001C950E /* bc7enc.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bc7enc.h; sourceTree = ""; }; - 706EEDCD26D1583E001C950E /* bc7decomp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bc7decomp.h; sourceTree = ""; }; - 706EEDCE26D1583E001C950E /* bc7decomp.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = bc7decomp.cpp; sourceTree = ""; }; - 706EEDCF26D1583E001C950E /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; - 706EEDD026D1583E001C950E /* bc7enc.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = bc7enc.cpp; sourceTree = ""; }; - 706EEDD126D1583E001C950E /* rgbcx_table4.h */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.c.h; path = rgbcx_table4.h; sourceTree = ""; }; 706EEDF926D1583E001C950E /* ateencoder.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = ateencoder.mm; sourceTree = ""; }; 706EEDFA26D1583E001C950E /* ateencoder.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ateencoder.h; sourceTree = ""; }; 706EEDFC26D1583E001C950E /* basisu_transcoder.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = basisu_transcoder.h; sourceTree = ""; }; @@ -598,6 +608,23 @@ 706EFD5E26D3473F001C950E /* hashtable.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hashtable.cpp; sourceTree = ""; }; 706EFD5F26D3473F001C950E /* red_black_tree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = red_black_tree.cpp; sourceTree = ""; }; 706EFD6026D3473F001C950E /* fixed_pool.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fixed_pool.cpp; sourceTree = ""; }; + 707789C62881BA81008A51BC /* bc7enc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bc7enc.cpp; sourceTree = ""; }; + 707789C72881BA81008A51BC /* bc7enc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc7enc.h; sourceTree = ""; }; + 707789C82881BA81008A51BC /* bc7decomp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc7decomp.h; sourceTree = ""; }; + 707789C92881BA81008A51BC /* ert.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ert.h; sourceTree = ""; }; + 707789CA2881BA81008A51BC /* rgbcx.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rgbcx.cpp; sourceTree = ""; }; + 707789CB2881BA81008A51BC /* rgbcx_table4.h */ = {isa = PBXFileReference; fileEncoding = 
4; lastKnownFileType = sourcecode.c.h; path = rgbcx_table4.h; sourceTree = ""; }; + 707789CC2881BA81008A51BC /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = utils.cpp; sourceTree = ""; }; + 707789CD2881BA81008A51BC /* rgbcx_table4_small.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rgbcx_table4_small.h; sourceTree = ""; }; + 707789CE2881BA81008A51BC /* ert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ert.cpp; sourceTree = ""; }; + 707789CF2881BA81008A51BC /* rgbcx.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rgbcx.h; sourceTree = ""; }; + 707789D02881BA81008A51BC /* bc7decomp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bc7decomp.cpp; sourceTree = ""; }; + 707789D12881BA81008A51BC /* README.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; + 707789D22881BA81008A51BC /* utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = utils.h; sourceTree = ""; }; + 707789D32881BA81008A51BC /* bc7decomp_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bc7decomp_ref.cpp; sourceTree = ""; }; + 707789D42881BA81008A51BC /* LICENSE */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = LICENSE; sourceTree = ""; }; + 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rdo_bc_encoder.cpp; sourceTree = ""; }; + 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rdo_bc_encoder.h; sourceTree = ""; }; 70871DA727DDDBCC00D0B9E1 /* 
astcenc_vecmathlib_common_4.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = astcenc_vecmathlib_common_4.h; sourceTree = ""; }; 70871DA827DDDBCC00D0B9E1 /* astcenc_image.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_image.cpp; sourceTree = ""; }; 70871DA927DDDBCC00D0B9E1 /* astcenc_find_best_partitioning.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = astcenc_find_best_partitioning.cpp; sourceTree = ""; }; @@ -744,14 +771,23 @@ 706EEDC926D1583E001C950E /* bc7enc */ = { isa = PBXGroup; children = ( - 706EEDCA26D1583E001C950E /* LICENSE */, - 706EEDCB26D1583E001C950E /* rgbcx.h */, - 706EEDCC26D1583E001C950E /* bc7enc.h */, - 706EEDCD26D1583E001C950E /* bc7decomp.h */, - 706EEDCE26D1583E001C950E /* bc7decomp.cpp */, - 706EEDCF26D1583E001C950E /* README.md */, - 706EEDD026D1583E001C950E /* bc7enc.cpp */, - 706EEDD126D1583E001C950E /* rgbcx_table4.h */, + 707789D32881BA81008A51BC /* bc7decomp_ref.cpp */, + 707789D02881BA81008A51BC /* bc7decomp.cpp */, + 707789C82881BA81008A51BC /* bc7decomp.h */, + 707789C62881BA81008A51BC /* bc7enc.cpp */, + 707789C72881BA81008A51BC /* bc7enc.h */, + 707789CE2881BA81008A51BC /* ert.cpp */, + 707789C92881BA81008A51BC /* ert.h */, + 707789D42881BA81008A51BC /* LICENSE */, + 707789D12881BA81008A51BC /* README.md */, + 707789EF2881BCE2008A51BC /* rdo_bc_encoder.cpp */, + 707789F02881BCE2008A51BC /* rdo_bc_encoder.h */, + 707789CD2881BA81008A51BC /* rgbcx_table4_small.h */, + 707789CB2881BA81008A51BC /* rgbcx_table4.h */, + 707789CA2881BA81008A51BC /* rgbcx.cpp */, + 707789CF2881BA81008A51BC /* rgbcx.h */, + 707789CC2881BA81008A51BC /* utils.cpp */, + 707789D22881BA81008A51BC /* utils.h */, ); path = bc7enc; sourceTree = ""; @@ -1205,7 +1241,9 @@ 706EEFDA26D15984001C950E /* EtcBlock4x4EncodingBits.h in Headers */, 706EEFDB26D15984001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */, 
706EEFDC26D15984001C950E /* EtcBlock4x4.h in Headers */, + 707789E72881BA81008A51BC /* rgbcx.h in Headers */, 706EEFDD26D15984001C950E /* Etc.h in Headers */, + 707789D72881BA81008A51BC /* bc7enc.h in Headers */, 706EEFDE26D15984001C950E /* EtcImage.h in Headers */, 70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */, 708A6A9C2708CE4700BA5410 /* bc6h_encode.h in Headers */, @@ -1213,10 +1251,6 @@ 706EEFE026D15984001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */, 706EEFE126D15984001C950E /* EtcColorFloatRGBA.h in Headers */, 706EEFE226D15984001C950E /* EtcBlock4x4Encoding.h in Headers */, - 706EEFE326D15984001C950E /* rgbcx.h in Headers */, - 706EEFE426D15984001C950E /* bc7enc.h in Headers */, - 706EEFE526D15984001C950E /* bc7decomp.h in Headers */, - 706EEFE626D15984001C950E /* rgbcx_table4.h in Headers */, 706EEFF226D15984001C950E /* ateencoder.h in Headers */, 706EEFF326D15984001C950E /* basisu_transcoder.h in Headers */, 70A7BD3227092A1200DBCCF7 /* hdr_encode.h in Headers */, @@ -1224,12 +1258,14 @@ 706EEFF426D15984001C950E /* basisu_containers.h in Headers */, 70871DD527DDDBCD00D0B9E1 /* astcenc.h in Headers */, 706EEFF526D15985001C950E /* basisu_containers_impl.h in Headers */, + 707789EB2881BA81008A51BC /* utils.h in Headers */, 706EEFF626D15985001C950E /* basisu_transcoder_internal.h in Headers */, 70871DF927DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */, 70871DFB27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */, 706EEFF726D15985001C950E /* basisu_global_selector_cb.h in Headers */, 706EEFF826D15985001C950E /* basisu_transcoder_uastc.h in Headers */, 706EEFF926D15985001C950E /* basisu_global_selector_palette.h in Headers */, + 707789E32881BA81008A51BC /* rgbcx_table4_small.h in Headers */, 706EEFFA26D15985001C950E /* basisu.h in Headers */, 706EEFFB26D15985001C950E /* basisu_file_headers.h in Headers */, 706EEFFC26D15985001C950E /* miniz.h in Headers */, @@ -1246,10 +1282,13 @@ 706EF00626D15985001C950E /* KramImage.h in 
Headers */, 706EF00726D15985001C950E /* win_mmap.h in Headers */, 70871DDD27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, + 707789D92881BA81008A51BC /* bc7decomp.h in Headers */, 706EF00826D15985001C950E /* Kram.h in Headers */, 70871DED27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */, + 707789DB2881BA81008A51BC /* ert.h in Headers */, 706EF00926D15985001C950E /* KTXImage.h in Headers */, 706EF00A26D15985001C950E /* KramImageInfo.h in Headers */, + 707789DF2881BA81008A51BC /* rgbcx_table4.h in Headers */, 70871DF727DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */, 706EF00B26D15985001C950E /* KramTimer.h in Headers */, 706EF00C26D15985001C950E /* KramMmapHelper.h in Headers */, @@ -1265,6 +1304,7 @@ 708A6A982708CE4700BA5410 /* bc6h_decode.h in Headers */, 706EF01526D15985001C950E /* singlecolourfit.h in Headers */, 706EF01626D15985001C950E /* maths.h in Headers */, + 707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */, 706EF01726D15985001C950E /* colourset.h in Headers */, 708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */, 706EF01826D15985001C950E /* colourblock.h in Headers */, @@ -1293,7 +1333,9 @@ 706EF15426D166C5001C950E /* EtcBlock4x4EncodingBits.h in Headers */, 706EF15526D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */, 706EF15626D166C5001C950E /* EtcBlock4x4.h in Headers */, + 707789E82881BA81008A51BC /* rgbcx.h in Headers */, 706EF15726D166C5001C950E /* Etc.h in Headers */, + 707789D82881BA81008A51BC /* bc7enc.h in Headers */, 706EF15826D166C5001C950E /* EtcImage.h in Headers */, 70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */, 708A6A9D2708CE4700BA5410 /* bc6h_encode.h in Headers */, @@ -1301,10 +1343,6 @@ 706EF15A26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */, 706EF15B26D166C5001C950E /* EtcColorFloatRGBA.h in Headers */, 706EF15C26D166C5001C950E /* EtcBlock4x4Encoding.h in Headers */, - 706EF15D26D166C5001C950E /* rgbcx.h in Headers */, - 706EF15E26D166C5001C950E /* 
bc7enc.h in Headers */, - 706EF15F26D166C5001C950E /* bc7decomp.h in Headers */, - 706EF16026D166C5001C950E /* rgbcx_table4.h in Headers */, 706EF16C26D166C5001C950E /* ateencoder.h in Headers */, 706EF16D26D166C5001C950E /* basisu_transcoder.h in Headers */, 70A7BD3327092A1200DBCCF7 /* hdr_encode.h in Headers */, @@ -1312,12 +1350,14 @@ 706EF16E26D166C5001C950E /* basisu_containers.h in Headers */, 70871DD627DDDBCD00D0B9E1 /* astcenc.h in Headers */, 706EF16F26D166C5001C950E /* basisu_containers_impl.h in Headers */, + 707789EC2881BA81008A51BC /* utils.h in Headers */, 706EF17026D166C5001C950E /* basisu_transcoder_internal.h in Headers */, 70871DFA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */, 70871DFC27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */, 706EF17126D166C5001C950E /* basisu_global_selector_cb.h in Headers */, 706EF17226D166C5001C950E /* basisu_transcoder_uastc.h in Headers */, 706EF17326D166C5001C950E /* basisu_global_selector_palette.h in Headers */, + 707789E42881BA81008A51BC /* rgbcx_table4_small.h in Headers */, 706EF17426D166C5001C950E /* basisu.h in Headers */, 706EF17526D166C5001C950E /* basisu_file_headers.h in Headers */, 706EF17626D166C5001C950E /* miniz.h in Headers */, @@ -1334,10 +1374,13 @@ 706EF18026D166C5001C950E /* KramImage.h in Headers */, 706EF18126D166C5001C950E /* win_mmap.h in Headers */, 70871DDE27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, + 707789DA2881BA81008A51BC /* bc7decomp.h in Headers */, 706EF18226D166C5001C950E /* Kram.h in Headers */, 70871DEE27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */, + 707789DC2881BA81008A51BC /* ert.h in Headers */, 706EF18326D166C5001C950E /* KTXImage.h in Headers */, 706EF18426D166C5001C950E /* KramImageInfo.h in Headers */, + 707789E02881BA81008A51BC /* rgbcx_table4.h in Headers */, 70871DF827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */, 706EF18526D166C5001C950E /* KramTimer.h in Headers */, 706EF18626D166C5001C950E /* 
KramMmapHelper.h in Headers */, @@ -1353,6 +1396,7 @@ 708A6A992708CE4700BA5410 /* bc6h_decode.h in Headers */, 706EF18F26D166C5001C950E /* singlecolourfit.h in Headers */, 706EF19026D166C5001C950E /* maths.h in Headers */, + 707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */, 706EF19126D166C5001C950E /* colourset.h in Headers */, 708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */, 706EF19226D166C5001C950E /* colourblock.h in Headers */, @@ -1441,10 +1485,12 @@ buildActionMask = 2147483647; files = ( 70871DD727DDDBCD00D0B9E1 /* astcenc_quantization.cpp in Sources */, + 707789E52881BA81008A51BC /* ert.cpp in Sources */, 70871E0327DDDBCD00D0B9E1 /* astcenc_color_unquantize.cpp in Sources */, 70871DD127DDDBCD00D0B9E1 /* astcenc_averages_and_directions.cpp in Sources */, 70871DDF27DDDBCD00D0B9E1 /* astcenc_mathlib_softfloat.cpp in Sources */, 706EF26426D17DCC001C950E /* ateencoder.mm in Sources */, + 707789ED2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */, 706EEF7F26D1595D001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */, 70871DCD27DDDBCD00D0B9E1 /* astcenc_find_best_partitioning.cpp in Sources */, 70CDB65227A1382700A546C1 /* KramDDSHelper.cpp in Sources */, @@ -1456,15 +1502,15 @@ 706EEF8426D1595D001C950E /* EtcBlock4x4Encoding_RG11.cpp in Sources */, 706EEF8526D1595D001C950E /* EtcBlock4x4Encoding_RGB8A1.cpp in Sources */, 706EEF8726D1595D001C950E /* EtcIndividualTrys.cpp in Sources */, + 707789DD2881BA81008A51BC /* rgbcx.cpp in Sources */, 706EEF8826D1595D001C950E /* EtcBlock4x4Encoding_R11.cpp in Sources */, + 707789F12881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */, 70871DF527DDDBCD00D0B9E1 /* astcenc_color_quantize.cpp in Sources */, 706EEF8926D1595D001C950E /* EtcBlock4x4Encoding_ETC1.cpp in Sources */, 706EEF8A26D1595D001C950E /* EtcBlock4x4Encoding.cpp in Sources */, 706EEF8B26D1595D001C950E /* EtcBlock4x4.cpp in Sources */, 70871DDB27DDDBCD00D0B9E1 /* astcenc_percentile_tables.cpp in Sources */, - 706EEF8C26D1595D001C950E /* 
bc7decomp.cpp in Sources */, 70871DE127DDDBCD00D0B9E1 /* astcenc_mathlib.cpp in Sources */, - 706EEF8D26D1595D001C950E /* bc7enc.cpp in Sources */, 708A6A9A2708CE4700BA5410 /* bc6h_encode.cpp in Sources */, 70A7BD3027092A1200DBCCF7 /* hdr_encode.cpp in Sources */, 706EFF7726D34740001C950E /* string.cpp in Sources */, @@ -1476,7 +1522,9 @@ 70871DE327DDDBCD00D0B9E1 /* astcenc_decompress_symbolic.cpp in Sources */, 70871E0727DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp in Sources */, 70871E0527DDDBCD00D0B9E1 /* astcenc_platform_isa_detection.cpp in Sources */, + 707789D52881BA81008A51BC /* bc7enc.cpp in Sources */, 706EFF7F26D34740001C950E /* intrusive_list.cpp in Sources */, + 707789E92881BA81008A51BC /* bc7decomp.cpp in Sources */, 706EEFA826D1595D001C950E /* miniz.cpp in Sources */, 70871DE527DDDBCD00D0B9E1 /* astcenc_compress_symbolic.cpp in Sources */, 706EEFA926D1595D001C950E /* hedistance.cpp in Sources */, @@ -1518,6 +1566,7 @@ 706EEFC226D1595E001C950E /* zstd.cpp in Sources */, 706EEFC326D1595E001C950E /* zstddeclib.cpp in Sources */, 706EEFC426D1595E001C950E /* lodepng.cpp in Sources */, + 707789E12881BA81008A51BC /* utils.cpp in Sources */, 706EEFC526D1595E001C950E /* tmpfileplus.cpp in Sources */, 70871E0127DDDBCD00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp in Sources */, ); @@ -1528,10 +1577,12 @@ buildActionMask = 2147483647; files = ( 70871DD827DDDBCD00D0B9E1 /* astcenc_quantization.cpp in Sources */, + 707789E62881BA81008A51BC /* ert.cpp in Sources */, 70871E0427DDDBCD00D0B9E1 /* astcenc_color_unquantize.cpp in Sources */, 70871DD227DDDBCD00D0B9E1 /* astcenc_averages_and_directions.cpp in Sources */, 70871DE027DDDBCD00D0B9E1 /* astcenc_mathlib_softfloat.cpp in Sources */, 706EFC2426D1C39B001C950E /* ateencoder.mm in Sources */, + 707789EE2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */, 706EF19826D166C5001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */, 70871DCE27DDDBCD00D0B9E1 /* astcenc_find_best_partitioning.cpp in Sources */, 
70CDB65327A1382700A546C1 /* KramDDSHelper.cpp in Sources */, @@ -1543,15 +1594,15 @@ 706EF19D26D166C5001C950E /* EtcBlock4x4Encoding_RG11.cpp in Sources */, 706EF19E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.cpp in Sources */, 706EF19F26D166C5001C950E /* EtcIndividualTrys.cpp in Sources */, + 707789DE2881BA81008A51BC /* rgbcx.cpp in Sources */, 706EF1A026D166C5001C950E /* EtcBlock4x4Encoding_R11.cpp in Sources */, + 707789F22881BCE2008A51BC /* rdo_bc_encoder.cpp in Sources */, 70871DF627DDDBCD00D0B9E1 /* astcenc_color_quantize.cpp in Sources */, 706EF1A126D166C5001C950E /* EtcBlock4x4Encoding_ETC1.cpp in Sources */, 706EF1A226D166C5001C950E /* EtcBlock4x4Encoding.cpp in Sources */, 706EF1A326D166C5001C950E /* EtcBlock4x4.cpp in Sources */, 70871DDC27DDDBCD00D0B9E1 /* astcenc_percentile_tables.cpp in Sources */, - 706EF1A426D166C5001C950E /* bc7decomp.cpp in Sources */, 70871DE227DDDBCD00D0B9E1 /* astcenc_mathlib.cpp in Sources */, - 706EF1A526D166C5001C950E /* bc7enc.cpp in Sources */, 708A6A9B2708CE4700BA5410 /* bc6h_encode.cpp in Sources */, 70A7BD3127092A1200DBCCF7 /* hdr_encode.cpp in Sources */, 706EFF7826D34740001C950E /* string.cpp in Sources */, @@ -1563,7 +1614,9 @@ 70871DE427DDDBCD00D0B9E1 /* astcenc_decompress_symbolic.cpp in Sources */, 70871E0827DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.cpp in Sources */, 70871E0627DDDBCD00D0B9E1 /* astcenc_platform_isa_detection.cpp in Sources */, + 707789D62881BA81008A51BC /* bc7enc.cpp in Sources */, 706EFF8026D34740001C950E /* intrusive_list.cpp in Sources */, + 707789EA2881BA81008A51BC /* bc7decomp.cpp in Sources */, 706EF1C026D166C5001C950E /* miniz.cpp in Sources */, 70871DE627DDDBCD00D0B9E1 /* astcenc_compress_symbolic.cpp in Sources */, 706EF1C126D166C5001C950E /* hedistance.cpp in Sources */, @@ -1605,6 +1658,7 @@ 706EF1D826D166C5001C950E /* zstd.cpp in Sources */, 706EF1D926D166C5001C950E /* zstddeclib.cpp in Sources */, 706EF1DA26D166C5001C950E /* lodepng.cpp in Sources */, + 707789E22881BA81008A51BC 
/* utils.cpp in Sources */, 706EF1DB26D166C5001C950E /* tmpfileplus.cpp in Sources */, 70871E0227DDDBCD00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp in Sources */, ); diff --git a/libkram/bc7enc/LICENSE b/libkram/bc7enc/LICENSE index 71e10daf..b3b1f69b 100644 --- a/libkram/bc7enc/LICENSE +++ b/libkram/bc7enc/LICENSE @@ -1,10 +1,12 @@ -The following source code files are available under 2 licenses -- choose whichever you prefer: -rgbcx.h -bc7decomp.cpp/h -bc7enc.c +If you use this software in a product, attribution / credits is requested but not required. + +bc7e.ispc uses the Apache 2.0 license and is Copyright (C) 2018-2021 Binomial LLC. +LodePNG is Copyright (c) 2005-2016 Lode Vandevenne. See LodePNG.cpp for its license. + +All other source code files in this repo are available under 2 licenses -- choose whichever you prefer. ALTERNATIVE A - MIT License -Copyright(c) 2020 Richard Geldreich, Jr. +Copyright(c) 2020-2021 Richard Geldreich, Jr. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the "Software"), to deal in the Software without restriction, including without limitation the rights to diff --git a/libkram/bc7enc/README.md b/libkram/bc7enc/README.md index 8f762b2d..b86777f5 100644 --- a/libkram/bc7enc/README.md +++ b/libkram/bc7enc/README.md @@ -1,146 +1,163 @@ -bc7enc - Fast, single source file BC1-5 and BC7/BPTC GPU texture encoders. +bc7enc - Fast BC1-7 GPU texture encoders with Rate Distortion Optimization (RDO) -Features: -- BC1/3 encoder (in [rgbcx.h](https://github.com/richgel999/bc7enc/blob/master/rgbcx.h)) uses a new algorithm (which we've named "prioritized cluster fit") which is 3-4x faster than traditional cluster fit (as implemented in [libsquish](https://github.com/svn2github/libsquish) with SSE2) at the same or slightly higher average quality using scalar CPU instructions. This algorithm is suitable for GPU encoder implementations. 
+This repo contains fast texture encoders for BC1-7. All formats support a simple post-processing transform on the encoded texture data designed to trade off quality for smaller compressed file sizes using LZ compression. Significant (10-50%) size reductions are possible. The BC7 encoder also supports a "reduced entropy" mode using the -e option which causes the output to be biased/weighted in various ways which minimally impact quality, which results in 5-10% smaller file sizes with no slowdowns in encoding time. -The BC1/BC3 encoder also implements [Castano's optimal endpoint rounding improvement](https://gist.github.com/castano/c92c7626f288f9e99e158520b14a61cf). +Currently, the entropy reduction transform is tuned for Deflate, LZHAM, or LZMA. The method used to control the rate-distortion tradeoff is the classic Lagrangian multiplier RDO method, modified to favor MSE on very smooth blocks. Rate is approximated using a fixed Deflate model. The post-processing transform applied to the encoded texture data tries to introduce the longest match it can into every encoded output block. It also tries to continue matches between blocks and (specifically for codecs like LZHAM/LZMA/Zstd) it tries to utilize REP0 (repeat) matches. -rgbcx's BC1 encoder is faster than both AMD Compressonator and libsquish at the same average quality. +You can see examples of the RDO BC7 encoder's current output [here](https://richg42.blogspot.com/2021/02/more-rdo-bc7-encoding.html). Some examples on how to use the command line tool are on my blog, [here](https://richg42.blogspot.com/2021/02/how-to-use-bc7encrdo.html). -- BC7 encoder (in bc7enc.c/.h) has perceptual colorspace metric support, and is very fast compared to ispc_texcomp (see below) for RGB textures. Important: The BC7 encoder included in this repo is still a work in progress. I took bc7enc16 and added more modes for better alpha support, but it needs more testing and development. 
+This repo contains both [bc7e.ispc](https://github.com/BinomialLLC/bc7e) and its distantly related but weaker 4 mode only non-ispc variant, bc7enc.cpp. By default, if you set SUPPORT_BC7E=TRUE when running cmake, you get bc7e.ispc, otherwise you get bc7enc.cpp. (The -C option forces bc7enc.cpp.) bc7e supports all BC7 modes and features, but doesn't yet support reduced entropy BC7 encoding. bc7enc.cpp supports optional reduced entropy encoding (using -e with the command line tool). RDO BC7 is supported when using either encoder, however. -- Full decoders for BC1-5/7. BC7 decoder is in bc7decomp.cpp/.h, BC1-5 decoders in rgbcx.h. +The next major focus will be improving the default smooth block handling and improving rate distortion performance. -This project is basically a demo of some of the techniques we use in Basis BC7, -which is Binomial's state of the art vectorized BC7 encoder. Basis BC7 is the -highest quality and fastest CPU BC7 encoder available (2-3x faster than -ispc_texcomp). It supports all modes and linear/perceptual colorspace metrics. -Licensees get full ISPC source code so they can customize the codec as needed. +This repo was originally derived from [bc7enc](https://github.com/richgel999/bc7enc) and [bc7e](https://github.com/BinomialLLC/bc7e). Note this repo contains the latest version of bc7e.ispc, which has a determinism bug fix. -bc7enc currently only supports modes 1 and 6 for RGB, and modes 1, 5, 6, and 7 for alpha. The plan is to add all the modes. See the [bc7enc16](https://github.com/richgel999/bc7enc16) project for the previous version (which only supports modes 1 and 6). Note this readme still refers to "bc7enc16", but bc7enc is the same encoder but with more alpha modes. +**Note: If you use this software in a product, attribution / credits is requested but not required.
Thanks!** -This codec supports a perceptual mode when encoding BC7, where it computes colorspace error in -weighted YCbCr space (like etc2comp), and it also supports weighted RGBA -metrics. It's particular strong in perceptual mode, beating the current state of -the art CPU encoder (Intel's ispc_texcomp) by a wide margin when measured by -Luma PSNR, even though it only supports 2 modes and isn't vectorized. +### Compiling -Why only modes 1 and 6 for opaque BC7? -Because with these two modes you have a complete encoder that supports both -opaque and transparent textures in a small amount (~1400 lines) of -understandable plain C code. Mode 6 excels on smooth blocks, and mode 1 is -strong with complex blocks, and a strong encoder that combines both modes can be -quite high quality. Fast mode 6-only encoders will have noticeable block -artifacts which this codec avoids by fully supporting mode 1. +This build has been tested with MSVC 2019 x64 and clang 6.0.0 under Ubuntu v18.04. -Modes 1 and 6 are typically the most used modes on many textures using other -encoders. Mode 1 has two subsets, 64 possible partitions, and 3-bit indices, -while mode 6 has large 4-bit indices and high precision 7777.1 endpoints. This -codec produces output that is far higher quality than any BC1 encoder, and -approaches (or in perceptual mode exceeds!) the quality of other full BC7 -encoders. +To compile with bc7e.ispc (on Linux this requires [Intel's ISPC compiler](https://ispc.github.io/downloads.html) to be in your path - recommended): -Why is bc7enc16 so fast in perceptual mode? -Computing error in YCbCr space is more expensive than in RGB space, yet bc7enc16 -in perceptual mode is stronger than ispc_texcomp (see the benchmark below) - -even without SSE/AVX vectorization and with only 2 modes to work with! +``` +cmake -D SUPPORT_BC7E=TRUE . +make +``` + +To compile without BC7E: + +``` +cmake . 
+make +``` -Most BC7 encoders only support linear RGB colorspace metrics, which is a -fundamental weakness. Some support weighted RGB metrics, which is better. With -linear RGB metrics, encoding error is roughly balanced between each channel, and -encoders have to work *very* hard (examining large amounts of RGB search space) -to get overall quality up. With perceptual colorspace metrics, RGB error tends -to become a bit unbalanced, with green quality favored more highly than red and -blue, and blue quality favored the least. A perceptual encoder is tuned to -prefer exploring solutions along the luma axis, where it's much less work to find -solutions with less luma error. bc7enc16 is, as far as I know, the first BC7 -codec to support computing error in weighted YCbCr colorspace. +Note the MSVC and Linux builds enable OpenMP for faster compression. -Note: Most of the timings here (except for the ispc_texcomp "fast" mode timings at the very bottom) -are for the *original* release, before I added several more optimizations. The latest version of -bc7enc16.c is around 8-27% faster than the initial release at same quality (when mode 1 is enabled - -there's no change with just mode 6). +### Examples -Some benchmarks across 31 images (kodim corpus+others): +The [.DDS](https://docs.microsoft.com/en-us/windows/win32/direct3ddds/dx-graphics-dds-pguide) output files can be loaded/viewed using tools like [AMD Compressonator](https://gpuopen.com/compressonator/). -Perceptual (average REC709 Luma PSNR - higher is better quality): +To encode to non-RDO BC7 using BC7E, highest quality, linear RGB(A) metrics: + +``` +./bc7enc blah.png ``` -iscp_texcomp slow vs. bc7enc16 uber4/max_partitions 64 -iscp_texcomp: 355.4 secs 48.6 dB -bc7enc16: 122.6 secs 50.0 dB -iscp_texcomp slow vs. 
bc7enc16 uber0/max_partitions 64 -iscp_texcomp: 355.4 secs 48.6 dB -bc7enc16: 38.3 secs 49.6 dB +To encode to non-RDO BC7 using BC7E, highest quality, using perceptual (scaled YCbCr) colorspace error metrics: -iscp_texcomp basic vs. bc7enc16 uber0/max_partitions 16 -ispc_texcomp: 100.2 secs 48.3 dB -bc7enc16: 20.8 secs 49.3 dB +``` +./bc7enc blah.png -s +``` -iscp_texcomp fast vs. bc7enc16 uber0/max_partitions 16 -iscp_texcomp: 41.5 secs 48.0 dB -bc7enc16: 20.8 secs 49.3 dB +To encode to RDO BC7 using BC7E, highest quality, lambda=.5, linear metrics (perceptual colorspace metrics are always automatically disabled when -z is specified), with a balance of encoding performance vs. RDO efficiency: -iscp_texcomp ultrafast vs. bc7enc16 uber0/max_partitions 0 -iscp_texcomp: 1.9 secs 46.2 dB -bc7enc16: 8.9 secs 48.4 dB +``` +./bc7enc blah.png -z.5 +``` -Non-perceptual (average RGB PSNR): +To encode to RDO BC7 using BC7E, lower baseline quality (-u4) for faster encoding, lambda=.5, and with faster encoding (only inject one match vs two, with a tiny RDO lookback window size of 16 bytes): -iscp_texcomp slow vs. bc7enc16 uber4/max_partitions 64 -iscp_texcomp: 355.4 secs 46.8 dB -bc7enc16: 51 secs 46.1 dB +``` +./bc7enc blah.png -u4 -z.5 -ze -zc16 +``` -iscp_texcomp slow vs. bc7enc16 uber0/max_partitions 64 -iscp_texcomp: 355.4 secs 46.8 dB -bc7enc16: 29.3 secs 45.8 dB +To encode to non-RDO BC7 using entropy reduced or quantized/weighted BC7 (no slowdown vs. non-RDO bc7enc.cpp for BC7, slightly reduced quality, but 5-10% better LZ compression, only uses 2 or 4 BC7 modes): -iscp_texcomp basic vs. bc7enc16 uber4/max_partitions 64 -iscp_texcomp: 99.9 secs 46.5 dB -bc7enc16: 51 secs 46.1 dB +``` +./bc7enc blah.png -C -e +``` -iscp_texcomp fast vs. 
bc7enc16 uber1/max_partitions 16 -ispc_texcomp: 41.5 secs 46.1 dB -bc7enc16: 19.8 secs 45.5 dB +To encode to RDO BC7 using the entropy reduction transform combined with reduced entropy BC7 encoding, with a slightly larger window size than the default which is 128 bytes: -iscp_texcomp fast vs. bc7enc16 uber0/max_partitions 8 -ispc_texcomp: 41.5 secs 46.1 dB -bc7enc16: 10.46 secs 44.4 dB +``` +./bc7enc -zc256 blah.png -C -e -z1.0 +``` -iscp_texcomp ultrafast vs. bc7enc16 uber0/max_partitions 0 -ispc_texcomp: 1.9 secs 42.7 dB -bc7enc16: 3.8 secs 42.7 dB +Same as before, but higher compression (allow 2 matches per block instead of 1): -DirectXTex CPU in "mode 6 only" mode vs. bc7enc16 uber1/max_partions 0 (mode 6 only), non-perceptual: - -DirectXTex: 466.4 secs 41.9 dB -bc7enc16: 6.7 secs 42.8 dB +``` +./bc7enc -zc256 blah.png -C -e -z1.0 -zn +``` -DirectXTex CPU in (default - no 3 subset modes) vs. bc7enc16 uber1/max_partions 64, non-perceptual: +Same, except disable ultra-smooth block handling: -DirectXTex: 9485.1 secs 45.6 dB -bc7enc16: 36 secs 46.0 dB -``` -(Note this version of DirectXTex has a key pbit bugfix which I've submitted but -is still waiting to be accepted. Non-bugfixed versions will be slightly lower -quality.) +``` +./bc7enc -zc256 blah.png -C -e -z1.0 -zu +``` -UPDATE: To illustrate how strong the mode 1+6 implementation is in bc7enc16, let's compare ispc_texcomp -fast vs. 
the latest version of bc7enc16 uber4/max_partitions 64: +To encode to RDO BC7 using the entropy reduction transform at lower quality, combined with reduced entropy BC7 encoding, with a slightly larger window size than the default which is 128 bytes: -Without filterbank optimizations: ``` - Time RGB PSNR Y PSNR -ispc_texcomp: 41.45 secs 46.09 dB 48.0 dB -bc7enc16: 41.42 secs 46.03 dB 48.2 dB +./bc7enc -zc256 blah.png -C -e -z2.0 +``` + +To encode to RDO BC7 using the entropy reduction transform at higher effectiveness using a larger window size, without using reduced entropy BC7 encoding: -With filterbank optimizations enabled: -bc7enc16: 38.78 secs 45.94 dB 48.12 dB ``` -They both have virtually the same average RGB PSNR with these settings (.06 dB is basically noise), but -bc7enc16 is just as fast as ispc_texcomp fast, even though it's not vectorized. Interestingly, our Y PSNR is better, -although bc7enc16 wasn't using perceptual metrics in these benchmarks. +./bc7enc -zc1024 blah.png -z1.0 +``` + +To encode to RDO BC7 using the entropy reduction transform at higher effectiveness using a larger window size, with a manually specified max smooth block max error scale: + +``` +./bc7enc -zc1024 blah.png -z2.0 -zb30.0 +``` + +To encode to RDO BC7 using the entropy reduction transform at higher effectiveness using a larger window size, using only mode 6 (more block artifacts, but better rate-distortion performance as measured by PSNR): + +``` +./bc7enc -zc1024 blah.png -6 -z1.0 -e +``` + +To encode to BC1: +``` +./bc7enc -1 blah.png +``` + +To encode to BC1 with Rate Distortion Optimization (RDO) at lambda=1.0: +``` +./bc7enc -1 -z1.0 blah.png +``` + +The -z option controls lambda, or the rate vs. distortion tradeoff. 0 = maximum quality, higher values=lower bitrates but lower quality. Try values [.25-8].
+ +To encode to BC1 with RDO, with RDO debug output, to monitor the percentage of blocks impacted: +``` +./bc7enc -1 -z1.0 -zd blah.png +``` + +To encode to BC1 with RDO with a higher than default smooth block scale factor: +``` +./bc7enc -1 -z1.0 -zb40.0 blah.png +``` + +Use -zb1.0 to disable smooth block error scaling completely, which increases RDO performance but can result in noticeable artifacts on smooth/flat blocks at higher lambdas. + +Use -zc# to control the RDO window size in bytes. Good values to try are 16-8192. +Use -zt to disable RDO multithreading. + +To encode to BC1 with RDO at the highest achievable quality/effectiveness (this is extremely slow): + +``` +./bc7enc -1 -z1.0 -zc32768 blah.png +``` + +This sets the window size to 32KB (the highest setting that makes sense for Deflate). Window sizes of 2KB (the default) to 8KB are way faster and in practice are almost as effective. The maximum window size setting supported by the command line tool is 64KB, but this would be very slow. + +For even higher quality per bit (this is incredibly slow): +``` +./bc7enc -1 -z1.0 -zc32768 -zm blah.png +``` + +### Dependencies +There are no 3rd party code or library dependencies. utils.cpp/.h is only needed by the example command line tool. It uses C++11. The individual .cpp files are designed to be easily dropped into other codebases. + +For RDO post-processing of any block-based format: ert.cpp/.h. You provide this function an array of encoded blocks, an array of source/original 32bpp blocks, some parameters, and a pointer to a block decoder function for your format as a callback. It must return false if the passed in block data is invalid. (Make sure you *really* validate the block's data, because the ERT post-processor will inevitably call your callback with invalid blocks.) This transform works on most other texture formats, such as ETC1/2, EAC, and ASTC. The ERT works on block sizes ranging from 1x1 to 12x12.
This file has no other dependencies apart from utils.cpp/h. + +For BC1-5 encoding/decoding: rgbcx.cpp/.h + +For BC7 encoding: bc7enc.cpp/.h + +For BC7 decoding: bc7decomp.cpp/.h -This was a multithreaded benchmark (using OpenMP) on a dual Xeon workstation. -ispc_texcomp was called with 64-blocks at a time and used AVX instructions. -Timings are for encoding only. diff --git a/libkram/bc7enc/bc7decomp.cpp b/libkram/bc7enc/bc7decomp.cpp index 3099ec4d..cf1574af 100644 --- a/libkram/bc7enc/bc7decomp.cpp +++ b/libkram/bc7enc/bc7decomp.cpp @@ -1,9 +1,33 @@ // File: bc7decomp.c - Richard Geldreich, Jr. 3/31/2020 - MIT license or public domain (see end of file) #include "bc7decomp.h" +#include -namespace bc7decomp +#if (defined(_M_AMD64) || defined(_M_X64) || defined(__SSE2__)) +# define BC7DECOMP_USE_SSE2 +#endif + +#ifdef BC7DECOMP_USE_SSE2 +#include +#include +#endif + +namespace bc7decomp { +#ifdef BC7DECOMP_USE_SSE2 + const __m128i g_bc7_weights4_sse2[8] = + { + _mm_set_epi16(4, 4, 4, 4, 0, 0, 0, 0), + _mm_set_epi16(13, 13, 13, 13, 9, 9, 9, 9), + _mm_set_epi16(21, 21, 21, 21, 17, 17, 17, 17), + _mm_set_epi16(30, 30, 30, 30, 26, 26, 26, 26), + _mm_set_epi16(38, 38, 38, 38, 34, 34, 34, 34), + _mm_set_epi16(47, 47, 47, 47, 43, 43, 43, 43), + _mm_set_epi16(55, 55, 55, 55, 51, 51, 51, 51), + _mm_set_epi16(64, 64, 64, 64, 60, 60, 60, 60), + }; +#endif + const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 }; const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; @@ -44,27 +68,32 @@ const uint8_t g_bc7_table_anchor_index_third_subset_2[64] = 15, 8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8 }; -inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) +const uint8_t 
g_bc7_first_byte_to_mode[256] = { - assert(codesize <= 32); - uint32_t bits = 0; - uint32_t total_bits = 0; - - while (total_bits < codesize) - { - uint32_t byte_bit_offset = bit_offset & 7; - uint32_t bits_to_read = std::min(codesize - total_bits, 8 - byte_bit_offset); - - uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; - byte_bits &= ((1 << bits_to_read) - 1); - - bits |= (byte_bits << total_bits); + 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, +}; - total_bits += bits_to_read; - bit_offset += bits_to_read; - } +inline void insert_weight_zero(uint64_t& index_bits, uint32_t bits_per_index, uint32_t offset) +{ + uint64_t LOW_BIT_MASK = (static_cast(1) << ((bits_per_index * (offset + 1)) - 1)) - 1; + uint64_t HIGH_BIT_MASK = ~LOW_BIT_MASK; - return bits; + index_bits = ((index_bits & HIGH_BIT_MASK) << 1) | (index_bits & LOW_BIT_MASK); } // BC7 mode 0-7 decompression. 
@@ -89,51 +118,142 @@ static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t b } return 0; } - -bool unpack_bc7_mode0_2(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) + + +#ifdef BC7DECOMP_USE_SSE2 +static inline __m128i bc7_interp_sse2(__m128i l, __m128i h, __m128i w, __m128i iw) +{ + return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(l, iw), _mm_mullo_epi16(h, w)), _mm_set1_epi16(32)), 6); +} + +static inline void bc7_interp2_sse2(const color_rgba* endpoint_pair, color_rgba* out_colors) +{ + __m128i endpoints = _mm_loadu_si64(endpoint_pair); + __m128i endpoints_16 = _mm_unpacklo_epi8(endpoints, _mm_setzero_si128()); + + __m128i endpoints_16_swapped = _mm_shuffle_epi32(endpoints_16, _MM_SHUFFLE(1, 0, 3, 2)); + + // Interpolated colors will be color 1 and 2 + __m128i interpolated_colors = bc7_interp_sse2(endpoints_16, endpoints_16_swapped, _mm_set1_epi16(21), _mm_set1_epi16(43)); + + // all_colors will be 1, 2, 0, 3 + __m128i all_colors = _mm_packus_epi16(interpolated_colors, endpoints_16); + + all_colors = _mm_shuffle_epi32(all_colors, _MM_SHUFFLE(3, 1, 0, 2)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors), all_colors); +} + +static inline void bc7_interp3_sse2(const color_rgba* endpoint_pair, color_rgba* out_colors) +{ + __m128i endpoints = _mm_loadu_si64(endpoint_pair); + __m128i endpoints_16bit = _mm_unpacklo_epi8(endpoints, _mm_setzero_si128()); + __m128i endpoints_16bit_swapped = _mm_shuffle_epi32(endpoints_16bit, _MM_SHUFFLE(1, 0, 3, 2)); + + __m128i interpolated_16 = bc7_interp_sse2(endpoints_16bit, endpoints_16bit_swapped, _mm_set1_epi16(9), _mm_set1_epi16(55)); + __m128i interpolated_23 = bc7_interp_sse2(endpoints_16bit, endpoints_16bit_swapped, _mm_set_epi16(37, 37, 37, 37, 18, 18, 18, 18), _mm_set_epi16(27, 27, 27, 27, 46, 46, 46, 46)); + __m128i interpolated_45 = bc7_interp_sse2(endpoints_16bit, endpoints_16bit_swapped, _mm_set_epi16(18, 18, 18, 18, 37, 37, 37, 37), _mm_set_epi16(46, 
46, 46, 46, 27, 27, 27, 27)); + + __m128i interpolated_01 = _mm_unpacklo_epi64(endpoints_16bit, interpolated_16); + __m128i interpolated_67 = _mm_unpackhi_epi64(interpolated_16, endpoints_16bit); + + __m128i all_colors_0 = _mm_packus_epi16(interpolated_01, interpolated_23); + __m128i all_colors_1 = _mm_packus_epi16(interpolated_45, interpolated_67); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors), all_colors_0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(out_colors + 4), all_colors_1); +} +#endif + +bool unpack_bc7_mode0_2(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels) { //const uint32_t SUBSETS = 3; const uint32_t ENDPOINTS = 6; const uint32_t COMPS = 3; const uint32_t WEIGHT_BITS = (mode == 0) ? 3 : 2; + const uint32_t WEIGHT_MASK = (1 << WEIGHT_BITS) - 1; const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5; + const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1; const uint32_t PBITS = (mode == 0) ? 6 : 0; const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; - - uint32_t bit_offset = 0; - const uint8_t* pBuf = static_cast(pBlock_bits); + const uint32_t PART_BITS = (mode == 0) ? 4 : 6; + const uint32_t PART_MASK = (1 << PART_BITS) - 1; + + const uint64_t low_chunk = data_chunks[0]; + const uint64_t high_chunk = data_chunks[1]; - if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + const uint32_t part = (low_chunk >> (mode + 1)) & PART_MASK; - const uint32_t part = read_bits32(pBuf, bit_offset, (mode == 0) ? 
4 : 6); + uint64_t channel_read_chunks[3] = { 0, 0, 0 }; + + if (mode == 0) + { + channel_read_chunks[0] = low_chunk >> 5; + channel_read_chunks[1] = low_chunk >> 29; + channel_read_chunks[2] = ((low_chunk >> 53) | (high_chunk << 11)); + } + else + { + channel_read_chunks[0] = low_chunk >> 9; + channel_read_chunks[1] = ((low_chunk >> 39) | (high_chunk << 25)); + channel_read_chunks[2] = high_chunk >> 5; + } color_rgba endpoints[ENDPOINTS]; for (uint32_t c = 0; c < COMPS; c++) + { + uint64_t channel_read_chunk = channel_read_chunks[c]; for (uint32_t e = 0; e < ENDPOINTS; e++) - endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS); + { + endpoints[e][c] = static_cast(channel_read_chunk & ENDPOINT_MASK); + channel_read_chunk >>= ENDPOINT_BITS; + } + } uint32_t pbits[6]; - for (uint32_t p = 0; p < PBITS; p++) - pbits[p] = read_bits32(pBuf, bit_offset, 1); + if (mode == 0) + { + uint8_t p_bits_chunk = static_cast((high_chunk >> 13) & 0xff); + + for (uint32_t p = 0; p < PBITS; p++) + pbits[p] = (p_bits_chunk >> p) & 1; + } + + uint64_t weights_read_chunk = high_chunk >> (67 - 16 * WEIGHT_BITS); + insert_weight_zero(weights_read_chunk, WEIGHT_BITS, 0); + insert_weight_zero(weights_read_chunk, WEIGHT_BITS, std::min(g_bc7_table_anchor_index_third_subset_1[part], g_bc7_table_anchor_index_third_subset_2[part])); + insert_weight_zero(weights_read_chunk, WEIGHT_BITS, std::max(g_bc7_table_anchor_index_third_subset_1[part], g_bc7_table_anchor_index_third_subset_2[part])); uint32_t weights[16]; for (uint32_t i = 0; i < 16; i++) - weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == g_bc7_table_anchor_index_third_subset_1[part]) || (i == g_bc7_table_anchor_index_third_subset_2[part])) ? 
(WEIGHT_BITS - 1) : WEIGHT_BITS); - - assert(bit_offset == 128); + { + weights[i] = static_cast(weights_read_chunk & WEIGHT_MASK); + weights_read_chunk >>= WEIGHT_BITS; + } for (uint32_t e = 0; e < ENDPOINTS; e++) for (uint32_t c = 0; c < 4; c++) - endpoints[e][c] = (uint8_t)((c == 3) ? 255 : (PBITS ? bc7_dequant(endpoints[e][c], pbits[e], ENDPOINT_BITS) : bc7_dequant(endpoints[e][c], ENDPOINT_BITS))); + endpoints[e][c] = static_cast((c == 3) ? 255 : (PBITS ? bc7_dequant(endpoints[e][c], pbits[e], ENDPOINT_BITS) : bc7_dequant(endpoints[e][c], ENDPOINT_BITS))); color_rgba block_colors[3][8]; + +#ifdef BC7DECOMP_USE_SSE2 + for (uint32_t s = 0; s < 3; s++) + { + if (WEIGHT_BITS == 2) + bc7_interp2_sse2(endpoints + s * 2, block_colors[s]); + else + bc7_interp3_sse2(endpoints + s * 2, block_colors[s]); + } +#else for (uint32_t s = 0; s < 3; s++) for (uint32_t i = 0; i < WEIGHT_VALS; i++) { for (uint32_t c = 0; c < 3; c++) - block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS); + block_colors[s][i][c] = static_cast(bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS)); block_colors[s][i][3] = 255; } +#endif for (uint32_t i = 0; i < 16; i++) pPixels[i] = block_colors[g_bc7_partition3[part * 16 + i]][weights[i]]; @@ -141,51 +261,102 @@ bool unpack_bc7_mode0_2(uint32_t mode, const void* pBlock_bits, color_rgba* pPix return true; } -bool unpack_bc7_mode1_3_7(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) +bool unpack_bc7_mode1_3_7(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels) { //const uint32_t SUBSETS = 2; const uint32_t ENDPOINTS = 4; const uint32_t COMPS = (mode == 7) ? 4 : 3; const uint32_t WEIGHT_BITS = (mode == 1) ? 3 : 2; + const uint32_t WEIGHT_MASK = (1 << WEIGHT_BITS) - 1; const uint32_t ENDPOINT_BITS = (mode == 7) ? 5 : ((mode == 1) ? 6 : 7); + const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1; const uint32_t PBITS = (mode == 1) ? 
2 : 4; const uint32_t SHARED_PBITS = (mode == 1) ? true : false; const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; - - uint32_t bit_offset = 0; - const uint8_t* pBuf = static_cast(pBlock_bits); - if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + const uint64_t low_chunk = data_chunks[0]; + const uint64_t high_chunk = data_chunks[1]; - const uint32_t part = read_bits32(pBuf, bit_offset, 6); + const uint32_t part = ((low_chunk >> (mode + 1)) & 0x3f); color_rgba endpoints[ENDPOINTS]; + + uint64_t channel_read_chunks[4] = { 0, 0, 0, 0 }; + uint64_t p_read_chunk = 0; + channel_read_chunks[0] = (low_chunk >> (mode + 7)); + uint64_t weight_read_chunk; + + switch (mode) + { + case 1: + channel_read_chunks[1] = (low_chunk >> 32); + channel_read_chunks[2] = ((low_chunk >> 56) | (high_chunk << 8)); + p_read_chunk = high_chunk >> 16; + weight_read_chunk = high_chunk >> 18; + break; + case 3: + channel_read_chunks[1] = ((low_chunk >> 38) | (high_chunk << 26)); + channel_read_chunks[2] = high_chunk >> 2; + p_read_chunk = high_chunk >> 30; + weight_read_chunk = high_chunk >> 34; + break; + case 7: + channel_read_chunks[1] = low_chunk >> 34; + channel_read_chunks[2] = ((low_chunk >> 54) | (high_chunk << 10)); + channel_read_chunks[3] = high_chunk >> 10; + p_read_chunk = (high_chunk >> 30); + weight_read_chunk = (high_chunk >> 34); + break; + default: + return false; + }; + for (uint32_t c = 0; c < COMPS; c++) + { + uint64_t channel_read_chunk = channel_read_chunks[c]; for (uint32_t e = 0; e < ENDPOINTS; e++) - endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS); + { + endpoints[e][c] = static_cast(channel_read_chunk & ENDPOINT_MASK); + channel_read_chunk >>= ENDPOINT_BITS; + } + } uint32_t pbits[4]; for (uint32_t p = 0; p < PBITS; p++) - pbits[p] = read_bits32(pBuf, bit_offset, 1); - + pbits[p] = (p_read_chunk >> p) & 1; + + insert_weight_zero(weight_read_chunk, WEIGHT_BITS, 0); + insert_weight_zero(weight_read_chunk, WEIGHT_BITS, 
g_bc7_table_anchor_index_second_subset[part]); + uint32_t weights[16]; for (uint32_t i = 0; i < 16; i++) - weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == g_bc7_table_anchor_index_second_subset[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS); - - assert(bit_offset == 128); + { + weights[i] = static_cast(weight_read_chunk & WEIGHT_MASK); + weight_read_chunk >>= WEIGHT_BITS; + } for (uint32_t e = 0; e < ENDPOINTS; e++) for (uint32_t c = 0; c < 4; c++) - endpoints[e][c] = (uint8_t)((c == ((mode == 7U) ? 4U : 3U)) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS)); + endpoints[e][c] = static_cast((mode != 7U && c == 3U) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS)); color_rgba block_colors[2][8]; +#ifdef BC7DECOMP_USE_SSE2 + for (uint32_t s = 0; s < 2; s++) + { + if (WEIGHT_BITS == 2) + bc7_interp2_sse2(endpoints + s * 2, block_colors[s]); + else + bc7_interp3_sse2(endpoints + s * 2, block_colors[s]); + } +#else for (uint32_t s = 0; s < 2; s++) for (uint32_t i = 0; i < WEIGHT_VALS; i++) { for (uint32_t c = 0; c < COMPS; c++) - block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS); + block_colors[s][i][c] = static_cast(bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS)); block_colors[s][i][3] = (COMPS == 3) ? 
255 : block_colors[s][i][3]; } +#endif for (uint32_t i = 0; i < 16; i++) pPixels[i] = block_colors[g_bc7_partition2[part * 16 + i]][weights[i]]; @@ -193,53 +364,101 @@ bool unpack_bc7_mode1_3_7(uint32_t mode, const void* pBlock_bits, color_rgba* pP return true; } -bool unpack_bc7_mode4_5(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) +bool unpack_bc7_mode4_5(uint32_t mode, const uint64_t* data_chunks, color_rgba* pPixels) { const uint32_t ENDPOINTS = 2; - const uint32_t COMPS = 4; + //const uint32_t COMPS = 4; const uint32_t WEIGHT_BITS = 2; + const uint32_t WEIGHT_MASK = (1 << WEIGHT_BITS) - 1; const uint32_t A_WEIGHT_BITS = (mode == 4) ? 3 : 2; + const uint32_t A_WEIGHT_MASK = (1 << A_WEIGHT_BITS) - 1; const uint32_t ENDPOINT_BITS = (mode == 4) ? 5 : 7; + const uint32_t ENDPOINT_MASK = (1 << ENDPOINT_BITS) - 1; const uint32_t A_ENDPOINT_BITS = (mode == 4) ? 6 : 8; + const uint32_t A_ENDPOINT_MASK = (1 << A_ENDPOINT_BITS) - 1; //const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; //const uint32_t A_WEIGHT_VALS = 1 << A_WEIGHT_BITS; - uint32_t bit_offset = 0; - const uint8_t* pBuf = static_cast(pBlock_bits); + const uint64_t low_chunk = data_chunks[0]; + const uint64_t high_chunk = data_chunks[1]; - if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + const uint32_t comp_rot = (low_chunk >> (mode + 1)) & 0x3; + const uint32_t index_mode = (mode == 4) ? static_cast((low_chunk >> 7) & 1) : 0; - const uint32_t comp_rot = read_bits32(pBuf, bit_offset, 2); - const uint32_t index_mode = (mode == 4) ? read_bits32(pBuf, bit_offset, 1) : 0; + uint64_t color_read_bits = low_chunk >> 8; color_rgba endpoints[ENDPOINTS]; - for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t c = 0; c < 3; c++) + { for (uint32_t e = 0; e < ENDPOINTS; e++) - endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, (c == 3) ? 
A_ENDPOINT_BITS : ENDPOINT_BITS); - + { + endpoints[e][c] = static_cast(color_read_bits & ENDPOINT_MASK); + color_read_bits >>= ENDPOINT_BITS; + } + } + + endpoints[0][3] = static_cast(color_read_bits & ENDPOINT_MASK); + + uint64_t rgb_weights_chunk; + uint64_t a_weights_chunk; + if (mode == 4) + { + endpoints[0][3] = static_cast(color_read_bits & A_ENDPOINT_MASK); + endpoints[1][3] = static_cast((color_read_bits >> A_ENDPOINT_BITS) & A_ENDPOINT_MASK); + rgb_weights_chunk = ((low_chunk >> 50) | (high_chunk << 14)); + a_weights_chunk = high_chunk >> 17; + } + else if (mode == 5) + { + endpoints[0][3] = static_cast(color_read_bits & A_ENDPOINT_MASK); + endpoints[1][3] = static_cast(((low_chunk >> 58) | (high_chunk << 6)) & A_ENDPOINT_MASK); + rgb_weights_chunk = high_chunk >> 2; + a_weights_chunk = high_chunk >> 33; + } + else + return false; + + insert_weight_zero(rgb_weights_chunk, WEIGHT_BITS, 0); + insert_weight_zero(a_weights_chunk, A_WEIGHT_BITS, 0); + const uint32_t weight_bits[2] = { index_mode ? A_WEIGHT_BITS : WEIGHT_BITS, index_mode ? WEIGHT_BITS : A_WEIGHT_BITS }; - + const uint32_t weight_mask[2] = { index_mode ? A_WEIGHT_MASK : WEIGHT_MASK, index_mode ? WEIGHT_MASK : A_WEIGHT_MASK }; + uint32_t weights[16], a_weights[16]; - - for (uint32_t i = 0; i < 16; i++) - (index_mode ? a_weights : weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[index_mode] - ((!i) ? 1 : 0)); + + if (index_mode) + std::swap(rgb_weights_chunk, a_weights_chunk); for (uint32_t i = 0; i < 16; i++) - (index_mode ? weights : a_weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[1 - index_mode] - ((!i) ? 
1 : 0)); + { + weights[i] = (rgb_weights_chunk & weight_mask[0]); + rgb_weights_chunk >>= weight_bits[0]; + } - assert(bit_offset == 128); + for (uint32_t i = 0; i < 16; i++) + { + a_weights[i] = (a_weights_chunk & weight_mask[1]); + a_weights_chunk >>= weight_bits[1]; + } for (uint32_t e = 0; e < ENDPOINTS; e++) for (uint32_t c = 0; c < 4; c++) - endpoints[e][c] = (uint8_t)bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS); + endpoints[e][c] = static_cast(bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS)); color_rgba block_colors[8]; +#ifdef BC7DECOMP_USE_SSE2 + if (weight_bits[0] == 3) + bc7_interp3_sse2(endpoints, block_colors); + else + bc7_interp2_sse2(endpoints, block_colors); +#else for (uint32_t i = 0; i < (1U << weight_bits[0]); i++) for (uint32_t c = 0; c < 3; c++) - block_colors[i][c] = (uint8_t)bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]); + block_colors[i][c] = static_cast(bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0])); +#endif for (uint32_t i = 0; i < (1U << weight_bits[1]); i++) - block_colors[i][3] = (uint8_t)bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]); + block_colors[i][3] = static_cast(bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1])); for (uint32_t i = 0; i < 16; i++) { @@ -308,26 +527,46 @@ bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels) if (block.m_lo.m_mode != (1 << 6)) return false; - const uint32_t r0 = (uint32_t)((block.m_lo.m_r0 << 1) | block.m_lo.m_p0); - const uint32_t g0 = (uint32_t)((block.m_lo.m_g0 << 1) | block.m_lo.m_p0); - const uint32_t b0 = (uint32_t)((block.m_lo.m_b0 << 1) | block.m_lo.m_p0); - const uint32_t a0 = (uint32_t)((block.m_lo.m_a0 << 1) | block.m_lo.m_p0); - const uint32_t r1 = (uint32_t)((block.m_lo.m_r1 << 1) | block.m_hi.m_p1); - const uint32_t g1 = (uint32_t)((block.m_lo.m_g1 << 1) | block.m_hi.m_p1); - const uint32_t b1 = (uint32_t)((block.m_lo.m_b1 << 1) | block.m_hi.m_p1); - 
const uint32_t a1 = (uint32_t)((block.m_lo.m_a1 << 1) | block.m_hi.m_p1); + const uint32_t r0 = static_cast((block.m_lo.m_r0 << 1) | block.m_lo.m_p0); + const uint32_t g0 = static_cast((block.m_lo.m_g0 << 1) | block.m_lo.m_p0); + const uint32_t b0 = static_cast((block.m_lo.m_b0 << 1) | block.m_lo.m_p0); + const uint32_t a0 = static_cast((block.m_lo.m_a0 << 1) | block.m_lo.m_p0); + const uint32_t r1 = static_cast((block.m_lo.m_r1 << 1) | block.m_hi.m_p1); + const uint32_t g1 = static_cast((block.m_lo.m_g1 << 1) | block.m_hi.m_p1); + const uint32_t b1 = static_cast((block.m_lo.m_b1 << 1) | block.m_hi.m_p1); + const uint32_t a1 = static_cast((block.m_lo.m_a1 << 1) | block.m_hi.m_p1); color_rgba vals[16]; +#ifdef BC7DECOMP_USE_SSE2 + __m128i vep0 = _mm_set_epi16((short)a0, (short)b0, (short)g0, (short)r0, (short)a0, (short)b0, (short)g0, (short)r0); + __m128i vep1 = _mm_set_epi16((short)a1, (short)b1, (short)g1, (short)r1, (short)a1, (short)b1, (short)g1, (short)r1); + + for (uint32_t i = 0; i < 16; i += 4) + { + const __m128i w0 = g_bc7_weights4_sse2[i / 4 * 2 + 0]; + const __m128i w1 = g_bc7_weights4_sse2[i / 4 * 2 + 1]; + + const __m128i iw0 = _mm_sub_epi16(_mm_set1_epi16(64), w0); + const __m128i iw1 = _mm_sub_epi16(_mm_set1_epi16(64), w1); + + __m128i first_half = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(vep0, iw0), _mm_mullo_epi16(vep1, w0)), _mm_set1_epi16(32)), 6); + __m128i second_half = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(vep0, iw1), _mm_mullo_epi16(vep1, w1)), _mm_set1_epi16(32)), 6); + __m128i combined = _mm_packus_epi16(first_half, second_half); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(vals + i), combined); + } +#else for (uint32_t i = 0; i < 16; i++) { const uint32_t w = g_bc7_weights4[i]; const uint32_t iw = 64 - w; - vals[i].set_noclamp_rgba( - (r0 * iw + r1 * w + 32) >> 6, - (g0 * iw + g1 * w + 32) >> 6, - (b0 * iw + b1 * w + 32) >> 6, + vals[i].set_noclamp_rgba( + (r0 * iw + r1 * w + 32) >> 6, + (g0 * iw 
+ g1 * w + 32) >> 6, + (b0 * iw + b1 * w + 32) >> 6, (a0 * iw + a1 * w + 32) >> 6); } +#endif pPixels[0] = vals[block.m_hi.m_s00]; pPixels[1] = vals[block.m_hi.m_s10]; @@ -338,7 +577,7 @@ bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels) pPixels[5] = vals[block.m_hi.m_s11]; pPixels[6] = vals[block.m_hi.m_s21]; pPixels[7] = vals[block.m_hi.m_s31]; - + pPixels[8] = vals[block.m_hi.m_s02]; pPixels[9] = vals[block.m_hi.m_s12]; pPixels[10] = vals[block.m_hi.m_s22]; @@ -354,32 +593,43 @@ bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels) bool unpack_bc7(const void *pBlock, color_rgba *pPixels) { - const uint32_t first_byte = static_cast(pBlock)[0]; + const uint8_t *block_bytes = static_cast(pBlock); + uint8_t mode = g_bc7_first_byte_to_mode[block_bytes[0]]; - for (uint32_t mode = 0; mode <= 7; mode++) + uint64_t data_chunks[2]; + + uint64_t endian_check = 1; + if (*reinterpret_cast(&endian_check) == 1) + memcpy(data_chunks, pBlock, 16); + else { - if (first_byte & (1U << mode)) + data_chunks[0] = data_chunks[1] = 0; + for (int chunk_index = 0; chunk_index < 2; chunk_index++) { - switch (mode) - { - case 0: - case 2: - return unpack_bc7_mode0_2(mode, pBlock, pPixels); - case 1: - case 3: - case 7: - return unpack_bc7_mode1_3_7(mode, pBlock, pPixels); - case 4: - case 5: - return unpack_bc7_mode4_5(mode, pBlock, pPixels); - case 6: - return unpack_bc7_mode6(pBlock, pPixels); - default: - break; - } + for (int byte_index = 0; byte_index < 8; byte_index++) + data_chunks[chunk_index] |= static_cast(block_bytes[chunk_index * 8 + byte_index]) << (byte_index * 8); } } + switch (mode) + { + case 0: + case 2: + return unpack_bc7_mode0_2(mode, data_chunks, pPixels); + case 1: + case 3: + case 7: + return unpack_bc7_mode1_3_7(mode, data_chunks, pPixels); + case 4: + case 5: + return unpack_bc7_mode4_5(mode, data_chunks, pPixels); + case 6: + return unpack_bc7_mode6(data_chunks, pPixels); + default: + memset(pPixels, 0, sizeof(color_rgba) * 16); + 
break; + } + return false; } diff --git a/libkram/bc7enc/bc7decomp.h b/libkram/bc7enc/bc7decomp.h index cccdf50e..49dc9341 100644 --- a/libkram/bc7enc/bc7decomp.h +++ b/libkram/bc7enc/bc7decomp.h @@ -1,8 +1,13 @@ #pragma once +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4201) // nonstandard extension used: nameless struct/union +#endif + #include #include -//#include +#include #include #include @@ -163,3 +168,13 @@ class color_rgba bool unpack_bc7(const void *pBlock, color_rgba *pPixels); } // namespace bc7decomp + +namespace bc7decomp_ref +{ + bool unpack_bc7(const void* pBlock, bc7decomp::color_rgba* pPixels); +} // namespace bc7decomp_ref + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + diff --git a/libkram/bc7enc/bc7decomp_ref.cpp b/libkram/bc7enc/bc7decomp_ref.cpp new file mode 100644 index 00000000..8a69e947 --- /dev/null +++ b/libkram/bc7enc/bc7decomp_ref.cpp @@ -0,0 +1,431 @@ +// File: bc7decomp.c - Richard Geldreich, Jr. 3/31/2020 - MIT license or public domain (see end of file) +#include "bc7decomp.h" + +using namespace bc7decomp; + +namespace bc7decomp_ref +{ + +const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 }; +const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; +const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + +const uint8_t g_bc7_partition2[64 * 16] = +{ + 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, + 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 
0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, + 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, + 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, + 0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, + 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 +}; + +const uint8_t g_bc7_partition3[64 * 16] = +{ + 0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1, + 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 
0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0, + 0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2, 0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0, 0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2, 0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1, 0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2, 0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1, 0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2, 0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0, + 0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0, 0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2, 0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0, 0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1, 0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2, 0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2, 0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1, 0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1, + 0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2, 0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1, 0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2, 0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0, 0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0, 0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0, 0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0, 0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1, + 0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1, 0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1, 0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2, 0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1, 0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1, 0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1, 0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1, + 0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2, 0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1, 0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2, 0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2, 0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2, 0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2, 0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2, + 0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0, +}; + +const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 }; + 
+const uint8_t g_bc7_table_anchor_index_third_subset_1[64] = +{ + 3, 3,15,15, 8, 3,15,15, 8, 8, 6, 6, 6, 5, 3, 3, 3, 3, 8,15, 3, 3, 6,10, 5, 8, 8, 6, 8, 5,15,15, 8,15, 3, 5, 6,10, 8,15, 15, 3,15, 5,15,15,15,15, 3,15, 5, 5, 5, 8, 5,10, 5,10, 8,13,15,12, 3, 3 +}; + +const uint8_t g_bc7_table_anchor_index_third_subset_2[64] = +{ + 15, 8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8 +}; + +inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) +{ + assert(codesize <= 32); + uint32_t bits = 0; + uint32_t total_bits = 0; + + while (total_bits < codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t bits_to_read = std::min(codesize - total_bits, 8 - byte_bit_offset); + + uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; + byte_bits &= ((1 << bits_to_read) - 1); + + bits |= (byte_bits << total_bits); + + total_bits += bits_to_read; + bit_offset += bits_to_read; + } + + return bits; +} + +// BC7 mode 0-7 decompression. +// Instead of one monster routine to unpack all the BC7 modes, we're lumping the 3 subset, 2 subset, 1 subset, and dual plane modes together into simple shared routines. 
+ +static inline uint32_t bc7_dequant(uint32_t val, uint32_t pbit, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(pbit < 2); assert(val_bits >= 4 && val_bits <= 8); const uint32_t total_bits = val_bits + 1; val = (val << 1) | pbit; val <<= (8 - total_bits); val |= (val >> total_bits); assert(val <= 255); return val; } +static inline uint32_t bc7_dequant(uint32_t val, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(val_bits >= 4 && val_bits <= 8); val <<= (8 - val_bits); val |= (val >> val_bits); assert(val <= 255); return val; } + +static inline uint32_t bc7_interp2(uint32_t l, uint32_t h, uint32_t w) { assert(w < 4); return (l * (64 - g_bc7_weights2[w]) + h * g_bc7_weights2[w] + 32) >> 6; } +static inline uint32_t bc7_interp3(uint32_t l, uint32_t h, uint32_t w) { assert(w < 8); return (l * (64 - g_bc7_weights3[w]) + h * g_bc7_weights3[w] + 32) >> 6; } +static inline uint32_t bc7_interp4(uint32_t l, uint32_t h, uint32_t w) { assert(w < 16); return (l * (64 - g_bc7_weights4[w]) + h * g_bc7_weights4[w] + 32) >> 6; } +static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t bits) +{ + assert(l <= 255 && h <= 255); + switch (bits) + { + case 2: return bc7_interp2(l, h, w); + case 3: return bc7_interp3(l, h, w); + case 4: return bc7_interp4(l, h, w); + default: + break; + } + return 0; +} + +bool unpack_bc7_mode0_2(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) +{ + //const uint32_t SUBSETS = 3; + const uint32_t ENDPOINTS = 6; + const uint32_t COMPS = 3; + const uint32_t WEIGHT_BITS = (mode == 0) ? 3 : 2; + const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5; + const uint32_t PBITS = (mode == 0) ? 6 : 0; + const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; + + uint32_t bit_offset = 0; + const uint8_t* pBuf = static_cast(pBlock_bits); + + if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + + const uint32_t part = read_bits32(pBuf, bit_offset, (mode == 0) ? 
4 : 6); + + color_rgba endpoints[ENDPOINTS]; + for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t e = 0; e < ENDPOINTS; e++) + endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS); + + uint32_t pbits[6]; + for (uint32_t p = 0; p < PBITS; p++) + pbits[p] = read_bits32(pBuf, bit_offset, 1); + + uint32_t weights[16]; + for (uint32_t i = 0; i < 16; i++) + weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == g_bc7_table_anchor_index_third_subset_1[part]) || (i == g_bc7_table_anchor_index_third_subset_2[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS); + + assert(bit_offset == 128); + + for (uint32_t e = 0; e < ENDPOINTS; e++) + for (uint32_t c = 0; c < 4; c++) + endpoints[e][c] = (uint8_t)((c == 3) ? 255 : (PBITS ? bc7_dequant(endpoints[e][c], pbits[e], ENDPOINT_BITS) : bc7_dequant(endpoints[e][c], ENDPOINT_BITS))); + + color_rgba block_colors[3][8]; + for (uint32_t s = 0; s < 3; s++) + for (uint32_t i = 0; i < WEIGHT_VALS; i++) + { + for (uint32_t c = 0; c < 3; c++) + block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS); + block_colors[s][i][3] = 255; + } + + for (uint32_t i = 0; i < 16; i++) + pPixels[i] = block_colors[g_bc7_partition3[part * 16 + i]][weights[i]]; + + return true; +} + +bool unpack_bc7_mode1_3_7(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) +{ + //const uint32_t SUBSETS = 2; + const uint32_t ENDPOINTS = 4; + const uint32_t COMPS = (mode == 7) ? 4 : 3; + const uint32_t WEIGHT_BITS = (mode == 1) ? 3 : 2; + const uint32_t ENDPOINT_BITS = (mode == 7) ? 5 : ((mode == 1) ? 6 : 7); + const uint32_t PBITS = (mode == 1) ? 2 : 4; + const uint32_t SHARED_PBITS = (mode == 1) ? 
true : false; + const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; + + uint32_t bit_offset = 0; + const uint8_t* pBuf = static_cast(pBlock_bits); + + if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + + const uint32_t part = read_bits32(pBuf, bit_offset, 6); + + color_rgba endpoints[ENDPOINTS]; + for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t e = 0; e < ENDPOINTS; e++) + endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS); + + uint32_t pbits[4]; + for (uint32_t p = 0; p < PBITS; p++) + pbits[p] = read_bits32(pBuf, bit_offset, 1); + + uint32_t weights[16]; + for (uint32_t i = 0; i < 16; i++) + weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == g_bc7_table_anchor_index_second_subset[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS); + + assert(bit_offset == 128); + + for (uint32_t e = 0; e < ENDPOINTS; e++) + for (uint32_t c = 0; c < 4; c++) + endpoints[e][c] = (uint8_t)((c == ((mode == 7U) ? 4U : 3U)) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS)); + + color_rgba block_colors[2][8]; + for (uint32_t s = 0; s < 2; s++) + for (uint32_t i = 0; i < WEIGHT_VALS; i++) + { + for (uint32_t c = 0; c < COMPS; c++) + block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS); + block_colors[s][i][3] = (COMPS == 3) ? 255 : block_colors[s][i][3]; + } + + for (uint32_t i = 0; i < 16; i++) + pPixels[i] = block_colors[g_bc7_partition2[part * 16 + i]][weights[i]]; + + return true; +} + +bool unpack_bc7_mode4_5(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) +{ + const uint32_t ENDPOINTS = 2; + const uint32_t COMPS = 4; + const uint32_t WEIGHT_BITS = 2; + const uint32_t A_WEIGHT_BITS = (mode == 4) ? 3 : 2; + const uint32_t ENDPOINT_BITS = (mode == 4) ? 5 : 7; + const uint32_t A_ENDPOINT_BITS = (mode == 4) ? 
6 : 8; + //const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS; + //const uint32_t A_WEIGHT_VALS = 1 << A_WEIGHT_BITS; + + uint32_t bit_offset = 0; + const uint8_t* pBuf = static_cast(pBlock_bits); + + if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; + + const uint32_t comp_rot = read_bits32(pBuf, bit_offset, 2); + const uint32_t index_mode = (mode == 4) ? read_bits32(pBuf, bit_offset, 1) : 0; + + color_rgba endpoints[ENDPOINTS]; + for (uint32_t c = 0; c < COMPS; c++) + for (uint32_t e = 0; e < ENDPOINTS; e++) + endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS); + + const uint32_t weight_bits[2] = { index_mode ? A_WEIGHT_BITS : WEIGHT_BITS, index_mode ? WEIGHT_BITS : A_WEIGHT_BITS }; + + uint32_t weights[16], a_weights[16]; + + for (uint32_t i = 0; i < 16; i++) + (index_mode ? a_weights : weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[index_mode] - ((!i) ? 1 : 0)); + + for (uint32_t i = 0; i < 16; i++) + (index_mode ? weights : a_weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[1 - index_mode] - ((!i) ? 1 : 0)); + + assert(bit_offset == 128); + + for (uint32_t e = 0; e < ENDPOINTS; e++) + for (uint32_t c = 0; c < 4; c++) + endpoints[e][c] = (uint8_t)bc7_dequant(endpoints[e][c], (c == 3) ? 
A_ENDPOINT_BITS : ENDPOINT_BITS); + + color_rgba block_colors[8]; + for (uint32_t i = 0; i < (1U << weight_bits[0]); i++) + for (uint32_t c = 0; c < 3; c++) + block_colors[i][c] = (uint8_t)bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]); + + for (uint32_t i = 0; i < (1U << weight_bits[1]); i++) + block_colors[i][3] = (uint8_t)bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]); + + for (uint32_t i = 0; i < 16; i++) + { + pPixels[i] = block_colors[weights[i]]; + pPixels[i].a = block_colors[a_weights[i]].a; + if (comp_rot >= 1) + std::swap(pPixels[i].a, pPixels[i].m_comps[comp_rot - 1]); + } + + return true; +} + +struct bc7_mode_6 +{ + struct + { + uint64_t m_mode : 7; + uint64_t m_r0 : 7; + uint64_t m_r1 : 7; + uint64_t m_g0 : 7; + uint64_t m_g1 : 7; + uint64_t m_b0 : 7; + uint64_t m_b1 : 7; + uint64_t m_a0 : 7; + uint64_t m_a1 : 7; + uint64_t m_p0 : 1; + } m_lo; + + union + { + struct + { + uint64_t m_p1 : 1; + uint64_t m_s00 : 3; + uint64_t m_s10 : 4; + uint64_t m_s20 : 4; + uint64_t m_s30 : 4; + + uint64_t m_s01 : 4; + uint64_t m_s11 : 4; + uint64_t m_s21 : 4; + uint64_t m_s31 : 4; + + uint64_t m_s02 : 4; + uint64_t m_s12 : 4; + uint64_t m_s22 : 4; + uint64_t m_s32 : 4; + + uint64_t m_s03 : 4; + uint64_t m_s13 : 4; + uint64_t m_s23 : 4; + uint64_t m_s33 : 4; + + } m_hi; + + uint64_t m_hi_bits; + }; +}; + +bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels) +{ + static_assert(sizeof(bc7_mode_6) == 16, "sizeof(bc7_mode_6) == 16"); + + const bc7_mode_6 &block = *static_cast(pBlock_bits); + + if (block.m_lo.m_mode != (1 << 6)) + return false; + + const uint32_t r0 = (uint32_t)((block.m_lo.m_r0 << 1) | block.m_lo.m_p0); + const uint32_t g0 = (uint32_t)((block.m_lo.m_g0 << 1) | block.m_lo.m_p0); + const uint32_t b0 = (uint32_t)((block.m_lo.m_b0 << 1) | block.m_lo.m_p0); + const uint32_t a0 = (uint32_t)((block.m_lo.m_a0 << 1) | block.m_lo.m_p0); + const uint32_t r1 = (uint32_t)((block.m_lo.m_r1 << 1) | block.m_hi.m_p1); + 
const uint32_t g1 = (uint32_t)((block.m_lo.m_g1 << 1) | block.m_hi.m_p1); + const uint32_t b1 = (uint32_t)((block.m_lo.m_b1 << 1) | block.m_hi.m_p1); + const uint32_t a1 = (uint32_t)((block.m_lo.m_a1 << 1) | block.m_hi.m_p1); + + color_rgba vals[16]; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t w = g_bc7_weights4[i]; + const uint32_t iw = 64 - w; + vals[i].set_noclamp_rgba( + (r0 * iw + r1 * w + 32) >> 6, + (g0 * iw + g1 * w + 32) >> 6, + (b0 * iw + b1 * w + 32) >> 6, + (a0 * iw + a1 * w + 32) >> 6); + } + + pPixels[0] = vals[block.m_hi.m_s00]; + pPixels[1] = vals[block.m_hi.m_s10]; + pPixels[2] = vals[block.m_hi.m_s20]; + pPixels[3] = vals[block.m_hi.m_s30]; + + pPixels[4] = vals[block.m_hi.m_s01]; + pPixels[5] = vals[block.m_hi.m_s11]; + pPixels[6] = vals[block.m_hi.m_s21]; + pPixels[7] = vals[block.m_hi.m_s31]; + + pPixels[8] = vals[block.m_hi.m_s02]; + pPixels[9] = vals[block.m_hi.m_s12]; + pPixels[10] = vals[block.m_hi.m_s22]; + pPixels[11] = vals[block.m_hi.m_s32]; + + pPixels[12] = vals[block.m_hi.m_s03]; + pPixels[13] = vals[block.m_hi.m_s13]; + pPixels[14] = vals[block.m_hi.m_s23]; + pPixels[15] = vals[block.m_hi.m_s33]; + + return true; +} + +bool unpack_bc7(const void *pBlock, bc7decomp::color_rgba *pPixels) +{ + const uint32_t first_byte = static_cast(pBlock)[0]; + + for (uint32_t mode = 0; mode <= 7; mode++) + { + if (first_byte & (1U << mode)) + { + switch (mode) + { + case 0: + case 2: + return unpack_bc7_mode0_2(mode, pBlock, pPixels); + case 1: + case 3: + case 7: + return unpack_bc7_mode1_3_7(mode, pBlock, pPixels); + case 4: + case 5: + return unpack_bc7_mode4_5(mode, pBlock, pPixels); + case 6: + return unpack_bc7_mode6(pBlock, pPixels); + default: + break; + } + } + } + + return false; +} + +} // namespace bc7decomp_ref + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. 
+------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright(c) 2020 Richard Geldreich, Jr. +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files(the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions : +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain(www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non - commercial, and by any means. 
+In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain.We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors.We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ + diff --git a/libkram/bc7enc/bc7enc.cpp b/libkram/bc7enc/bc7enc.cpp index b2403b84..4cbdd552 100644 --- a/libkram/bc7enc/bc7enc.cpp +++ b/libkram/bc7enc/bc7enc.cpp @@ -1,82 +1,49 @@ // File: bc7enc.c - Richard Geldreich, Jr. 3/31/2020 - MIT license or public domain (see end of file) // Currently supports modes 1, 6 for RGB blocks, and modes 5, 6, 7 for RGBA blocks. -// NOTE: This module is still a work in progress as of 3/31/2020. It needs to support mode modes for RGB content. #include "bc7enc.h" #include #include #include #include -#include - -// Make mapping to simd classes more simple. -// Repeated individual ops instead of functions that can be optimized don't result in a speedup. -// The algorithm is already so fast. 
-#define USE_SIMD_BCENC 0 +#include // Helpers static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } -#if !USE_SIMD_BCENC static inline float saturate(float value) { return clampf(value, 0, 1.0f); } -#endif //static inline uint8_t minimumub(uint8_t a, uint8_t b) { return (a < b) ? a : b; } +static inline int32_t minimumi(int32_t a, int32_t b) { return (a < b) ? a : b; } static inline uint32_t minimumu(uint32_t a, uint32_t b) { return (a < b) ? a : b; } static inline float minimumf(float a, float b) { return (a < b) ? a : b; } //static inline uint8_t maximumub(uint8_t a, uint8_t b) { return (a > b) ? a : b; } static inline uint32_t maximumu(uint32_t a, uint32_t b) { return (a > b) ? a : b; } +//static inline int32_t maximumi(int32_t a, int32_t b) { return (a > b) ? a : b; } static inline float maximumf(float a, float b) { return (a > b) ? 
a : b; } static inline int squarei(int i) { return i * i; } static inline float squaref(float i) { return i * i; } +template inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; } static inline int32_t iabs32(int32_t v) { uint32_t msk = v >> 31; return (v ^ msk) - msk; } //static inline void swapub(uint8_t* a, uint8_t* b) { uint8_t t = *a; *a = *b; *b = t; } static inline void swapu(uint32_t* a, uint32_t* b) { uint32_t t = *a; *a = *b; *b = t; } //static inline void swapf(float* a, float* b) { float t = *a; *a = *b; *b = t; } -struct color_quad_u8 { - uint8_t r, g, b, a; - inline const uint8_t& operator[](int index) const { return *(&r + index); } - inline uint8_t& operator[](int index) { return *(&r + index); } -}; - -static inline color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { pRes->r = (uint8_t)clampi(r, 0, 255); pRes->g = (uint8_t)clampi(g, 0, 255); pRes->b = (uint8_t)clampi(b, 0, 255); pRes->a = (uint8_t)clampi(a, 0, 255); return pRes; } -static inline color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { assert((uint32_t)(r | g | b | a) <= 255); pRes->r = (uint8_t)r; pRes->g = (uint8_t)g; pRes->b = (uint8_t)b; pRes->a = (uint8_t)a; return pRes; } -static inline bc7enc_bool color_quad_u8_notequals(const color_quad_u8 *pLHS, const color_quad_u8 *pRHS) { return (pLHS->r != pRHS->r) || (pLHS->g != pRHS->g) || (pLHS->b != pRHS->b) || (pLHS->a != pRHS->a); } - -#if USE_SIMD_BCENC -using namespace simd; -using vec4F = float4; - -static inline vec4F *vec4F_set_scalar(vec4F *pV, float x) { *pV = vec4F(x); return pV; } -static inline vec4F *vec4F_set(vec4F *pV, float x, float y, float z, float w) { *pV = float4m(x,y,z,w); return pV; } -static inline vec4F *vec4F_saturate_in_place(vec4F *pV) { *pV = saturate(*pV); return pV; } -static inline vec4F vec4F_saturate(const vec4F *pV) { vec4F res = saturate(*pV); return res; } - -static inline vec4F 
vec4F_from_color(const color_quad_u8 *pC) { vec4F res = float4m((float)pC->r, (float)pC->g, (float)pC->b, (float)pC->a); return res; } -static inline vec4F vec4F_add(const vec4F *pLHS, const vec4F *pRHS) { vec4F res = *pLHS + *pRHS; return res; } -static inline vec4F vec4F_sub(const vec4F *pLHS, const vec4F *pRHS) { vec4F res = *pLHS - *pRHS; return res; } -static inline float vec4F_dot(const vec4F *pLHS, const vec4F *pRHS) { return dot(*pLHS, *pRHS); } -static inline vec4F vec4F_mul(const vec4F *pLHS, float s) { vec4F res = *pLHS * s; return res; } -static inline vec4F *vec4F_normalize_in_place(vec4F *pV) { *pV = normalize(*pV); return pV; } +struct vec4F { float m_c[4]; }; + +static inline color_rgba *color_quad_u8_set_clamped(color_rgba *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { pRes->m_c[0] = (uint8_t)clampi(r, 0, 255); pRes->m_c[1] = (uint8_t)clampi(g, 0, 255); pRes->m_c[2] = (uint8_t)clampi(b, 0, 255); pRes->m_c[3] = (uint8_t)clampi(a, 0, 255); return pRes; } +static inline color_rgba *color_quad_u8_set(color_rgba *pRes, int32_t r, int32_t g, int32_t b, int32_t a) { assert((uint32_t)(r | g | b | a) <= 255); pRes->m_c[0] = (uint8_t)r; pRes->m_c[1] = (uint8_t)g; pRes->m_c[2] = (uint8_t)b; pRes->m_c[3] = (uint8_t)a; return pRes; } +static inline bool color_quad_u8_notequals(const color_rgba *pLHS, const color_rgba *pRHS) { return (pLHS->m_c[0] != pRHS->m_c[0]) || (pLHS->m_c[1] != pRHS->m_c[1]) || (pLHS->m_c[2] != pRHS->m_c[2]) || (pLHS->m_c[3] != pRHS->m_c[3]); } +static inline vec4F *vec4F_set_scalar(vec4F *pV, float x) { pV->m_c[0] = x; pV->m_c[1] = x; pV->m_c[2] = x; pV->m_c[3] = x; return pV; } +static inline vec4F *vec4F_set(vec4F *pV, float x, float y, float z, float w) { pV->m_c[0] = x; pV->m_c[1] = y; pV->m_c[2] = z; pV->m_c[3] = w; return pV; } +static inline vec4F *vec4F_saturate_in_place(vec4F *pV) { pV->m_c[0] = saturate(pV->m_c[0]); pV->m_c[1] = saturate(pV->m_c[1]); pV->m_c[2] = saturate(pV->m_c[2]); pV->m_c[3] = saturate(pV->m_c[3]); 
return pV; } +static inline vec4F vec4F_saturate(const vec4F *pV) { vec4F res; res.m_c[0] = saturate(pV->m_c[0]); res.m_c[1] = saturate(pV->m_c[1]); res.m_c[2] = saturate(pV->m_c[2]); res.m_c[3] = saturate(pV->m_c[3]); return res; } +static inline vec4F vec4F_from_color(const color_rgba *pC) { vec4F res; vec4F_set(&res, pC->m_c[0], pC->m_c[1], pC->m_c[2], pC->m_c[3]); return res; } +static inline vec4F vec4F_add(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->m_c[0] + pRHS->m_c[0], pLHS->m_c[1] + pRHS->m_c[1], pLHS->m_c[2] + pRHS->m_c[2], pLHS->m_c[3] + pRHS->m_c[3]); return res; } +static inline vec4F vec4F_sub(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->m_c[0] - pRHS->m_c[0], pLHS->m_c[1] - pRHS->m_c[1], pLHS->m_c[2] - pRHS->m_c[2], pLHS->m_c[3] - pRHS->m_c[3]); return res; } +static inline float vec4F_dot(const vec4F *pLHS, const vec4F *pRHS) { return pLHS->m_c[0] * pRHS->m_c[0] + pLHS->m_c[1] * pRHS->m_c[1] + pLHS->m_c[2] * pRHS->m_c[2] + pLHS->m_c[3] * pRHS->m_c[3]; } +static inline vec4F vec4F_mul(const vec4F *pLHS, float s) { vec4F res; vec4F_set(&res, pLHS->m_c[0] * s, pLHS->m_c[1] * s, pLHS->m_c[2] * s, pLHS->m_c[3] * s); return res; } +static inline vec4F *vec4F_normalize_in_place(vec4F *pV) { float s = pV->m_c[0] * pV->m_c[0] + pV->m_c[1] * pV->m_c[1] + pV->m_c[2] * pV->m_c[2] + pV->m_c[3] * pV->m_c[3]; if (s != 0.0f) { s = 1.0f / sqrtf(s); pV->m_c[0] *= s; pV->m_c[1] *= s; pV->m_c[2] *= s; pV->m_c[3] *= s; } return pV; } -#else -struct vec4F { - float r, g, b, a; - inline const float& operator[](int index) const { return *(&r + index); } - inline float& operator[](int index) { return *(&r + index); } -}; - -static inline vec4F *vec4F_set_scalar(vec4F *pV, float x) { pV->r = x; pV->g = x; pV->b = x; pV->a = x; return pV; } -static inline vec4F *vec4F_set(vec4F *pV, float x, float y, float z, float w) { pV->r = x; pV->g = y; pV->b = z; pV->a = w; return pV; } -static inline vec4F 
*vec4F_saturate_in_place(vec4F *pV) { pV->r = saturate(pV->r); pV->g = saturate(pV->g); pV->b = saturate(pV->b); pV->a = saturate(pV->a); return pV; } -static inline vec4F vec4F_saturate(const vec4F *pV) { vec4F res; res.r = saturate(pV->r); res.g = saturate(pV->g); res.b = saturate(pV->b); res.a = saturate(pV->a); return res; } -static inline vec4F vec4F_from_color(const color_quad_u8 *pC) { vec4F res; vec4F_set(&res, pC->r, pC->g, pC->b, pC->a); return res; } -static inline vec4F vec4F_add(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->r + pRHS->r, pLHS->g + pRHS->g, pLHS->b + pRHS->b, pLHS->a + pRHS->a); return res; } -static inline vec4F vec4F_sub(const vec4F *pLHS, const vec4F *pRHS) { vec4F res; vec4F_set(&res, pLHS->r - pRHS->r, pLHS->g - pRHS->g, pLHS->b - pRHS->b, pLHS->a - pRHS->a); return res; } -static inline float vec4F_dot(const vec4F *pLHS, const vec4F *pRHS) { return pLHS->r * pRHS->r + pLHS->g * pRHS->g + pLHS->b * pRHS->b + pLHS->a * pRHS->a; } -static inline vec4F vec4F_mul(const vec4F *pLHS, float s) { vec4F res; vec4F_set(&res, pLHS->r * s, pLHS->g * s, pLHS->b * s, pLHS->a * s); return res; } -static inline vec4F *vec4F_normalize_in_place(vec4F *pV) { float s = pV->r * pV->r + pV->g * pV->g + pV->b * pV->b + pV->a * pV->a; if (s != 0.0f) { s = 1.0f / sqrtf(s); pV->r *= s; pV->g *= s; pV->b *= s; pV->a *= s; } return pV; } -#endif // Various BC7 tables static const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 }; static const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; @@ -135,7 +102,7 @@ static const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 }; static const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 }; static const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 }; static const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 }; -static bc7enc_bool get_bc7_mode_has_seperate_alpha_selectors(int mode) { return (mode == 4) || 
(mode == 5); } +static bool get_bc7_mode_has_seperate_alpha_selectors(int mode) { return (mode == 4) || (mode == 5); } typedef struct { uint16_t m_error; uint8_t m_lo; uint8_t m_hi; } endpoint_err; @@ -145,9 +112,105 @@ static const uint32_t BC7ENC_MODE_1_OPTIMAL_INDEX = 2; static endpoint_err g_bc7_mode_7_optimal_endpoints[256][2][2]; // [c][pbit][hp][lp] const uint32_t BC7E_MODE_7_OPTIMAL_INDEX = 1; -// Initialize the lookup table used for optimal single color compression in mode 1. Must be called before encoding. +static float g_mode1_rgba_midpoints[64][2]; +static float g_mode5_rgba_midpoints[128]; +static float g_mode7_rgba_midpoints[32][2]; + +static uint8_t g_mode6_reduced_quant[2048][2]; + +static bool g_initialized; + +// Initialize the lookup table used for optimal single color compression in mode 1/7. Must be called before encoding. void bc7enc_compress_block_init() { + if (g_initialized) + return; + + // Mode 7 endpoint midpoints + for (uint32_t p = 0; p < 2; p++) + { + for (uint32_t i = 0; i < 32; i++) + { + uint32_t vl = ((i << 1) | p) << 2; + vl |= (vl >> 6); + float lo = vl / 255.0f; + + uint32_t vh = ((minimumi(31, (i + 1)) << 1) | p) << 2; + vh |= (vh >> 6); + float hi = vh / 255.0f; + + //g_mode7_quant_values[i][p] = lo; + if (i == 31) + g_mode7_rgba_midpoints[i][p] = 1.0f; + else + g_mode7_rgba_midpoints[i][p] = (lo + hi) / 2.0f; + } + } + + // Mode 1 endpoint midpoints + for (uint32_t p = 0; p < 2; p++) + { + for (uint32_t i = 0; i < 64; i++) + { + uint32_t vl = ((i << 1) | p) << 1; + vl |= (vl >> 7); + float lo = vl / 255.0f; + + uint32_t vh = ((minimumi(63, (i + 1)) << 1) | p) << 1; + vh |= (vh >> 7); + float hi = vh / 255.0f; + + //g_mode1_quant_values[i][p] = lo; + if (i == 63) + g_mode1_rgba_midpoints[i][p] = 1.0f; + else + g_mode1_rgba_midpoints[i][p] = (lo + hi) / 2.0f; + } + } + + // Mode 5 endpoint midpoints + for (uint32_t i = 0; i < 128; i++) + { + uint32_t vl = (i << 1); + vl |= (vl >> 7); + float lo = vl / 255.0f; + + uint32_t vh = 
minimumi(127, i + 1) << 1; + vh |= (vh >> 7); + float hi = vh / 255.0f; + + if (i == 127) + g_mode5_rgba_midpoints[i] = 1.0f; + else + g_mode5_rgba_midpoints[i] = (lo + hi) / 2.0f; + } + + for (uint32_t p = 0; p < 2; p++) + { + for (uint32_t i = 0; i < 2048; i++) + { + float f = i / 2047.0f; + + float best_err = 1e+9f; + int best_index = 0; + for (int j = 0; j < 64; j++) + { + int ik = (j * 127 + 31) / 63; + float k = ((ik << 1) + p) / 255.0f; + + float e = fabsf(k - f); + if (e < best_err) + { + best_err = e; + best_index = ik; + } + } + + g_mode6_reduced_quant[i][p] = (uint8_t)best_index; + } + } // p + + // Mode 1 for (int c = 0; c < 256; c++) { for (uint32_t lp = 0; lp < 2; lp++) @@ -217,9 +280,11 @@ void bc7enc_compress_block_init() } // lp } // c + + g_initialized = true; } -static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors) +static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_rgba *pColors) { // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
@@ -231,14 +296,14 @@ static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSel for (uint32_t i = 0; i < N; i++) { const uint32_t sel = pSelectors[i]; - z00 += pSelector_weights[sel][0]; - z10 += pSelector_weights[sel][1]; - z11 += pSelector_weights[sel][2]; - float w = pSelector_weights[sel][3]; - q00_r += w * pColors[i].r; t_r += pColors[i].r; - q00_g += w * pColors[i].g; t_g += pColors[i].g; - q00_b += w * pColors[i].b; t_b += pColors[i].b; - q00_a += w * pColors[i].a; t_a += pColors[i].a; + z00 += pSelector_weights[sel].m_c[0]; + z10 += pSelector_weights[sel].m_c[1]; + z11 += pSelector_weights[sel].m_c[2]; + float w = pSelector_weights[sel].m_c[3]; + q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; + q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; + q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; + q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3]; } q10_r = t_r - q00_r; @@ -258,32 +323,32 @@ static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSel iz10 = -z10 * det; iz11 = z00 * det; - pXl->r = (float)(iz00 * q00_r + iz01 * q10_r); pXh->r = (float)(iz10 * q00_r + iz11 * q10_r); - pXl->g = (float)(iz00 * q00_g + iz01 * q10_g); pXh->g = (float)(iz10 * q00_g + iz11 * q10_g); - pXl->b = (float)(iz00 * q00_b + iz01 * q10_b); pXh->b = (float)(iz10 * q00_b + iz11 * q10_b); - pXl->a = (float)(iz00 * q00_a + iz01 * q10_a); pXh->a = (float)(iz10 * q00_a + iz11 * q10_a); + pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r); + pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g); + pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b); + pXl->m_c[3] = (float)(iz00 * q00_a + iz01 * q10_a); pXh->m_c[3] = (float)(iz10 * q00_a + iz11 * q10_a); for (uint32_t c = 0; c < 4; c++) { - if (((*pXl)[c] < 0.0f) || ((*pXh)[c] > 255.0f)) + if ((pXl->m_c[c] 
< 0.0f) || (pXh->m_c[c] > 255.0f)) { uint32_t lo_v = UINT32_MAX, hi_v = 0; for (uint32_t i = 0; i < N; i++) { - lo_v = minimumu(lo_v, pColors[i][c]); - hi_v = maximumu(hi_v, pColors[i][c]); + lo_v = minimumu(lo_v, pColors[i].m_c[c]); + hi_v = maximumu(hi_v, pColors[i].m_c[c]); } if (lo_v == hi_v) { - (*pXl)[c] = (float)lo_v; - (*pXh)[c] = (float)hi_v; + pXl->m_c[c] = (float)lo_v; + pXh->m_c[c] = (float)hi_v; } } } } -static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors) +static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_rgba*pColors) { float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; @@ -292,13 +357,13 @@ static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSele for (uint32_t i = 0; i < N; i++) { const uint32_t sel = pSelectors[i]; - z00 += pSelector_weights[sel].r; - z10 += pSelector_weights[sel].g; - z11 += pSelector_weights[sel].b; - float w = pSelector_weights[sel].a; - q00_r += w * pColors[i].r; t_r += pColors[i].r; - q00_g += w * pColors[i].g; t_g += pColors[i].g; - q00_b += w * pColors[i].b; t_b += pColors[i].b; + z00 += pSelector_weights[sel].m_c[0]; + z10 += pSelector_weights[sel].m_c[1]; + z11 += pSelector_weights[sel].m_c[2]; + float w = pSelector_weights[sel].m_c[3]; + q00_r += w * pColors[i].m_c[0]; t_r += pColors[i].m_c[0]; + q00_g += w * pColors[i].m_c[1]; t_g += pColors[i].m_c[1]; + q00_b += w * pColors[i].m_c[2]; t_b += pColors[i].m_c[2]; } q10_r = t_r - q00_r; @@ -317,32 +382,32 @@ static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSele iz10 = -z10 * det; iz11 = z00 * det; - pXl->r = (float)(iz00 * q00_r + iz01 * q10_r); pXh->r = (float)(iz10 * q00_r + iz11 * q10_r); - pXl->g = (float)(iz00 * q00_g + iz01 * q10_g); pXh->g = 
(float)(iz10 * q00_g + iz11 * q10_g); - pXl->b = (float)(iz00 * q00_b + iz01 * q10_b); pXh->b = (float)(iz10 * q00_b + iz11 * q10_b); - pXl->a = 255.0f; pXh->a = 255.0f; + pXl->m_c[0] = (float)(iz00 * q00_r + iz01 * q10_r); pXh->m_c[0] = (float)(iz10 * q00_r + iz11 * q10_r); + pXl->m_c[1] = (float)(iz00 * q00_g + iz01 * q10_g); pXh->m_c[1] = (float)(iz10 * q00_g + iz11 * q10_g); + pXl->m_c[2] = (float)(iz00 * q00_b + iz01 * q10_b); pXh->m_c[2] = (float)(iz10 * q00_b + iz11 * q10_b); + pXl->m_c[3] = 255.0f; pXh->m_c[3] = 255.0f; for (uint32_t c = 0; c < 3; c++) { - if (((*pXl)[c] < 0.0f) || ((*pXh)[c] > 255.0f)) + if ((pXl->m_c[c] < 0.0f) || (pXh->m_c[c] > 255.0f)) { uint32_t lo_v = UINT32_MAX, hi_v = 0; for (uint32_t i = 0; i < N; i++) { - lo_v = minimumu(lo_v, pColors[i][c]); - hi_v = maximumu(hi_v, pColors[i][c]); + lo_v = minimumu(lo_v, pColors[i].m_c[c]); + hi_v = maximumu(hi_v, pColors[i].m_c[c]); } if (lo_v == hi_v) { - (*pXl)[c] = (float)lo_v; - (*pXh)[c] = (float)hi_v; + pXl->m_c[c] = (float)lo_v; + pXh->m_c[c] = (float)hi_v; } } } } -static void compute_least_squares_endpoints_a(uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, float* pXl, float* pXh, const color_quad_u8* pColors) +static void compute_least_squares_endpoints_a(uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, float* pXl, float* pXh, const color_rgba *pColors) { // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
@@ -352,13 +417,13 @@ static void compute_least_squares_endpoints_a(uint32_t N, const uint8_t* pSelect { const uint32_t sel = pSelectors[i]; - z00 += pSelector_weights[sel].r; - z10 += pSelector_weights[sel].g; - z11 += pSelector_weights[sel].b; + z00 += pSelector_weights[sel].m_c[0]; + z10 += pSelector_weights[sel].m_c[1]; + z11 += pSelector_weights[sel].m_c[2]; - float w = pSelector_weights[sel].a; + float w = pSelector_weights[sel].m_c[3]; - q00_a += w * pColors[i].a; t_a += pColors[i].a; + q00_a += w * pColors[i].m_c[3]; t_a += pColors[i].m_c[3]; } q10_a = t_a - q00_a; @@ -382,8 +447,8 @@ static void compute_least_squares_endpoints_a(uint32_t N, const uint8_t* pSelect uint32_t lo_v = UINT32_MAX, hi_v = 0; for (uint32_t i = 0; i < N; i++) { - lo_v = minimumu(lo_v, pColors[i].a); - hi_v = maximumu(hi_v, pColors[i].a); + lo_v = minimumu(lo_v, pColors[i].m_c[3]); + hi_v = maximumu(hi_v, pColors[i].m_c[3]); } if (lo_v == hi_v) @@ -394,78 +459,78 @@ static void compute_least_squares_endpoints_a(uint32_t N, const uint8_t* pSelect } } -typedef struct +struct color_cell_compressor_params { uint32_t m_num_pixels; - const color_quad_u8 *m_pPixels; + const color_rgba *m_pPixels; uint32_t m_num_selector_weights; const uint32_t *m_pSelector_weights; const vec4F *m_pSelector_weightsx; uint32_t m_comp_bits; uint32_t m_weights[4]; - bc7enc_bool m_has_alpha; - bc7enc_bool m_has_pbits; - bc7enc_bool m_endpoints_share_pbit; - bc7enc_bool m_perceptual; -} color_cell_compressor_params; + bool m_has_alpha; + bool m_has_pbits; + bool m_endpoints_share_pbit; + bool m_perceptual; +}; -typedef struct +struct color_cell_compressor_results { uint64_t m_best_overall_err; - color_quad_u8 m_low_endpoint; - color_quad_u8 m_high_endpoint; + color_rgba m_low_endpoint; + color_rgba m_high_endpoint; uint32_t m_pbits[2]; uint8_t *m_pSelectors; uint8_t *m_pSelectors_temp; -} color_cell_compressor_results; +}; -static inline color_quad_u8 scale_color(const color_quad_u8 *pC, const 
color_cell_compressor_params *pParams) +static inline color_rgba scale_color(const color_rgba *pC, const color_cell_compressor_params *pParams) { - color_quad_u8 results; + color_rgba results; const uint32_t n = pParams->m_comp_bits + (pParams->m_has_pbits ? 1 : 0); assert((n >= 4) && (n <= 8)); for (uint32_t i = 0; i < 4; i++) { - uint32_t v = (*pC)[i] << (8 - n); + uint32_t v = pC->m_c[i] << (8 - n); v |= (v >> n); assert(v <= 255); - results[i] = (uint8_t)(v); + results.m_c[i] = (uint8_t)(v); } return results; } -static inline uint64_t compute_color_distance_rgb(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc_bool perceptual, const uint32_t weights[4]) +static inline uint64_t compute_color_distance_rgb(const color_rgba *pE1, const color_rgba *pE2, bool perceptual, const uint32_t weights[4]) { int dr, dg, db; if (perceptual) { - const int l1 = pE1->r * 109 + pE1->g * 366 + pE1->b * 37; - const int cr1 = ((int)pE1->r << 9) - l1; - const int cb1 = ((int)pE1->b << 9) - l1; - const int l2 = pE2->r * 109 + pE2->g * 366 + pE2->b * 37; - const int cr2 = ((int)pE2->r << 9) - l2; - const int cb2 = ((int)pE2->b << 9) - l2; + const int l1 = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37; + const int cr1 = ((int)pE1->m_c[0] << 9) - l1; + const int cb1 = ((int)pE1->m_c[2] << 9) - l1; + const int l2 = pE2->m_c[0] * 109 + pE2->m_c[1] * 366 + pE2->m_c[2] * 37; + const int cr2 = ((int)pE2->m_c[0] << 9) - l2; + const int cb2 = ((int)pE2->m_c[2] << 9) - l2; dr = (l1 - l2) >> 8; dg = (cr1 - cr2) >> 8; db = (cb1 - cb2) >> 8; } else { - dr = (int)pE1->r - (int)pE2->r; - dg = (int)pE1->g - (int)pE2->g; - db = (int)pE1->b - (int)pE2->b; + dr = (int)pE1->m_c[0] - (int)pE2->m_c[0]; + dg = (int)pE1->m_c[1] - (int)pE2->m_c[1]; + db = (int)pE1->m_c[2] - (int)pE2->m_c[2]; } return weights[0] * (uint32_t)(dr * dr) + weights[1] * (uint32_t)(dg * dg) + weights[2] * (uint32_t)(db * db); } -static inline uint64_t compute_color_distance_rgba(const color_quad_u8 *pE1, const 
color_quad_u8 *pE2, bc7enc_bool perceptual, const uint32_t weights[4]) +static inline uint64_t compute_color_distance_rgba(const color_rgba *pE1, const color_rgba *pE2, bool perceptual, const uint32_t weights[4]) { - int da = (int)pE1->a - (int)pE2->a; + int da = (int)pE1->m_c[3] - (int)pE2->m_c[3]; return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + (weights[3] * (uint32_t)(da * da)); } @@ -497,18 +562,18 @@ static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pPar memset(pSelectors, BC7ENC_MODE_1_OPTIMAL_INDEX, pParams->m_num_pixels); - color_quad_u8 p; + color_rgba p; for (uint32_t i = 0; i < 3; i++) { - uint32_t low = ((pResults->m_low_endpoint[i] << 1) | pResults->m_pbits[0]) << 1; + uint32_t low = ((pResults->m_low_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1; low |= (low >> 7); - uint32_t high = ((pResults->m_high_endpoint[i] << 1) | pResults->m_pbits[0]) << 1; + uint32_t high = ((pResults->m_high_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1; high |= (high >> 7); - p[i] = (uint8_t)((low * (64 - g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX] + 32) >> 6); + p.m_c[i] = (uint8_t)((low * (64 - g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX] + 32) >> 6); } - p.a = 255; + p.m_c[3] = 255; uint64_t total_err = 0; for (uint32_t i = 0; i < pParams->m_num_pixels; i++) @@ -520,7 +585,7 @@ static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pPar } static uint64_t pack_mode7_to_one_color(const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, uint32_t r, uint32_t g, uint32_t b, uint32_t a, - uint8_t* pSelectors, uint32_t num_pixels, const color_quad_u8* pPixels) + uint8_t* pSelectors, uint32_t num_pixels, const color_rgba *pPixels) { uint32_t best_err = UINT_MAX; uint32_t best_p = 0; @@ -553,19 +618,19 @@ static uint64_t pack_mode7_to_one_color(const 
color_cell_compressor_params* pPar pResults->m_pbits[1] = best_hi_p; for (uint32_t i = 0; i < num_pixels; i++) - pSelectors[i] = BC7E_MODE_7_OPTIMAL_INDEX; + pSelectors[i] = (uint8_t)BC7E_MODE_7_OPTIMAL_INDEX; - color_quad_u8 p; + color_rgba p; for (uint32_t i = 0; i < 4; i++) { - uint32_t low = (pResults->m_low_endpoint[i] << 1) | pResults->m_pbits[0]; - uint32_t high = (pResults->m_high_endpoint[i] << 1) | pResults->m_pbits[1]; + uint32_t low = (pResults->m_low_endpoint.m_c[i] << 1) | pResults->m_pbits[0]; + uint32_t high = (pResults->m_high_endpoint.m_c[i] << 1) | pResults->m_pbits[1]; low = (low << 2) | (low >> 6); high = (high << 2) | (high >> 6); - p[i] = (low * (64 - g_bc7_weights2[BC7E_MODE_7_OPTIMAL_INDEX]) + high * g_bc7_weights2[BC7E_MODE_7_OPTIMAL_INDEX] + 32) >> 6; + p.m_c[i] = (uint8_t)((low * (64 - g_bc7_weights2[BC7E_MODE_7_OPTIMAL_INDEX]) + high * g_bc7_weights2[BC7E_MODE_7_OPTIMAL_INDEX] + 32) >> 6); } uint64_t total_err = 0; @@ -577,10 +642,11 @@ static uint64_t pack_mode7_to_one_color(const color_cell_compressor_params* pPar return total_err; } -static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) +static uint64_t evaluate_solution(const color_rgba *pLow, const color_rgba *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, + const bc7enc_compress_block_params* pComp_params) { - color_quad_u8 quantMinColor = *pLow; - color_quad_u8 quantMaxColor = *pHigh; + color_rgba quantMinColor = *pLow; + color_rgba quantMaxColor = *pHigh; if (pParams->m_has_pbits) { @@ -594,62 +660,79 @@ static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 maxPBit = pbits[1]; } - quantMinColor.r = (uint8_t)((pLow->r << 1) | minPBit); - quantMinColor.g = (uint8_t)((pLow->g << 1) | minPBit); - quantMinColor.b = (uint8_t)((pLow->b << 1) | 
minPBit); - quantMinColor.a = (uint8_t)((pLow->a << 1) | minPBit); + quantMinColor.m_c[0] = (uint8_t)((pLow->m_c[0] << 1) | minPBit); + quantMinColor.m_c[1] = (uint8_t)((pLow->m_c[1] << 1) | minPBit); + quantMinColor.m_c[2] = (uint8_t)((pLow->m_c[2] << 1) | minPBit); + quantMinColor.m_c[3] = (uint8_t)((pLow->m_c[3] << 1) | minPBit); - quantMaxColor.r = (uint8_t)((pHigh->r << 1) | maxPBit); - quantMaxColor.g = (uint8_t)((pHigh->g << 1) | maxPBit); - quantMaxColor.b = (uint8_t)((pHigh->b << 1) | maxPBit); - quantMaxColor.a = (uint8_t)((pHigh->a << 1) | maxPBit); + quantMaxColor.m_c[0] = (uint8_t)((pHigh->m_c[0] << 1) | maxPBit); + quantMaxColor.m_c[1] = (uint8_t)((pHigh->m_c[1] << 1) | maxPBit); + quantMaxColor.m_c[2] = (uint8_t)((pHigh->m_c[2] << 1) | maxPBit); + quantMaxColor.m_c[3] = (uint8_t)((pHigh->m_c[3] << 1) | maxPBit); } - color_quad_u8 actualMinColor = scale_color(&quantMinColor, pParams); - color_quad_u8 actualMaxColor = scale_color(&quantMaxColor, pParams); + color_rgba actualMinColor = scale_color(&quantMinColor, pParams); + color_rgba actualMaxColor = scale_color(&quantMaxColor, pParams); const uint32_t N = pParams->m_num_selector_weights; - color_quad_u8 weightedColors[16]; + color_rgba weightedColors[16]; weightedColors[0] = actualMinColor; weightedColors[N - 1] = actualMaxColor; const uint32_t nc = pParams->m_has_alpha ? 
4 : 3; for (uint32_t i = 1; i < (N - 1); i++) for (uint32_t j = 0; j < nc; j++) - weightedColors[i][j] = (uint8_t)((actualMinColor[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor[j] * pParams->m_pSelector_weights[i] + 32) >> 6); - - const int lr = actualMinColor.r; - const int lg = actualMinColor.g; - const int lb = actualMinColor.b; - const int dr = actualMaxColor.r - lr; - const int dg = actualMaxColor.g - lg; - const int db = actualMaxColor.b - lb; + weightedColors[i].m_c[j] = (uint8_t)((actualMinColor.m_c[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams->m_pSelector_weights[i] + 32) >> 6); + + const int lr = actualMinColor.m_c[0]; + const int lg = actualMinColor.m_c[1]; + const int lb = actualMinColor.m_c[2]; + const int dr = actualMaxColor.m_c[0] - lr; + const int dg = actualMaxColor.m_c[1] - lg; + const int db = actualMaxColor.m_c[2] - lb; uint64_t total_err = 0; - - if (!pParams->m_perceptual) + + if (pComp_params->m_force_selectors) + { + for (uint32_t i = 0; i < pParams->m_num_pixels; i++) + { + const uint32_t best_sel = pComp_params->m_selectors[i]; + + uint64_t best_err; + if (pParams->m_has_alpha) + best_err = compute_color_distance_rgba(&weightedColors[best_sel], &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + else + best_err = compute_color_distance_rgb(&weightedColors[best_sel], &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights); + + total_err += best_err; + + pResults->m_pSelectors_temp[i] = (uint8_t)best_sel; + } + } + else if (!pParams->m_perceptual) { if (pParams->m_has_alpha) { - const int la = actualMinColor.a; - const int da = actualMaxColor.a - la; + const int la = actualMinColor.m_c[3]; + const int da = actualMaxColor.m_c[3] - la; const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f); for (uint32_t i = 0; i < pParams->m_num_pixels; i++) { - const color_quad_u8 *pC = &pParams->m_pPixels[i]; - int r = pC->r; - int g = 
pC->g; - int b = pC->b; - int a = pC->a; + const color_rgba *pC = &pParams->m_pPixels[i]; + int r = pC->m_c[0]; + int g = pC->m_c[1]; + int b = pC->m_c[2]; + int a = pC->m_c[3]; int best_sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f); best_sel = clampi(best_sel, 1, N - 1); - uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC_FALSE, pParams->m_weights); - uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC_FALSE, pParams->m_weights); + uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, false, pParams->m_weights); + uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, false, pParams->m_weights); if (err1 > err0) { @@ -667,16 +750,16 @@ static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 for (uint32_t i = 0; i < pParams->m_num_pixels; i++) { - const color_quad_u8 *pC = &pParams->m_pPixels[i]; - int r = pC->r; - int g = pC->g; - int b = pC->b; + const color_rgba *pC = &pParams->m_pPixels[i]; + int r = pC->m_c[0]; + int g = pC->m_c[1]; + int b = pC->m_c[2]; int sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f); sel = clampi(sel, 1, N - 1); - uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC_FALSE, pParams->m_weights); - uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC_FALSE, pParams->m_weights); + uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, false, pParams->m_weights); + uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, false, pParams->m_weights); int best_sel = sel; uint64_t best_err = err1; @@ -704,7 +787,7 @@ static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 { for (uint32_t j = 0; j < N; j++) { - uint64_t err = compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC_TRUE, pParams->m_weights); + 
uint64_t err = compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], true, pParams->m_weights); if (err < best_err) { best_err = err; @@ -716,7 +799,7 @@ static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 { for (uint32_t j = 0; j < N; j++) { - uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC_TRUE, pParams->m_weights); + uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], true, pParams->m_weights); if (err < best_err) { best_err = err; @@ -747,32 +830,34 @@ static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 return total_err; } -static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, color_quad_u8 *pTrialMaxColor, const vec4F *pXl, const vec4F *pXh, uint32_t iscale) +static void fixDegenerateEndpoints(uint32_t mode, color_rgba *pTrialMinColor, color_rgba *pTrialMaxColor, const vec4F *pXl, const vec4F *pXh, uint32_t iscale, + const bc7enc_compress_block_params* pComp_params) { //if ((mode == 1) || (mode == 7)) - if (mode == 1) + //if (mode == 1) + if ( (mode == 1) || ((mode == 6) && (pComp_params->m_quant_mode6_endpoints)) ) { // fix degenerate case where the input collapses to a single colorspace voxel, and we loose all freedom (test with grayscale ramps) for (uint32_t i = 0; i < 3; i++) { - if ((*pTrialMinColor)[i] == (*pTrialMaxColor)[i]) + if (pTrialMinColor->m_c[i] == pTrialMaxColor->m_c[i]) { - if (fabs((*pXl)[i] - (*pXh)[i]) > 0.0f) + if (fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.0f) { - if ((*pTrialMinColor)[i] > (iscale >> 1)) + if (pTrialMinColor->m_c[i] > (iscale >> 1)) { - if ((*pTrialMinColor)[i] > 0) - (*pTrialMinColor)[i]--; + if (pTrialMinColor->m_c[i] > 0) + pTrialMinColor->m_c[i]--; else - if ((*pTrialMaxColor)[i] < iscale) - (*pTrialMaxColor)[i]++; + if (pTrialMaxColor->m_c[i] < iscale) + pTrialMaxColor->m_c[i]++; } else { - if ((*pTrialMaxColor)[i] < iscale) - 
(*pTrialMaxColor)[i]++; - else if ((*pTrialMinColor)[i] > 0) - (*pTrialMinColor)[i]--; + if (pTrialMaxColor->m_c[i] < iscale) + pTrialMaxColor->m_c[i]++; + else if (pTrialMinColor->m_c[i] > 0) + pTrialMinColor->m_c[i]--; } } } @@ -780,7 +865,8 @@ static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, } } -static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults) +static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, + const bc7enc_compress_block_params* pComp_params) { vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh); @@ -792,114 +878,221 @@ static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const c const int32_t totalComps = pParams->m_has_alpha ? 4 : 3; uint32_t best_pbits[2]; - color_quad_u8 bestMinColor, bestMaxColor; + color_rgba bestMinColor, bestMaxColor; if (!pParams->m_endpoints_share_pbit) { - float best_err0 = 1e+9; - float best_err1 = 1e+9; - - for (int p = 0; p < 2; p++) + if ((pParams->m_comp_bits == 7) && (pComp_params->m_quant_mode6_endpoints)) { - color_quad_u8 xMinColor, xMaxColor; + best_pbits[0] = 0; + bestMinColor.m_c[0] = g_mode6_reduced_quant[(int)((xl.m_c[0] * 2047.0f) + .5f)][0]; + bestMinColor.m_c[1] = g_mode6_reduced_quant[(int)((xl.m_c[1] * 2047.0f) + .5f)][0]; + bestMinColor.m_c[2] = g_mode6_reduced_quant[(int)((xl.m_c[2] * 2047.0f) + .5f)][0]; + bestMinColor.m_c[3] = g_mode6_reduced_quant[(int)((xl.m_c[3] * 2047.0f) + .5f)][0]; + + best_pbits[1] = 1; + bestMaxColor.m_c[0] = g_mode6_reduced_quant[(int)((xh.m_c[0] * 2047.0f) + .5f)][1]; + bestMaxColor.m_c[1] = g_mode6_reduced_quant[(int)((xh.m_c[1] * 2047.0f) + .5f)][1]; + bestMaxColor.m_c[2] = g_mode6_reduced_quant[(int)((xh.m_c[2] * 2047.0f) + .5f)][1]; + bestMaxColor.m_c[3] = g_mode6_reduced_quant[(int)((xh.m_c[3] * 2047.0f) 
+ .5f)][1]; + } + else + { + float best_err0 = 1e+9; + float best_err1 = 1e+9; - // Notes: The pbit controls which quantization intervals are selected. - // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc. - // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value - // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5) - // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5) - for (uint32_t c = 0; c < 4; c++) + for (int p = 0; p < 2; p++) { - xMinColor[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - xMaxColor[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - } + color_rgba xMinColor, xMaxColor; + + // Notes: The pbit controls which quantization intervals are selected. + // total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc. + // pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value + // rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5) + // rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5) + if (pParams->m_comp_bits == 5) + { + for (uint32_t c = 0; c < 4; c++) + { + int vl = (int)(xl.m_c[c] * 31.0f); + vl += (xl.m_c[c] > g_mode7_rgba_midpoints[vl][p]); + xMinColor.m_c[c] = (uint8_t)clampi(vl * 2 + p, p, 63 - 1 + p); - color_quad_u8 scaledLow = scale_color(&xMinColor, pParams); - color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams); + int vh = (int)(xh.m_c[c] * 31.0f); + vh += (xh.m_c[c] > g_mode7_rgba_midpoints[vh][p]); + xMaxColor.m_c[c] = (uint8_t)clampi(vh * 2 + p, p, 63 - 1 + p); + } + } + else + { + for (uint32_t c = 0; c < 4; c++) + { + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + 
xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + } - float err0 = 0, err1 = 0; - for (int i = 0; i < totalComps; i++) - { - err0 += squaref(scaledLow[i] - xl[i] * 255.0f); - err1 += squaref(scaledHigh[i] - xh[i] * 255.0f); - } + color_rgba scaledLow = scale_color(&xMinColor, pParams); + color_rgba scaledHigh = scale_color(&xMaxColor, pParams); - if (err0 < best_err0) - { - best_err0 = err0; - best_pbits[0] = p; + float err0 = 0, err1 = 0; + for (int i = 0; i < totalComps; i++) + { + err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f); + err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f); + } - bestMinColor.r = xMinColor.r >> 1; - bestMinColor.g = xMinColor.g >> 1; - bestMinColor.b = xMinColor.b >> 1; - bestMinColor.a = xMinColor.a >> 1; - } + if (p == 1) + { + err0 *= pComp_params->m_pbit1_weight; + err1 *= pComp_params->m_pbit1_weight; + } + + if (err0 < best_err0) + { + best_err0 = err0; + best_pbits[0] = p; - if (err1 < best_err1) - { - best_err1 = err1; - best_pbits[1] = p; + bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; + bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1; + bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1; + bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1; + } + + if (err1 < best_err1) + { + best_err1 = err1; + best_pbits[1] = p; - bestMaxColor.r = xMaxColor.r >> 1; - bestMaxColor.g = xMaxColor.g >> 1; - bestMaxColor.b = xMaxColor.b >> 1; - bestMaxColor.a = xMaxColor.a >> 1; + bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1; + bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1; + bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1; + bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1; + } } } } else { - // Endpoints share pbits - float best_err = 1e+9; - - for (int p = 0; p < 2; p++) + if ((mode == 1) && (pComp_params->m_bias_mode1_pbits)) { - color_quad_u8 xMinColor, xMaxColor; + float x = 0.0f; + for (uint32_t c = 0; c < 3; c++) + x = std::max(std::max(x, xl.m_c[c]), xh.m_c[c]); + + int p = 0; + if (x > 
(253.0f / 255.0f)) + p = 1; + + color_rgba xMinColor, xMaxColor; for (uint32_t c = 0; c < 4; c++) { - xMinColor[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - xMaxColor[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - } + int vl = (int)(xl.m_c[c] * 63.0f); + vl += (xl.m_c[c] > g_mode1_rgba_midpoints[vl][p]); + xMinColor.m_c[c] = (uint8_t)clampi(vl * 2 + p, p, 127 - 1 + p); - color_quad_u8 scaledLow = scale_color(&xMinColor, pParams); - color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams); + int vh = (int)(xh.m_c[c] * 63.0f); + vh += (xh.m_c[c] > g_mode1_rgba_midpoints[vh][p]); + xMaxColor.m_c[c] = (uint8_t)clampi(vh * 2 + p, p, 127 - 1 + p); + } - float err = 0; - for (int i = 0; i < totalComps; i++) - err += squaref((scaledLow[i] / 255.0f) - xl[i]) + squaref((scaledHigh[i] / 255.0f) - xh[i]); + best_pbits[0] = p; + best_pbits[1] = p; + for (uint32_t j = 0; j < 4; j++) + { + bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; + bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; + } + } + else + { + // Endpoints share pbits + float best_err = 1e+9; - if (err < best_err) + for (int p = 0; p < 2; p++) { - best_err = err; - best_pbits[0] = p; - best_pbits[1] = p; - for (uint32_t j = 0; j < 4; j++) + color_rgba xMinColor, xMaxColor; + if (pParams->m_comp_bits == 6) + { + for (uint32_t c = 0; c < 4; c++) + { + int vl = (int)(xl.m_c[c] * 63.0f); + vl += (xl.m_c[c] > g_mode1_rgba_midpoints[vl][p]); + xMinColor.m_c[c] = (uint8_t)clampi(vl * 2 + p, p, 127 - 1 + p); + + int vh = (int)(xh.m_c[c] * 63.0f); + vh += (xh.m_c[c] > g_mode1_rgba_midpoints[vh][p]); + xMaxColor.m_c[c] = (uint8_t)clampi(vh * 2 + p, p, 127 - 1 + p); + } + } + else { - bestMinColor[j] = xMinColor[j] >> 1; - bestMaxColor[j] = xMaxColor[j] >> 1; + for (uint32_t c = 0; c < 4; c++) + { + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + 
xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + } + + color_rgba scaledLow = scale_color(&xMinColor, pParams); + color_rgba scaledHigh = scale_color(&xMaxColor, pParams); + + float err = 0; + for (int i = 0; i < totalComps; i++) + err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]); + + if (p == 1) + err *= pComp_params->m_pbit1_weight; + + if (err < best_err) + { + best_err = err; + best_pbits[0] = p; + best_pbits[1] = p; + for (uint32_t j = 0; j < 4; j++) + { + bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; + bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; + } } } } } - fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1); + fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1, pComp_params); if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&bestMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&bestMaxColor, &pResults->m_high_endpoint) || (best_pbits[0] != pResults->m_pbits[0]) || (best_pbits[1] != pResults->m_pbits[1])) - evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults); + evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults, pComp_params); } else { const int iscale = (1 << pParams->m_comp_bits) - 1; const float scale = (float)iscale; - color_quad_u8 trialMinColor, trialMaxColor; - color_quad_u8_set_clamped(&trialMinColor, (int)(xl.r * scale + .5f), (int)(xl.g * scale + .5f), (int)(xl.b * scale + .5f), (int)(xl.a * scale + .5f)); - color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.r * scale + .5f), (int)(xh.g * scale + .5f), (int)(xh.b * scale + .5f), (int)(xh.a * scale + .5f)); + color_rgba trialMinColor, trialMaxColor; + if (pParams->m_comp_bits == 7) + { + for (uint32_t c = 0; c < 4; c++) + { + int vl = (int)(xl.m_c[c] * 127.0f); + vl += (xl.m_c[c] > g_mode5_rgba_midpoints[vl]); + 
trialMinColor.m_c[c] = (uint8_t)clampi(vl, 0, 127); - fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale); + int vh = (int)(xh.m_c[c] * 127.0f); + vh += (xh.m_c[c] > g_mode5_rgba_midpoints[vh]); + trialMaxColor.m_c[c] = (uint8_t)clampi(vh, 0, 127); + } + } + else + { + color_quad_u8_set_clamped(&trialMinColor, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f)); + color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f)); + } + + fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale, pComp_params); if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint)) - evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults); + evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults, pComp_params); } return pResults->m_best_overall_err; @@ -914,14 +1107,14 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso // If the partition's colors are all the same in mode 1, then just pack them as a single color. 
if (mode == 1) { - const uint32_t cr = pParams->m_pPixels[0].r, cg = pParams->m_pPixels[0].g, cb = pParams->m_pPixels[0].b; + const uint32_t cr = pParams->m_pPixels[0].m_c[0], cg = pParams->m_pPixels[0].m_c[1], cb = pParams->m_pPixels[0].m_c[2]; - bc7enc_bool allSame = BC7ENC_TRUE; + bool allSame = true; for (uint32_t i = 1; i < pParams->m_num_pixels; i++) { - if ((cr != pParams->m_pPixels[i].r) || (cg != pParams->m_pPixels[i].g) || (cb != pParams->m_pPixels[i].b)) + if ((cr != pParams->m_pPixels[i].m_c[0]) || (cg != pParams->m_pPixels[i].m_c[1]) || (cb != pParams->m_pPixels[i].m_c[2])) { - allSame = BC7ENC_FALSE; + allSame = false; break; } } @@ -931,14 +1124,14 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso } else if (mode == 7) { - const uint32_t cr = pParams->m_pPixels[0].r, cg = pParams->m_pPixels[0].g, cb = pParams->m_pPixels[0].b, ca = pParams->m_pPixels[0].a; + const uint32_t cr = pParams->m_pPixels[0].m_c[0], cg = pParams->m_pPixels[0].m_c[1], cb = pParams->m_pPixels[0].m_c[2], ca = pParams->m_pPixels[0].m_c[3]; - bc7enc_bool allSame = BC7ENC_TRUE; + bool allSame = true; for (uint32_t i = 1; i < pParams->m_num_pixels; i++) { - if ((cr != pParams->m_pPixels[i].r) || (cg != pParams->m_pPixels[i].g) || (cb != pParams->m_pPixels[i].b) || (ca != pParams->m_pPixels[i].a)) + if ((cr != pParams->m_pPixels[i].m_c[0]) || (cg != pParams->m_pPixels[i].m_c[1]) || (cb != pParams->m_pPixels[i].m_c[2]) || (ca != pParams->m_pPixels[i].m_c[3])) { - allSame = BC7ENC_FALSE; + allSame = false; break; } } @@ -970,16 +1163,16 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso { vec4F color = vec4F_from_color(&pParams->m_pPixels[i]); color = vec4F_sub(&color, &meanColorScaled); - vec4F a = vec4F_mul(&color, color.r); - vec4F b = vec4F_mul(&color, color.g); - vec4F c = vec4F_mul(&color, color.b); - vec4F d = vec4F_mul(&color, color.a); + vec4F a = vec4F_mul(&color, color.m_c[0]); + vec4F b = vec4F_mul(&color, 
color.m_c[1]); + vec4F c = vec4F_mul(&color, color.m_c[2]); + vec4F d = vec4F_mul(&color, color.m_c[3]); vec4F n = i ? axis : color; vec4F_normalize_in_place(&n); - axis.r += vec4F_dot(&a, &n); - axis.g += vec4F_dot(&b, &n); - axis.b += vec4F_dot(&c, &n); - axis.a += vec4F_dot(&d, &n); + axis.m_c[0] += vec4F_dot(&a, &n); + axis.m_c[1] += vec4F_dot(&b, &n); + axis.m_c[2] += vec4F_dot(&c, &n); + axis.m_c[3] += vec4F_dot(&d, &n); } vec4F_normalize_in_place(&axis); } @@ -990,10 +1183,10 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso for (uint32_t i = 0; i < pParams->m_num_pixels; i++) { - const color_quad_u8 *pV = &pParams->m_pPixels[i]; - float r = pV->r - meanColorScaled.r; - float g = pV->g - meanColorScaled.g; - float b = pV->b - meanColorScaled.b; + const color_rgba *pV = &pParams->m_pPixels[i]; + float r = pV->m_c[0] - meanColorScaled.m_c[0]; + float g = pV->m_c[1] - meanColorScaled.m_c[1]; + float b = pV->m_c[2] - meanColorScaled.m_c[2]; cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b; } @@ -1070,20 +1263,20 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso minColor = maxColor; maxColor = temp; #else - float a = minColor.r, b = minColor.g, c = minColor.b, d = minColor.a; - minColor.r = maxColor.r; - minColor.g = maxColor.g; - minColor.b = maxColor.b; - minColor.a = maxColor.a; - maxColor.r = a; - maxColor.g = b; - maxColor.b = c; - maxColor.a = d; + float a = minColor.m_c[0], b = minColor.m_c[1], c = minColor.m_c[2], d = minColor.m_c[3]; + minColor.m_c[0] = maxColor.m_c[0]; + minColor.m_c[1] = maxColor.m_c[1]; + minColor.m_c[2] = maxColor.m_c[2]; + minColor.m_c[3] = maxColor.m_c[3]; + maxColor.m_c[0] = a; + maxColor.m_c[1] = b; + maxColor.m_c[2] = c; + maxColor.m_c[3] = d; #endif } // First find a solution using the block's PCA. 
- if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults)) + if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults, pComp_params)) return 0; if (pComp_params->m_try_least_squares) @@ -1100,7 +1293,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso xl = vec4F_mul(&xl, (1.0f / 255.0f)); xh = vec4F_mul(&xh, (1.0f / 255.0f)); - if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + if (!find_optimal_solution(mode, xl, xh, pParams, pResults, pComp_params)) return 0; } @@ -1141,7 +1334,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso xl = vec4F_mul(&xl, (1.0f / 255.0f)); xh = vec4F_mul(&xh, (1.0f / 255.0f)); - if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + if (!find_optimal_solution(mode, xl, xh, pParams, pResults, pComp_params)) return 0; for (uint32_t i = 0; i < pParams->m_num_pixels; i++) @@ -1160,7 +1353,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso xl = vec4F_mul(&xl, (1.0f / 255.0f)); xh = vec4F_mul(&xh, (1.0f / 255.0f)); - if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + if (!find_optimal_solution(mode, xl, xh, pParams, pResults, pComp_params)) return 0; for (uint32_t i = 0; i < pParams->m_num_pixels; i++) @@ -1181,7 +1374,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso xl = vec4F_mul(&xl, (1.0f / 255.0f)); xh = vec4F_mul(&xh, (1.0f / 255.0f)); - if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + if (!find_optimal_solution(mode, xl, xh, pParams, pResults, pComp_params)) return 0; // In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another. 
@@ -1210,7 +1403,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso xl = vec4F_mul(&xl, (1.0f / 255.0f)); xh = vec4F_mul(&xh, (1.0f / 255.0f)); - if (!find_optimal_solution(mode, xl, xh, pParams, pResults)) + if (!find_optimal_solution(mode, xl, xh, pParams, pResults, pComp_params)) return 0; } } @@ -1221,7 +1414,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso { // Try encoding the partition as a single color by using the optimal singe colors tables to encode the block to its mean. color_cell_compressor_results avg_results = *pResults; - const uint32_t r = (int)(.5f + meanColor.r * 255.0f), g = (int)(.5f + meanColor.g * 255.0f), b = (int)(.5f + meanColor.b * 255.0f); + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f); uint64_t avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp); if (avg_err < pResults->m_best_overall_err) { @@ -1234,7 +1427,7 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso { // Try encoding the partition as a single color by using the optimal singe colors tables to encode the block to its mean. 
color_cell_compressor_results avg_results = *pResults; - const uint32_t r = (int)(.5f + meanColor.r * 255.0f), g = (int)(.5f + meanColor.g * 255.0f), b = (int)(.5f + meanColor.b * 255.0f), a = (int)(.5f + meanColor.a * 255.0f); + const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f), a = (int)(.5f + meanColor.m_c[3] * 255.0f); uint64_t avg_err = pack_mode7_to_one_color(pParams, &avg_results, r, g, b, a, pResults->m_pSelectors_temp, pParams->m_num_pixels, pParams->m_pPixels); if (avg_err < pResults->m_best_overall_err) { @@ -1247,46 +1440,46 @@ static uint64_t color_cell_compression(uint32_t mode, const color_cell_compresso return pResults->m_best_overall_err; } -static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const color_quad_u8 *pPixels, bc7enc_bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far) +static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const color_rgba *pPixels, bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far) { // Find RGB bounds as an approximation of the block's principle axis uint32_t lr = 255, lg = 255, lb = 255; uint32_t hr = 0, hg = 0, hb = 0; for (uint32_t i = 0; i < num_pixels; i++) { - const color_quad_u8 *pC = &pPixels[i]; - if (pC->r < lr) lr = pC->r; - if (pC->g < lg) lg = pC->g; - if (pC->b < lb) lb = pC->b; - if (pC->r > hr) hr = pC->r; - if (pC->g > hg) hg = pC->g; - if (pC->b > hb) hb = pC->b; + const color_rgba *pC = &pPixels[i]; + if (pC->m_c[0] < lr) lr = pC->m_c[0]; + if (pC->m_c[1] < lg) lg = pC->m_c[1]; + if (pC->m_c[2] < lb) lb = pC->m_c[2]; + if (pC->m_c[0] > hr) hr = pC->m_c[0]; + if (pC->m_c[1] > hg) hg = pC->m_c[1]; + if (pC->m_c[2] > hb) hb = pC->m_c[2]; } - color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0); - color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0); + color_rgba lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0); + 
color_rgba highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0); // Place endpoints at bbox diagonals and compute interpolated colors const uint32_t N = 8; - color_quad_u8 weightedColors[8]; + color_rgba weightedColors[8]; weightedColors[0] = lowColor; weightedColors[N - 1] = highColor; for (uint32_t i = 1; i < (N - 1); i++) { - weightedColors[i].r = (uint8_t)((lowColor.r * (64 - g_bc7_weights3[i]) + highColor.r * g_bc7_weights3[i] + 32) >> 6); - weightedColors[i].g = (uint8_t)((lowColor.g * (64 - g_bc7_weights3[i]) + highColor.g * g_bc7_weights3[i] + 32) >> 6); - weightedColors[i].b = (uint8_t)((lowColor.b * (64 - g_bc7_weights3[i]) + highColor.b * g_bc7_weights3[i] + 32) >> 6); + weightedColors[i].m_c[0] = (uint8_t)((lowColor.m_c[0] * (64 - g_bc7_weights3[i]) + highColor.m_c[0] * g_bc7_weights3[i] + 32) >> 6); + weightedColors[i].m_c[1] = (uint8_t)((lowColor.m_c[1] * (64 - g_bc7_weights3[i]) + highColor.m_c[1] * g_bc7_weights3[i] + 32) >> 6); + weightedColors[i].m_c[2] = (uint8_t)((lowColor.m_c[2] * (64 - g_bc7_weights3[i]) + highColor.m_c[2] * g_bc7_weights3[i] + 32) >> 6); } // Compute dots and thresholds - const int ar = highColor.r - lowColor.r; - const int ag = highColor.g - lowColor.g; - const int ab = highColor.b - lowColor.b; + const int ar = highColor.m_c[0] - lowColor.m_c[0]; + const int ag = highColor.m_c[1] - lowColor.m_c[1]; + const int ab = highColor.m_c[2] - lowColor.m_c[2]; int dots[8]; for (uint32_t i = 0; i < N; i++) - dots[i] = weightedColors[i].r * ar + weightedColors[i].g * ag + weightedColors[i].b * ab; + dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab; int thresh[8 - 1]; for (uint32_t i = 0; i < (N - 1); i++) @@ -1299,17 +1492,17 @@ static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const colo int l1[8], cr1[8], cb1[8]; for (int j = 0; j < 8; j++) { - const color_quad_u8 *pE1 = &weightedColors[j]; - l1[j] = pE1->r * 109 + pE1->g * 366 + pE1->b * 37; - cr1[j] = 
((int)pE1->r << 9) - l1[j]; - cb1[j] = ((int)pE1->b << 9) - l1[j]; + const color_rgba *pE1 = &weightedColors[j]; + l1[j] = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37; + cr1[j] = ((int)pE1->m_c[0] << 9) - l1[j]; + cb1[j] = ((int)pE1->m_c[2] << 9) - l1[j]; } for (uint32_t i = 0; i < num_pixels; i++) { - const color_quad_u8 *pC = &pPixels[i]; + const color_rgba *pC = &pPixels[i]; - int d = ar * pC->r + ag * pC->g + ab * pC->b; + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2]; // Find approximate selector uint32_t s = 0; @@ -1329,9 +1522,9 @@ static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const colo s = 1; // Compute error - const int l2 = pC->r * 109 + pC->g * 366 + pC->b * 37; - const int cr2 = ((int)pC->r << 9) - l2; - const int cb2 = ((int)pC->b << 9) - l2; + const int l2 = pC->m_c[0] * 109 + pC->m_c[1] * 366 + pC->m_c[2] * 37; + const int cr2 = ((int)pC->m_c[0] << 9) - l2; + const int cb2 = ((int)pC->m_c[2] << 9) - l2; const int dl = (l1[s] - l2) >> 8; const int dcr = (cr1[s] - cr2) >> 8; @@ -1348,9 +1541,9 @@ static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const colo { for (uint32_t i = 0; i < num_pixels; i++) { - const color_quad_u8 *pC = &pPixels[i]; + const color_rgba *pC = &pPixels[i]; - int d = ar * pC->r + ag * pC->g + ab * pC->b; + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2]; // Find approximate selector uint32_t s = 0; @@ -1370,11 +1563,11 @@ static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const colo s = 1; // Compute error - const color_quad_u8 *pE1 = &weightedColors[s]; + const color_rgba *pE1 = &weightedColors[s]; - int dr = (int)pE1->r - (int)pC->r; - int dg = (int)pE1->g - (int)pC->g; - int db = (int)pE1->b - (int)pC->b; + int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; + int dg = (int)pE1->m_c[1] - (int)pC->m_c[1]; + int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * 
db); if (total_err > best_err_so_far) @@ -1385,51 +1578,51 @@ static uint64_t color_cell_compression_est_mode1(uint32_t num_pixels, const colo return total_err; } -static uint64_t color_cell_compression_est_mode7(uint32_t num_pixels, const color_quad_u8* pPixels, bc7enc_bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far) +static uint64_t color_cell_compression_est_mode7(uint32_t num_pixels, const color_rgba * pPixels, bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far) { // Find RGB bounds as an approximation of the block's principle axis uint32_t lr = 255, lg = 255, lb = 255, la = 255; uint32_t hr = 0, hg = 0, hb = 0, ha = 0; for (uint32_t i = 0; i < num_pixels; i++) { - const color_quad_u8* pC = &pPixels[i]; - if (pC->r < lr) lr = pC->r; - if (pC->g < lg) lg = pC->g; - if (pC->b < lb) lb = pC->b; - if (pC->a < la) la = pC->a; - - if (pC->r > hr) hr = pC->r; - if (pC->g > hg) hg = pC->g; - if (pC->b > hb) hb = pC->b; - if (pC->a > ha) ha = pC->a; + const color_rgba* pC = &pPixels[i]; + if (pC->m_c[0] < lr) lr = pC->m_c[0]; + if (pC->m_c[1] < lg) lg = pC->m_c[1]; + if (pC->m_c[2] < lb) lb = pC->m_c[2]; + if (pC->m_c[3] < la) la = pC->m_c[3]; + + if (pC->m_c[0] > hr) hr = pC->m_c[0]; + if (pC->m_c[1] > hg) hg = pC->m_c[1]; + if (pC->m_c[2] > hb) hb = pC->m_c[2]; + if (pC->m_c[3] > ha) ha = pC->m_c[3]; } - color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, la); - color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, ha); + color_rgba lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, la); + color_rgba highColor; color_quad_u8_set(&highColor, hr, hg, hb, ha); // Place endpoints at bbox diagonals and compute interpolated colors const uint32_t N = 4; - color_quad_u8 weightedColors[4]; + color_rgba weightedColors[4]; weightedColors[0] = lowColor; weightedColors[N - 1] = highColor; for (uint32_t i = 1; i < (N - 1); i++) { - weightedColors[i].r = (uint8_t)((lowColor.r * (64 - g_bc7_weights2[i]) + highColor.r * 
g_bc7_weights2[i] + 32) >> 6); - weightedColors[i].g = (uint8_t)((lowColor.g * (64 - g_bc7_weights2[i]) + highColor.g * g_bc7_weights2[i] + 32) >> 6); - weightedColors[i].b = (uint8_t)((lowColor.b * (64 - g_bc7_weights2[i]) + highColor.b * g_bc7_weights2[i] + 32) >> 6); - weightedColors[i].a = (uint8_t)((lowColor.a * (64 - g_bc7_weights2[i]) + highColor.a * g_bc7_weights2[i] + 32) >> 6); + weightedColors[i].m_c[0] = (uint8_t)((lowColor.m_c[0] * (64 - g_bc7_weights2[i]) + highColor.m_c[0] * g_bc7_weights2[i] + 32) >> 6); + weightedColors[i].m_c[1] = (uint8_t)((lowColor.m_c[1] * (64 - g_bc7_weights2[i]) + highColor.m_c[1] * g_bc7_weights2[i] + 32) >> 6); + weightedColors[i].m_c[2] = (uint8_t)((lowColor.m_c[2] * (64 - g_bc7_weights2[i]) + highColor.m_c[2] * g_bc7_weights2[i] + 32) >> 6); + weightedColors[i].m_c[3] = (uint8_t)((lowColor.m_c[3] * (64 - g_bc7_weights2[i]) + highColor.m_c[3] * g_bc7_weights2[i] + 32) >> 6); } // Compute dots and thresholds - const int ar = highColor.r - lowColor.r; - const int ag = highColor.g - lowColor.g; - const int ab = highColor.b - lowColor.b; - const int aa = highColor.a - lowColor.a; + const int ar = highColor.m_c[0] - lowColor.m_c[0]; + const int ag = highColor.m_c[1] - lowColor.m_c[1]; + const int ab = highColor.m_c[2] - lowColor.m_c[2]; + const int aa = highColor.m_c[3] - lowColor.m_c[3]; int dots[4]; for (uint32_t i = 0; i < N; i++) - dots[i] = weightedColors[i].r * ar + weightedColors[i].g * ag + weightedColors[i].b * ab + weightedColors[i].a * aa; + dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab + weightedColors[i].m_c[3] * aa; int thresh[4 - 1]; for (uint32_t i = 0; i < (N - 1); i++) @@ -1442,17 +1635,17 @@ static uint64_t color_cell_compression_est_mode7(uint32_t num_pixels, const colo int l1[4], cr1[4], cb1[4]; for (int j = 0; j < 4; j++) { - const color_quad_u8* pE1 = &weightedColors[j]; - l1[j] = pE1->r * 109 + pE1->g * 366 + pE1->b * 37; - cr1[j] = ((int)pE1->r 
<< 9) - l1[j]; - cb1[j] = ((int)pE1->b << 9) - l1[j]; + const color_rgba* pE1 = &weightedColors[j]; + l1[j] = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37; + cr1[j] = ((int)pE1->m_c[0] << 9) - l1[j]; + cb1[j] = ((int)pE1->m_c[2] << 9) - l1[j]; } for (uint32_t i = 0; i < num_pixels; i++) { - const color_quad_u8* pC = &pPixels[i]; + const color_rgba* pC = &pPixels[i]; - int d = ar * pC->r + ag * pC->g + ab * pC->b + aa * pC->a; + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2] + aa * pC->m_c[3]; // Find approximate selector uint32_t s = 0; @@ -1464,15 +1657,15 @@ static uint64_t color_cell_compression_est_mode7(uint32_t num_pixels, const colo s = 1; // Compute error - const int l2 = pC->r * 109 + pC->g * 366 + pC->b * 37; - const int cr2 = ((int)pC->r << 9) - l2; - const int cb2 = ((int)pC->b << 9) - l2; + const int l2 = pC->m_c[0] * 109 + pC->m_c[1] * 366 + pC->m_c[2] * 37; + const int cr2 = ((int)pC->m_c[0] << 9) - l2; + const int cb2 = ((int)pC->m_c[2] << 9) - l2; const int dl = (l1[s] - l2) >> 8; const int dcr = (cr1[s] - cr2) >> 8; const int dcb = (cb1[s] - cb2) >> 8; - const int dca = (int)pC->a - (int)weightedColors[s].a; + const int dca = (int)pC->m_c[3] - (int)weightedColors[s].m_c[3]; int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb) + (pweights[3] * dca * dca); @@ -1485,9 +1678,9 @@ static uint64_t color_cell_compression_est_mode7(uint32_t num_pixels, const colo { for (uint32_t i = 0; i < num_pixels; i++) { - const color_quad_u8* pC = &pPixels[i]; + const color_rgba* pC = &pPixels[i]; - int d = ar * pC->r + ag * pC->g + ab * pC->b + aa * pC->a; + int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2] + aa * pC->m_c[3]; // Find approximate selector uint32_t s = 0; @@ -1499,12 +1692,12 @@ static uint64_t color_cell_compression_est_mode7(uint32_t num_pixels, const colo s = 1; // Compute error - const color_quad_u8* pE1 = &weightedColors[s]; + const color_rgba* pE1 = &weightedColors[s]; - int 
dr = (int)pE1->r - (int)pC->r; - int dg = (int)pE1->g - (int)pC->g; - int db = (int)pE1->b - (int)pC->b; - int da = (int)pE1->a - (int)pC->a; + int dr = (int)pE1->m_c[0] - (int)pC->m_c[0]; + int dg = (int)pE1->m_c[1] - (int)pC->m_c[1]; + int db = (int)pE1->m_c[2] - (int)pC->m_c[2]; + int da = (int)pE1->m_c[3] - (int)pC->m_c[3]; total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db) + pweights[3] * (da * da); if (total_err > best_err_so_far) @@ -1558,9 +1751,9 @@ static const uint32_t g_partition_predictors[35] = }; // Estimate the partition used by modes 1/7. This scans through each partition and computes an approximate error for each. -static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc_compress_block_params *pComp_params, uint32_t pweights[4], uint32_t mode) +static uint32_t estimate_partition(const color_rgba *pPixels, const bc7enc_compress_block_params *pComp_params, uint32_t pweights[4], uint32_t mode) { - const uint32_t total_partitions = minimumu(pComp_params->m_max_partitions_mode, BC7ENC_MAX_PARTITIONS1); + const uint32_t total_partitions = minimumu(pComp_params->m_max_partitions, BC7ENC_MAX_PARTITIONS); if (total_partitions <= 1) return 0; @@ -1590,7 +1783,7 @@ static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc_co const uint32_t partition = s_sorted_partition_order[partition_iter]; // Check to see if we should bother evaluating this partition at all, depending on the best partition found from the first 14. 
- if (pComp_params->m_mode_partition_estimation_filterbank) + if (pComp_params->m_mode17_partition_estimation_filterbank) { if ((partition_iter >= 14) && (partition_iter <= 34)) { @@ -1607,7 +1800,7 @@ static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc_co const uint8_t *pPartition = &g_bc7_partition2[partition * 16]; - color_quad_u8 subset_colors[2][16]; + color_rgba subset_colors[2][16]; uint32_t subset_total_colors[2] = { 0, 0 }; for (uint32_t index = 0; index < 16; index++) subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = pPixels[index]; @@ -1621,6 +1814,11 @@ static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc_co total_subset_err += color_cell_compression_est_mode1(subset_total_colors[subset], &subset_colors[subset][0], pComp_params->m_perceptual, pweights, best_err); } + if (partition < 16) + { + total_subset_err = (uint64_t)((double)total_subset_err * pComp_params->m_low_frequency_partition_weight + .5f); + } + if (total_subset_err < best_err) { best_err = total_subset_err; @@ -1653,20 +1851,20 @@ static void set_block_bits(uint8_t *pBytes, uint32_t val, uint32_t num_bits, uin assert(*pCur_ofs <= 128); } -typedef struct +struct bc7_optimization_results { uint32_t m_mode; uint32_t m_partition; uint8_t m_selectors[16]; uint8_t m_alpha_selectors[16]; - color_quad_u8 m_low[3]; - color_quad_u8 m_high[3]; + color_rgba m_low[3]; + color_rgba m_high[3]; uint32_t m_pbits[3][2]; uint32_t m_rotation; uint32_t m_index_selector; -} bc7_optimization_results; +}; -static void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults) +void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults) { assert(pResults->m_index_selector <= 1); assert(pResults->m_rotation <= 3); @@ -1692,7 +1890,7 @@ static void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResu uint8_t alpha_selectors[16]; memcpy(alpha_selectors, pResults->m_alpha_selectors, 16); - 
color_quad_u8 low[3], high[3]; + color_rgba low[3], high[3]; memcpy(low, pResults->m_low, sizeof(low)); memcpy(high, pResults->m_high, sizeof(high)); @@ -1729,14 +1927,14 @@ static void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResu { for (uint32_t q = 0; q < 3; q++) { - uint8_t t = low[k][q]; - low[k][q] = high[k][q]; - high[k][q] = t; + uint8_t t = low[k].m_c[q]; + low[k].m_c[q] = high[k].m_c[q]; + high[k].m_c[q] = t; } } else { - color_quad_u8 tmp = low[k]; + color_rgba tmp = low[k]; low[k] = high[k]; high[k] = tmp; } @@ -1760,9 +1958,9 @@ static void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResu if (pPartition[i] == k) alpha_selectors[i] = (uint8_t)((num_alpha_indices - 1) - alpha_selectors[i]); - uint8_t t = low[k].a; - low[k].a = high[k].a; - high[k].a = t; + uint8_t t = low[k].m_c[3]; + low[k].m_c[3] = high[k].m_c[3]; + high[k].m_c[3] = t; } } } @@ -1787,8 +1985,8 @@ static void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResu { for (uint32_t subset = 0; subset < total_subsets; subset++) { - set_block_bits(pBlock_bytes, low[subset][comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); - set_block_bits(pBlock_bytes, high[subset][comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? 
g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); } } @@ -1838,16 +2036,16 @@ static void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResu assert(cur_bit_ofs == 128); } -static void handle_alpha_block_mode5(const color_quad_u8* pPixels, const bc7enc_compress_block_params* pComp_params, color_cell_compressor_params* pParams, uint32_t lo_a, uint32_t hi_a, bc7_optimization_results* pOpt_results5, uint64_t* pMode5_err, uint64_t* pMode5_alpha_err) +static void handle_alpha_block_mode5(const color_rgba* pPixels, const bc7enc_compress_block_params* pComp_params, color_cell_compressor_params* pParams, uint32_t lo_a, uint32_t hi_a, bc7_optimization_results* pOpt_results5, uint64_t* pMode5_err, uint64_t* pMode5_alpha_err) { pParams->m_pSelector_weights = g_bc7_weights2; pParams->m_pSelector_weightsx = (const vec4F*)g_bc7_weights2x; pParams->m_num_selector_weights = 4; pParams->m_comp_bits = 7; - pParams->m_has_pbits = BC7ENC_FALSE; - pParams->m_endpoints_share_pbit = BC7ENC_FALSE; - pParams->m_has_alpha = BC7ENC_FALSE; + pParams->m_has_pbits = false; + pParams->m_endpoints_share_pbit = false; + pParams->m_has_alpha = false; pParams->m_perceptual = pComp_params->m_perceptual; @@ -1869,8 +2067,8 @@ static void handle_alpha_block_mode5(const color_quad_u8* pPixels, const bc7enc_ if (lo_a == hi_a) { *pMode5_alpha_err = 0; - pOpt_results5->m_low[0].a = (uint8_t)lo_a; - pOpt_results5->m_high[0].a = (uint8_t)hi_a; + pOpt_results5->m_low[0].m_c[3] = (uint8_t)lo_a; + pOpt_results5->m_high[0].m_c[3] = (uint8_t)hi_a; memset(pOpt_results5->m_alpha_selectors, 0, sizeof(pOpt_results5->m_alpha_selectors)); } else @@ -1893,7 +2091,7 @@ static void handle_alpha_block_mode5(const color_quad_u8* pPixels, const bc7enc_ uint64_t trial_alpha_err = 0; for (uint32_t i = 0; i < 16; i++) { - const int32_t a = pParams->m_pPixels[i].a; + const int32_t a = pParams->m_pPixels[i].m_c[3]; int s = 0; int32_t be = iabs32(a - vals[0]); @@ 
-1911,8 +2109,8 @@ static void handle_alpha_block_mode5(const color_quad_u8* pPixels, const bc7enc_ if (trial_alpha_err < *pMode5_alpha_err) { *pMode5_alpha_err = trial_alpha_err; - pOpt_results5->m_low[0].a = (uint8_t)lo_a; - pOpt_results5->m_high[0].a = (uint8_t)hi_a; + pOpt_results5->m_low[0].m_c[3] = (uint8_t)lo_a; + pOpt_results5->m_high[0].m_c[3] = (uint8_t)hi_a; memcpy(pOpt_results5->m_alpha_selectors, trial_alpha_selectors, sizeof(pOpt_results5->m_alpha_selectors)); } @@ -1938,35 +2136,44 @@ static void handle_alpha_block_mode5(const color_quad_u8* pPixels, const bc7enc_ } } -static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc_compress_block_params *pComp_params, color_cell_compressor_params *pParams) +static void handle_alpha_block(void *pBlock, const color_rgba *pPixels, const bc7enc_compress_block_params *pComp_params, color_cell_compressor_params *pParams) { + assert((pComp_params->m_mode_mask & (1 << 6)) || (pComp_params->m_mode_mask & (1 << 5)) || (pComp_params->m_mode_mask & (1 << 7))); + pParams->m_pSelector_weights = g_bc7_weights4; pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x; pParams->m_num_selector_weights = 16; pParams->m_comp_bits = 7; - pParams->m_has_pbits = BC7ENC_TRUE; - pParams->m_endpoints_share_pbit = BC7ENC_FALSE; - pParams->m_has_alpha = BC7ENC_TRUE; + pParams->m_has_pbits = true; + pParams->m_endpoints_share_pbit = false; + pParams->m_has_alpha = true; pParams->m_perceptual = pComp_params->m_perceptual; pParams->m_num_pixels = 16; pParams->m_pPixels = pPixels; bc7_optimization_results opt_results6, opt_results5, opt_results7; - color_cell_compressor_results results6; - results6.m_pSelectors = opt_results6.m_selectors; + memset(&results6, 0, sizeof(results6)); + + uint64_t best_err = UINT64_MAX; + uint32_t best_mode = 0; uint8_t selectors_temp[16]; - results6.m_pSelectors_temp = selectors_temp; - uint64_t best_err = color_cell_compression(6, pParams, &results6, pComp_params); - 
uint32_t best_mode = 6; + if (pComp_params->m_mode_mask & (1 << 6)) + { + results6.m_pSelectors = opt_results6.m_selectors; + results6.m_pSelectors_temp = selectors_temp; + + best_err = (uint64_t)(color_cell_compression(6, pParams, &results6, pComp_params) * pComp_params->m_mode6_error_weight + .5f); + best_mode = 6; + } - if ((best_err > 0) && (pComp_params->m_use_mode5_for_alpha)) + if ((best_err > 0) && (pComp_params->m_mode_mask & (1 << 5))) { uint32_t lo_a = 255, hi_a = 0; for (uint32_t i = 0; i < 16; i++) { - uint32_t a = pPixels[i].a; + uint32_t a = pPixels[i].m_c[3]; lo_a = minimumu(lo_a, a); hi_a = maximumu(hi_a, a); } @@ -1974,6 +2181,8 @@ static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const uint64_t mode5_err, mode5_alpha_err; handle_alpha_block_mode5(pPixels, pComp_params, pParams, lo_a, hi_a, &opt_results5, &mode5_err, &mode5_alpha_err); + mode5_err = (uint64_t)(mode5_err * pComp_params->m_mode5_error_weight + .5f); + if (mode5_err < best_err) { best_err = mode5_err; @@ -1981,7 +2190,7 @@ static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const } } - if ((best_err > 0) && (pComp_params->m_use_mode7_for_alpha)) + if ((best_err > 0) && (pComp_params->m_mode_mask & (1 << 7))) { const uint32_t trial_partition = estimate_partition(pPixels, pComp_params, pParams->m_weights, 7); @@ -1989,13 +2198,13 @@ static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const pParams->m_pSelector_weightsx = (const vec4F*)g_bc7_weights2x; pParams->m_num_selector_weights = 4; pParams->m_comp_bits = 5; - pParams->m_has_pbits = BC7ENC_TRUE; - pParams->m_endpoints_share_pbit = BC7ENC_FALSE; - pParams->m_has_alpha = BC7ENC_TRUE; + pParams->m_has_pbits = true; + pParams->m_endpoints_share_pbit = false; + pParams->m_has_alpha = true; const uint8_t* pPartition = &g_bc7_partition2[trial_partition * 16]; - color_quad_u8 subset_colors[2][16]; + color_rgba subset_colors[2][16]; uint32_t subset_total_colors7[2] = { 
0, 0 }; @@ -2022,14 +2231,16 @@ static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const pResults->m_pSelectors_temp = selectors_temp; uint64_t err = color_cell_compression(7, pParams, pResults, pComp_params); trial_err += err; - if (trial_err > best_err) + if ((uint64_t)(trial_err * pComp_params->m_mode7_error_weight + .5f) > best_err) break; } // subset - if (trial_err < best_err) + const uint64_t mode7_trial_err = (uint64_t)(trial_err * pComp_params->m_mode7_error_weight + .5f); + + if (mode7_trial_err < best_err) { - best_err = trial_err; + best_err = mode7_trial_err; best_mode = 7; opt_results7.m_mode = 7; opt_results7.m_partition = trial_partition; @@ -2073,43 +2284,56 @@ static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const encode_bc7_block(pBlock, &opt_results6); } + else + { + assert(0); + } } -static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc_compress_block_params *pComp_params, color_cell_compressor_params *pParams) +static void handle_opaque_block(void *pBlock, const color_rgba *pPixels, const bc7enc_compress_block_params *pComp_params, color_cell_compressor_params *pParams) { + assert((pComp_params->m_mode_mask & (1 << 6)) || (pComp_params->m_mode_mask & (1 << 1))); + uint8_t selectors_temp[16]; - - // Mode 6 + bc7_optimization_results opt_results; - - pParams->m_pSelector_weights = g_bc7_weights4; - pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x; - pParams->m_num_selector_weights = 16; - pParams->m_comp_bits = 7; - pParams->m_has_pbits = BC7ENC_TRUE; - pParams->m_endpoints_share_pbit = BC7ENC_FALSE; + + uint64_t best_err = UINT64_MAX; + pParams->m_perceptual = pComp_params->m_perceptual; pParams->m_num_pixels = 16; pParams->m_pPixels = pPixels; - pParams->m_has_alpha = BC7ENC_FALSE; + pParams->m_has_alpha = false; - color_cell_compressor_results results6; - results6.m_pSelectors = opt_results.m_selectors; - results6.m_pSelectors_temp = 
selectors_temp; - - uint64_t best_err = color_cell_compression(6, pParams, &results6, pComp_params); - - opt_results.m_mode = 6; opt_results.m_partition = 0; - opt_results.m_low[0] = results6.m_low_endpoint; - opt_results.m_high[0] = results6.m_high_endpoint; - opt_results.m_pbits[0][0] = results6.m_pbits[0]; - opt_results.m_pbits[0][1] = results6.m_pbits[1]; opt_results.m_index_selector = 0; opt_results.m_rotation = 0; + // Mode 6 + if (pComp_params->m_mode_mask & (1 << 6)) + { + pParams->m_pSelector_weights = g_bc7_weights4; + pParams->m_pSelector_weightsx = (const vec4F*)g_bc7_weights4x; + pParams->m_num_selector_weights = 16; + pParams->m_comp_bits = 7; + pParams->m_has_pbits = true; + pParams->m_endpoints_share_pbit = false; + + color_cell_compressor_results results6; + results6.m_pSelectors = opt_results.m_selectors; + results6.m_pSelectors_temp = selectors_temp; + + best_err = (uint64_t)(color_cell_compression(6, pParams, &results6, pComp_params) * pComp_params->m_mode6_error_weight + .5f); + + opt_results.m_mode = 6; + opt_results.m_low[0] = results6.m_low_endpoint; + opt_results.m_high[0] = results6.m_high_endpoint; + opt_results.m_pbits[0][0] = results6.m_pbits[0]; + opt_results.m_pbits[0][1] = results6.m_pbits[1]; + } + // Mode 1 - if ((best_err > 0) && (pComp_params->m_max_partitions_mode > 0)) + if ((best_err > 0) && (pComp_params->m_max_partitions > 0) && (pComp_params->m_mode_mask & (1 << 1))) { const uint32_t trial_partition = estimate_partition(pPixels, pComp_params, pParams->m_weights, 1); @@ -2117,12 +2341,12 @@ static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, cons pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights3x; pParams->m_num_selector_weights = 8; pParams->m_comp_bits = 6; - pParams->m_has_pbits = BC7ENC_TRUE; - pParams->m_endpoints_share_pbit = BC7ENC_TRUE; + pParams->m_has_pbits = true; + pParams->m_endpoints_share_pbit = true; const uint8_t *pPartition = &g_bc7_partition2[trial_partition * 16]; - 
color_quad_u8 subset_colors[2][16]; + color_rgba subset_colors[2][16]; uint32_t subset_total_colors1[2] = { 0, 0 }; @@ -2150,14 +2374,15 @@ static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, cons uint64_t err = color_cell_compression(1, pParams, pResults, pComp_params); trial_err += err; - if (trial_err > best_err) + if ((uint64_t)(trial_err * pComp_params->m_mode1_error_weight + .5f) > best_err) break; } // subset - if (trial_err < best_err) + const uint64_t mode1_trial_err = (uint64_t)(trial_err * pComp_params->m_mode1_error_weight + .5f); + if (mode1_trial_err < best_err) { - best_err = trial_err; + best_err = mode1_trial_err; opt_results.m_mode = 1; opt_results.m_partition = trial_partition; for (uint32_t subset = 0; subset < 2; subset++) @@ -2174,11 +2399,11 @@ static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, cons encode_bc7_block(pBlock, &opt_results); } -bc7enc_bool bc7enc_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc_compress_block_params *pComp_params) +bool bc7enc_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc_compress_block_params *pComp_params) { assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0); - const color_quad_u8 *pPixels = (const color_quad_u8 *)(pPixelsRGBA); + const color_rgba *pPixels = (const color_rgba *)(pPixelsRGBA); color_cell_compressor_params params; if (pComp_params->m_perceptual) @@ -2193,25 +2418,133 @@ bc7enc_bool bc7enc_compress_block(void *pBlock, const void *pPixelsRGBA, const b } else memcpy(params.m_weights, pComp_params->m_weights, sizeof(params.m_weights)); + + if (pComp_params->m_force_alpha) + { + handle_alpha_block(pBlock, pPixels, pComp_params, ¶ms); + return true; + } for (uint32_t i = 0; i < 16; i++) { - if (pPixels[i].a < 255) + if (pPixels[i].m_c[3] < 255) { handle_alpha_block(pBlock, pPixels, pComp_params, ¶ms); - return BC7ENC_TRUE; + return true; } } handle_opaque_block(pBlock, pPixels, pComp_params, ¶ms); - return 
BC7ENC_FALSE; + return false; } +/* +static const uint8_t g_tdefl_small_dist_extra[512] = +{ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7 +}; + +static const uint8_t g_tdefl_large_dist_extra[128] = +{ + 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 +}; + +static inline uint32_t compute_match_cost_estimate(uint32_t dist, uint32_t match_len_in_bytes) +{ + assert(match_len_in_bytes <= 258); + + uint32_t len_cost = 6; + if (match_len_in_bytes >= 12) + len_cost = 9; + else if (match_len_in_bytes >= 8) + len_cost = 8; + else if (match_len_in_bytes >= 6) + len_cost = 7; + + uint32_t dist_cost = 5; + if (dist < 512) + dist_cost += g_tdefl_small_dist_extra[dist & 511]; + else + { + dist_cost += g_tdefl_large_dist_extra[std::min(dist, 32767) >> 8]; + while (dist >= 32768) + { + dist_cost++; + dist >>= 1; + } + } + return len_cost + dist_cost; +} +*/ +class tracked_stat +{ +public: + tracked_stat() { clear(); } + + void clear() { m_num = 0; m_total = 0; m_total2 = 0; } + + void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; } + + tracked_stat& operator += (uint32_t val) { update(val); return *this; } + + uint32_t get_number_of_values() { return m_num; } + uint64_t get_total() const { return m_total; } + uint64_t get_total2() const { return m_total2; } + + float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }; + float get_std_dev() const { return m_num ? 
sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } + float get_variance() const { float s = get_std_dev(); return s * s; } + +private: + uint32_t m_num; + uint64_t m_total; + uint64_t m_total2; +}; + +/* +static inline float compute_block_max_std_dev(const color_rgba* pPixels) +{ + tracked_stat r_stats, g_stats, b_stats, a_stats; + + for (uint32_t i = 0; i < 16; i++) + { + r_stats.update(pPixels[i].m_c[0]); + g_stats.update(pPixels[i].m_c[1]); + b_stats.update(pPixels[i].m_c[2]); + a_stats.update(pPixels[i].m_c[3]); + } + + return std::max(std::max(std::max(r_stats.get_std_dev(), g_stats.get_std_dev()), b_stats.get_std_dev()), a_stats.get_std_dev()); +} +*/ +struct bc7_block +{ + uint8_t m_bytes[16]; + + uint32_t get_mode() const + { + uint32_t bc7_mode = 0; + while (((m_bytes[0] & (1 << bc7_mode)) == 0) && (bc7_mode < 8)) + bc7_mode++; + return bc7_mode; + } +}; + /* ------------------------------------------------------------------------------ This software is available under 2 licenses -- choose whichever you prefer. +If you use this software in a product, attribution / credits is requested but not required. ------------------------------------------------------------------------------ ALTERNATIVE A - MIT License -Copyright(c) 2020 Richard Geldreich, Jr. +Copyright(c) 2020-2021 Richard Geldreich, Jr. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the "Software"), to deal in the Software without restriction, including without limitation the rights to diff --git a/libkram/bc7enc/bc7enc.h b/libkram/bc7enc/bc7enc.h index 2dbd9101..8794c15d 100644 --- a/libkram/bc7enc/bc7enc.h +++ b/libkram/bc7enc/bc7enc.h @@ -1,23 +1,22 @@ // File: bc7enc.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc.c) +// If you use this software in a product, attribution / credits is requested but not required. 
#include #include - -#ifdef __cplusplus -extern "C" { -#endif +#include +#include #define BC7ENC_BLOCK_SIZE (16) -#define BC7ENC_MAX_PARTITIONS1 (64) +#define BC7ENC_MAX_PARTITIONS (64) #define BC7ENC_MAX_UBER_LEVEL (4) -typedef uint8_t bc7enc_bool; -#define BC7ENC_TRUE (1) -#define BC7ENC_FALSE (0) +struct color_rgba { uint8_t m_c[4]; }; -typedef struct +struct bc7enc_compress_block_params { - // m_max_partitions_mode may range from 0 (disables mode 1) to BC7ENC_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality. - uint32_t m_max_partitions_mode; + uint32_t m_mode_mask; + + // m_max_partitions may range from 0 (disables mode 1) to BC7ENC_MAX_PARTITIONS. The higher this value, the slower the compressor, but the higher the quality. + uint32_t m_max_partitions; // Relative RGBA or YCbCrA weights. uint32_t m_weights[4]; @@ -26,23 +25,58 @@ typedef struct uint32_t m_uber_level; // If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB. - bc7enc_bool m_perceptual; + bool m_perceptual; // Set m_try_least_squares to false for slightly faster/lower quality compression. - bc7enc_bool m_try_least_squares; + bool m_try_least_squares; - // When m_mode_partition_estimation_filterbank, the mode1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful. + // When m_mode17_partition_estimation_filterbank, the mode1 partition estimator skips lesser used partition patterns unless they are strongly predicted to be potentially useful. // There's a slight loss in quality with this enabled (around .08 dB RGB PSNR or .05 dB Y PSNR), but up to a 11% gain in speed depending on the other settings. 
- bc7enc_bool m_mode_partition_estimation_filterbank; + bool m_mode17_partition_estimation_filterbank; + + bool m_force_alpha; + + bool m_force_selectors; + uint8_t m_selectors[16]; + + bool m_quant_mode6_endpoints; + bool m_bias_mode1_pbits; - bc7enc_bool m_use_mode5_for_alpha; - bc7enc_bool m_use_mode7_for_alpha; + float m_pbit1_weight; -} bc7enc_compress_block_params; + float m_mode1_error_weight; + float m_mode5_error_weight; + float m_mode6_error_weight; + float m_mode7_error_weight; + + float m_low_frequency_partition_weight; + + void clear() + { + memset(this, 0, sizeof(*this)); + } + + void print() + { + printf("Mode mask: 0x%X\n", m_mode_mask); + printf("Max partitions: %u\n", m_max_partitions); + printf("Weights: %u %u %u %u\n", m_weights[0], m_weights[1], m_weights[2], m_weights[3]); + printf("Uber level: %u\n", m_uber_level); + printf("Perceptual: %u\n", m_perceptual); + printf("Try least squares: %u\n", m_try_least_squares); + printf("Mode 1/7 partition estimation filterbank: %u\n", m_mode17_partition_estimation_filterbank); + printf("Force alpha: %u\n", m_force_alpha); + printf("Quant mode 6 endpoints: %u\n", m_quant_mode6_endpoints); + printf("Bias mode 1 p-bits: %u\n", m_bias_mode1_pbits); + printf("p-bit 1 weight: %f\n", m_pbit1_weight); + printf("Mode error weights: %f %f %f %f\n", m_mode1_error_weight, m_mode5_error_weight, m_mode6_error_weight, m_mode7_error_weight); + printf("Low frequency partition weight: %f\n", m_low_frequency_partition_weight); + } +}; inline void bc7enc_compress_block_params_init_linear_weights(bc7enc_compress_block_params *p) { - p->m_perceptual = BC7ENC_FALSE; + p->m_perceptual = false; p->m_weights[0] = 1; p->m_weights[1] = 1; p->m_weights[2] = 1; @@ -51,7 +85,7 @@ inline void bc7enc_compress_block_params_init_linear_weights(bc7enc_compress_blo inline void bc7enc_compress_block_params_init_perceptual_weights(bc7enc_compress_block_params *p) { - p->m_perceptual = BC7ENC_TRUE; + p->m_perceptual = true; p->m_weights[0] = 
128; p->m_weights[1] = 64; p->m_weights[2] = 16; @@ -60,23 +94,30 @@ inline void bc7enc_compress_block_params_init_perceptual_weights(bc7enc_compress inline void bc7enc_compress_block_params_init(bc7enc_compress_block_params *p) { - p->m_max_partitions_mode = BC7ENC_MAX_PARTITIONS1; - p->m_try_least_squares = BC7ENC_TRUE; - p->m_mode_partition_estimation_filterbank = BC7ENC_TRUE; + p->m_mode_mask = UINT32_MAX; + p->m_max_partitions = BC7ENC_MAX_PARTITIONS; + p->m_try_least_squares = true; + p->m_mode17_partition_estimation_filterbank = true; p->m_uber_level = 0; - p->m_use_mode5_for_alpha = BC7ENC_TRUE; - p->m_use_mode7_for_alpha = BC7ENC_TRUE; + p->m_force_selectors = false; + p->m_force_alpha = false; + p->m_quant_mode6_endpoints = false; + p->m_bias_mode1_pbits = false; + p->m_pbit1_weight = 1.0f; + p->m_mode1_error_weight = 1.0f; + p->m_mode5_error_weight = 1.0f; + p->m_mode6_error_weight = 1.0f; + p->m_mode7_error_weight = 1.0f; + p->m_low_frequency_partition_weight = 1.0f; bc7enc_compress_block_params_init_perceptual_weights(p); } // bc7enc_compress_block_init() MUST be called before calling bc7enc_compress_block() (or you'll get artifacts). void bc7enc_compress_block_init(); -// Packs a single block of 4x4=16 RGBA pixels (R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6. +// Packs a single block of 16x16 RGBA pixels (R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6. // Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6. -// Returns BC7ENC_TRUE if the block had any pixels with alpha < 255, otherwise it return BC7ENC_FALSE. (This is not an error code - a block is always encoded.) -bc7enc_bool bc7enc_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc_compress_block_params *pComp_params); +// Returns true if the block had any pixels with alpha < 255, otherwise it return false. (This is not an error code - a block is always encoded.) 
+bool bc7enc_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc_compress_block_params *pComp_params); + -#ifdef __cplusplus -} -#endif diff --git a/libkram/bc7enc/ert.cpp b/libkram/bc7enc/ert.cpp new file mode 100644 index 00000000..c09b9668 --- /dev/null +++ b/libkram/bc7enc/ert.cpp @@ -0,0 +1,705 @@ +#include "ert.h" +#include +#include +#include +#include "utils.h" + +#define ERT_FAVOR_CONT_AND_REP0_MATCHES (1) +#define ERT_FAVOR_REP0_MATCHES (0) + +namespace ert +{ + const uint32_t MAX_BLOCK_PIXELS = 12 * 12; + const uint32_t MAX_BLOCK_SIZE_IN_BYTES = 256; + const uint32_t MIN_MATCH_LEN = 3; + const float LITERAL_BITS = 13.0f; + const float MATCH_CONTINUE_BITS = 1.0f; + const float MATCH_REP0_BITS = 4.0f; + + static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } + template inline F lerp(F a, F b, F s) { return a + (b - a) * s; } + + static const uint8_t g_tdefl_small_dist_extra[512] = + { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7 + }; + + static const uint8_t g_tdefl_large_dist_extra[128] = + { + 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 + }; + + static inline uint32_t compute_match_cost_estimate(uint32_t dist, uint32_t match_len_in_bytes) + { + assert(match_len_in_bytes <= 258); + + uint32_t len_cost = 6; + if (match_len_in_bytes >= 12) + len_cost = 9; + else if (match_len_in_bytes >= 8) + len_cost = 8; + else if (match_len_in_bytes >= 6) + len_cost = 7; + + uint32_t dist_cost = 5; + if (dist < 512) + dist_cost += g_tdefl_small_dist_extra[dist & 511]; + else + { + dist_cost += g_tdefl_large_dist_extra[std::min(dist, 32767) >> 8]; + while (dist >= 32768) + { + dist_cost++; + dist >>= 1; + } + } + return len_cost + dist_cost; + } + + class tracked_stat + { + public: + tracked_stat() { clear(); } + + void clear() { m_num = 0; m_total = 0; m_total2 = 0; } + + void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; } + + tracked_stat& operator += (uint32_t val) 
{ update(val); return *this; } + + uint32_t get_number_of_values() { return m_num; } + uint64_t get_total() const { return m_total; } + uint64_t get_total2() const { return m_total2; } + + float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }; + float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } + float get_variance() const { float s = get_std_dev(); return s * s; } + + private: + uint32_t m_num; + uint64_t m_total; + uint64_t m_total2; + }; + + static inline float compute_block_max_std_dev(const color_rgba* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps) + { + tracked_stat comp_stats[4]; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba* pPixel = pPixels + x + y * block_width; + + for (uint32_t c = 0; c < num_comps; c++) + comp_stats[c].update(pPixel->m_c[c]); + } + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < num_comps; i++) + max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev()); + return max_std_dev; + } + + static inline float compute_block_mse(const color_rgba* pPixelsA, const color_rgba* pPixelsB, uint32_t block_width, uint32_t block_height, uint32_t total_block_pixels, uint32_t num_comps, const uint32_t weights[4], float one_over_total_color_weight) + { + uint64_t total_err = 0; + + if ((block_width == 4) && (block_height == 4) && (num_comps == 4)) + { + if ((weights[0] == 1) && (weights[1] == 1) && (weights[2] == 1) && (weights[3] == 1)) + { + for (uint32_t i = 0; i < 16; i++) + { + const color_rgba* pA = pPixelsA + i; + const color_rgba* pB = pPixelsB + i; + + const int dr = pA->m_c[0] - pB->m_c[0]; + const int dg = pA->m_c[1] - pB->m_c[1]; + const int db = pA->m_c[2] - pB->m_c[2]; + const int da = pA->m_c[3] - pB->m_c[3]; + + total_err += dr * dr + dg * dg + db * db + da * da; + } + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + const 
color_rgba* pA = pPixelsA + i; + const color_rgba* pB = pPixelsB + i; + + const int dr = pA->m_c[0] - pB->m_c[0]; + const int dg = pA->m_c[1] - pB->m_c[1]; + const int db = pA->m_c[2] - pB->m_c[2]; + const int da = pA->m_c[3] - pB->m_c[3]; + + total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db + weights[3] * da * da; + } + } + } + else if ((block_width == 4) && (block_height == 4) && (num_comps == 3)) + { + for (uint32_t y = 0; y < 4; y++) + { + const uint32_t y_ofs = y * 4; + for (uint32_t x = 0; x < 4; x++) + { + const color_rgba* pA = pPixelsA + x + y_ofs; + const color_rgba* pB = pPixelsB + x + y_ofs; + + const int dr = pA->m_c[0] - pB->m_c[0]; + const int dg = pA->m_c[1] - pB->m_c[1]; + const int db = pA->m_c[2] - pB->m_c[2]; + + total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db; + } + } + } + else if ((block_width == 4) && (block_height == 4) && (num_comps == 2)) + { + for (uint32_t y = 0; y < 4; y++) + { + const uint32_t y_ofs = y * 4; + for (uint32_t x = 0; x < 4; x++) + { + const color_rgba* pA = pPixelsA + x + y_ofs; + const color_rgba* pB = pPixelsB + x + y_ofs; + + const int dr = pA->m_c[0] - pB->m_c[0]; + const int dg = pA->m_c[1] - pB->m_c[1]; + + total_err += weights[0] * dr * dr + weights[1] * dg * dg; + } + } + } + else if ((block_width == 4) && (block_height == 4) && (num_comps == 1)) + { + for (uint32_t y = 0; y < 4; y++) + { + const uint32_t y_ofs = y * 4; + for (uint32_t x = 0; x < 4; x++) + { + const color_rgba* pA = pPixelsA + x + y_ofs; + const color_rgba* pB = pPixelsB + x + y_ofs; + + const int dr = pA->m_c[0] - pB->m_c[0]; + + total_err += weights[0] * dr * dr; + } + } + } + else + { + for (uint32_t y = 0; y < block_height; y++) + { + const uint32_t y_ofs = y * block_width; + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba* pA = pPixelsA + x + y_ofs; + const color_rgba* pB = pPixelsB + x + y_ofs; + + for (uint32_t c = 0; c < num_comps; c++) + { + const int d = 
pA->m_c[c] - pB->m_c[c]; + total_err += weights[c] * d * d; + } + } + } + } + + return total_err * (one_over_total_color_weight / total_block_pixels); + } + + uint32_t hash_hsieh(const uint8_t* pBuf, size_t len, uint32_t salt) + { + if (!pBuf || !len) + return 0; + + uint32_t h = static_cast(len + (salt << 16)); + + const uint32_t bytes_left = len & 3; + len >>= 2; + + while (len--) + { + const uint16_t* pWords = reinterpret_cast(pBuf); + + h += pWords[0]; + + const uint32_t t = (pWords[1] << 11) ^ h; + h = (h << 16) ^ t; + + pBuf += sizeof(uint32_t); + + h += h >> 11; + } + + switch (bytes_left) + { + case 1: + h += *reinterpret_cast(pBuf); + h ^= h << 10; + h += h >> 1; + break; + case 2: + h += *reinterpret_cast(pBuf); + h ^= h << 11; + h += h >> 17; + break; + case 3: + h += *reinterpret_cast(pBuf); + h ^= h << 16; + h ^= (static_cast(pBuf[sizeof(uint16_t)])) << 18; + h += h >> 11; + break; + default: + break; + } + + h ^= h << 3; + h += h >> 5; + h ^= h << 4; + h += h >> 17; + h ^= h << 25; + h += h >> 6; + + return h; + } + + // BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations + bool reduce_entropy(void* pBlocks, uint32_t num_blocks, + uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps, + const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified, + pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data, + std::vector* pBlock_mse_scales) + { + assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes); + assert(total_block_stride_in_bytes >= block_size_to_optimize_in_bytes); + + assert(num_comps >= 1 && num_comps <= 4); + for (uint32_t i = num_comps; i < 4; i++) + { + assert(!params.m_color_weights[i]); + if (params.m_color_weights[i]) + return false; + } + + const uint32_t total_color_weight = params.m_color_weights[0] + params.m_color_weights[1] + params.m_color_weights[2] + 
params.m_color_weights[3]; + assert(total_color_weight); + const float one_over_total_color_weight = 1.0f / total_color_weight; + + assert((block_size_to_optimize_in_bytes >= MIN_MATCH_LEN) && (block_size_to_optimize_in_bytes <= MAX_BLOCK_SIZE_IN_BYTES)); + if ((block_size_to_optimize_in_bytes < MIN_MATCH_LEN) || (block_size_to_optimize_in_bytes > MAX_BLOCK_SIZE_IN_BYTES)) + return false; + + uint8_t* pBlock_bytes = (uint8_t*)pBlocks; + + const uint32_t total_block_pixels = block_width * block_height; + if (total_block_pixels > MAX_BLOCK_PIXELS) + return false; + + const int total_blocks_to_check = std::max(1U, params.m_lookback_window_size / total_block_stride_in_bytes); + + std::vector len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); + std::vector second_len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); + uint32_t total_second_matches = 0; + + int prev_match_window_ofs_to_favor_cont = -1, prev_match_dist_to_favor = -1; + + uint32_t total_smooth_blocks = 0; + + const uint32_t HASH_SIZE = 8192; + uint32_t hash[HASH_SIZE]; + + for (uint32_t block_index = 0; block_index < num_blocks; block_index++) + { + if ((block_index & 0xFF) == 0) + memset(hash, 0, sizeof(hash)); + + uint8_t* pOrig_block = &pBlock_bytes[block_index * total_block_stride_in_bytes]; + const color_rgba* pPixels = &pBlock_pixels[block_index * total_block_pixels]; + + color_rgba decoded_block[MAX_BLOCK_PIXELS]; + if (!(*pUnpack_block_func)(pOrig_block, decoded_block, block_index, pUnpack_block_func_user_data)) + return false; + + float cur_mse = compute_block_mse(pPixels, decoded_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); + + if ((params.m_skip_zero_mse_blocks) && (cur_mse == 0.0f)) + continue; + + const float max_std_dev = compute_block_max_std_dev(pPixels, block_width, block_height, num_comps); + + float yl = clampf(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f); + yl = yl * yl; + float smooth_block_mse_scale = 
lerp(params.m_smooth_block_max_mse_scale, 1.0f, yl); + + if (pBlock_mse_scales) + { + if ((*pBlock_mse_scales)[block_index] > 0.0f) + { + smooth_block_mse_scale = (*pBlock_mse_scales)[block_index]; + } + } + + if (smooth_block_mse_scale > 1.0f) + total_smooth_blocks++; + + float cur_bits = (LITERAL_BITS * block_size_to_optimize_in_bytes); + float cur_t = cur_mse * smooth_block_mse_scale + cur_bits * params.m_lambda; + + int first_block_to_check = std::max(0, block_index - total_blocks_to_check); + int last_block_to_check = block_index - 1; + + uint8_t best_block[MAX_BLOCK_SIZE_IN_BYTES]; + memcpy(best_block, pOrig_block, block_size_to_optimize_in_bytes); + + float best_t = cur_t; + uint32_t best_match_len = 0, best_match_src_window_ofs = 0, best_match_dst_window_ofs = 0, best_match_src_block_ofs = 0, best_match_dst_block_ofs = 0; + float best_match_bits = 0; + + // Don't let thresh_ms_err be 0 to let zero error blocks have slightly increased distortion + const float thresh_ms_err = params.m_max_allowed_rms_increase_ratio * params.m_max_allowed_rms_increase_ratio * std::max(cur_mse, 1.0f); + + for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index) + { + const uint8_t* pPrev_blk = &pBlock_bytes[prev_block_index * total_block_stride_in_bytes]; + + for (uint32_t len = block_size_to_optimize_in_bytes; len >= MIN_MATCH_LEN; len--) + { + if (params.m_allow_relative_movement) + { + for (uint32_t src_ofs = 0; src_ofs <= (block_size_to_optimize_in_bytes - len); src_ofs++) + { + assert(len + src_ofs <= block_size_to_optimize_in_bytes); + + const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + src_ofs; + + for (uint32_t dst_ofs = 0; dst_ofs <= (block_size_to_optimize_in_bytes - len); dst_ofs++) + { + assert(len + dst_ofs <= block_size_to_optimize_in_bytes); + + const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + dst_ofs; + + const uint32_t match_dist = 
dst_match_window_ofs - src_match_window_ofs; + + float trial_match_bits, trial_total_bits; + + uint32_t hs = hash_hsieh(pPrev_blk + src_ofs, len, dst_ofs); + +#if ERT_FAVOR_CONT_AND_REP0_MATCHES + // Continue a previous match (which would cross block boundaries) + if (((int)src_match_window_ofs == prev_match_window_ofs_to_favor_cont) && (dst_ofs == 0)) + { + trial_match_bits = MATCH_CONTINUE_BITS; + trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_CONTINUE_BITS; + } + // Exploit REP0 matches + else if ((prev_match_dist_to_favor != -1) && (src_match_window_ofs == (dst_match_window_ofs - prev_match_dist_to_favor))) + { + trial_match_bits = MATCH_REP0_BITS; + trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_REP0_BITS; + } + else + { + trial_match_bits = (float)compute_match_cost_estimate(match_dist, len); + trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + trial_match_bits; + + uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; + if ((hash_check & 0xFF) == (block_index & 0xFF)) + { + if ((hash_check >> 8) == (hs >> 8)) + continue; + } + } +#else + uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; + if ((hash_check & 0xFF) == (block_index & 0xFF)) + { + if ((hash_check >> 8) == (hs >> 8)) + continue; + } +#endif + + hash[hs & (HASH_SIZE - 1)] = (hs & 0xFFFFFF00) | (block_index & 0xFF); + + const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda; + + uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES]; + memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes); + memcpy(trial_block + dst_ofs, pPrev_blk + src_ofs, len); + + color_rgba decoded_trial_block[MAX_BLOCK_PIXELS]; + if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, pUnpack_block_func_user_data)) + continue; + + float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, 
one_over_total_color_weight); + + if (trial_mse < thresh_ms_err) + { + float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda; + + if (t < best_t) + { + best_t = t; + memcpy(best_block, trial_block, block_size_to_optimize_in_bytes); + best_match_len = len; + best_match_src_window_ofs = src_match_window_ofs; + best_match_dst_window_ofs = dst_match_window_ofs; + best_match_src_block_ofs = src_ofs; + best_match_dst_block_ofs = dst_ofs; + best_match_bits = trial_match_bits; + } + } + + } // dst_ofs + } // src_ofs + } + else + { + const uint32_t match_dist = (block_index - prev_block_index) * total_block_stride_in_bytes; + + // Assume the block has 1 match and block_size_to_optimize_in_bytes-match_len literals. + const float trial_match_bits = (float)compute_match_cost_estimate(match_dist, len); + const float trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + trial_match_bits; + const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda; + + for (uint32_t ofs = 0; ofs <= (block_size_to_optimize_in_bytes - len); ofs++) + { + assert(len + ofs <= block_size_to_optimize_in_bytes); + + const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + ofs; + const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + ofs; + + float trial_match_bits_to_use = trial_match_bits; + float trial_total_bits_times_lambda_to_use = trial_total_bits_times_lambda; + + uint32_t hs = hash_hsieh(pPrev_blk + ofs, len, ofs); + +#if ERT_FAVOR_CONT_AND_REP0_MATCHES + // Continue a previous match (which would cross block boundaries) + if (((int)src_match_window_ofs == prev_match_window_ofs_to_favor_cont) && (ofs == 0)) + { + float continue_match_trial_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_CONTINUE_BITS; + trial_match_bits_to_use = MATCH_CONTINUE_BITS; + trial_total_bits_times_lambda_to_use = continue_match_trial_bits * params.m_lambda; + } + // Exploit REP0 
matches + else if ((prev_match_dist_to_favor != -1) && (src_match_window_ofs == (dst_match_window_ofs - prev_match_dist_to_favor))) + { + float continue_match_trial_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_REP0_BITS; + trial_match_bits_to_use = MATCH_REP0_BITS; + trial_total_bits_times_lambda_to_use = continue_match_trial_bits * params.m_lambda; + } + else + { + uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; + if ((hash_check & 0xFF) == (block_index & 0xFF)) + { + if ((hash_check >> 8) == (hs >> 8)) + continue; + } + } +#else + uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; + if ((hash_check & 0xFF) == (block_index & 0xFF)) + { + if ((hash_check >> 8) == (hs >> 8)) + continue; + } +#endif + + hash[hs & (HASH_SIZE - 1)] = (hs & 0xFFFFFF00) | (block_index & 0xFF); + + uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES]; + memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes); + memcpy(trial_block + ofs, pPrev_blk + ofs, len); + + color_rgba decoded_trial_block[MAX_BLOCK_PIXELS]; + if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, pUnpack_block_func_user_data)) + continue; + + float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); + + if (trial_mse < thresh_ms_err) + { + float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda_to_use; + + if (t < best_t) + { + best_t = t; + memcpy(best_block, trial_block, block_size_to_optimize_in_bytes); + best_match_len = len; + best_match_src_window_ofs = src_match_window_ofs; + best_match_dst_window_ofs = dst_match_window_ofs; + best_match_src_block_ofs = ofs; + best_match_dst_block_ofs = ofs; + best_match_bits = trial_match_bits_to_use; + } + } + } // ofs + } + + } // len + + } // prev_block_index + + if (best_t < cur_t) + { + uint32_t best_second_match_len = 0, best_second_match_src_window_ofs = 0, 
best_second_match_dst_window_ofs = 0, best_second_match_src_block_ofs = 0, best_second_match_dst_block_ofs = 0; + + // Try injecting a second match, being sure it does't overlap with the first. + if ((params.m_try_two_matches) && (best_match_len <= (block_size_to_optimize_in_bytes - 3))) + { + uint8_t matched_flags[MAX_BLOCK_SIZE_IN_BYTES]; + memset(matched_flags, 0, sizeof(matched_flags)); + memset(matched_flags + best_match_dst_block_ofs, 1, best_match_len); + + uint8_t orig_best_block[MAX_BLOCK_SIZE_IN_BYTES]; + memcpy(orig_best_block, best_block, block_size_to_optimize_in_bytes); + + for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index) + { + const uint8_t* pPrev_blk = &pBlock_bytes[prev_block_index * total_block_stride_in_bytes]; + + const uint32_t match_dist = (block_index - prev_block_index) * total_block_stride_in_bytes; + + for (uint32_t len = 3; len <= (block_size_to_optimize_in_bytes - best_match_len); len++) + { + const float trial_total_bits = (block_size_to_optimize_in_bytes - len - best_match_len) * LITERAL_BITS + compute_match_cost_estimate(match_dist, len) + best_match_bits; + + const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda; + + for (uint32_t ofs = 0; ofs <= (block_size_to_optimize_in_bytes - len); ofs++) + { + int i; + for (i = 0; i < (int)len; i++) + if (matched_flags[ofs + i]) + break; + if (i != (int)len) + continue; + + assert(len + ofs <= block_size_to_optimize_in_bytes); + + const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + ofs; + const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + ofs; + + uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES]; + memcpy(trial_block, orig_best_block, block_size_to_optimize_in_bytes); + memcpy(trial_block + ofs, pPrev_blk + ofs, len); + + color_rgba decoded_trial_block[MAX_BLOCK_PIXELS]; + if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, 
pUnpack_block_func_user_data)) + continue; + + float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); + + if (trial_mse < thresh_ms_err) + { + float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda; + + if (t < best_t) + { + best_t = t; + memcpy(best_block, trial_block, block_size_to_optimize_in_bytes); + best_second_match_len = len; + best_second_match_src_window_ofs = src_match_window_ofs; + best_second_match_dst_window_ofs = dst_match_window_ofs; + best_second_match_src_block_ofs = ofs; + best_second_match_dst_block_ofs = ofs; + } + } + } + } + } + } + + memcpy(pOrig_block, best_block, block_size_to_optimize_in_bytes); + total_modified++; + + if ((best_second_match_len == 0) || (best_match_dst_window_ofs > best_second_match_dst_window_ofs)) + { + int best_match_dist = best_match_dst_window_ofs - best_match_src_window_ofs; + assert(best_match_dist >= 1); + (void)best_match_dist; + + if (block_size_to_optimize_in_bytes == total_block_stride_in_bytes) + { + // If the match goes all the way to the end of a block, we can try to continue it on the next encoded block. + if ((best_match_dst_block_ofs + best_match_len) == total_block_stride_in_bytes) + prev_match_window_ofs_to_favor_cont = best_match_src_window_ofs + best_match_len; + else + prev_match_window_ofs_to_favor_cont = -1; + } + +#if ERT_FAVOR_REP0_MATCHES + // Compute the window offset where a cheaper REP0 match would be available + prev_match_dist_to_favor = best_match_dist; +#endif + } + else + { + int best_match_dist = best_second_match_dst_window_ofs - best_second_match_src_window_ofs; + assert(best_match_dist >= 1); + (void)best_match_dist; + + if (block_size_to_optimize_in_bytes == total_block_stride_in_bytes) + { + // If the match goes all the way to the end of a block, we can try to continue it on the next encoded block. 
+ if ((best_second_match_dst_block_ofs + best_second_match_len) == total_block_stride_in_bytes) + prev_match_window_ofs_to_favor_cont = best_second_match_src_window_ofs + best_second_match_len; + else + prev_match_window_ofs_to_favor_cont = -1; + } + +#if ERT_FAVOR_REP0_MATCHES + // Compute the window offset where a cheaper REP0 match would be available + prev_match_dist_to_favor = best_match_dist; +#endif + } + + len_hist[best_match_len]++; + + if (best_second_match_len) + { + second_len_hist[best_second_match_len]++; + total_second_matches++; + } + } + else + { + prev_match_window_ofs_to_favor_cont = -1; + } + + } // block_index + + if (params.m_debug_output) + { + printf("Total smooth blocks: %3.2f%%\n", total_smooth_blocks * 100.0f / num_blocks); + + printf("Match length histogram:\n"); + for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++) + printf("%u%c", len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n'); + + printf("Total second matches: %u %3.2f%%\n", total_second_matches, total_second_matches * 100.0f / num_blocks); + printf("Secod match length histogram:\n"); + for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++) + printf("%u%c", second_len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n'); + } + + return true; + } + +} // namespace ert + diff --git a/libkram/bc7enc/ert.h b/libkram/bc7enc/ert.h new file mode 100644 index 00000000..d387f527 --- /dev/null +++ b/libkram/bc7enc/ert.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ert +{ + struct color_rgba { uint8_t m_c[4]; }; + + struct reduce_entropy_params + { + // m_lambda: The post-processor tries to reduce distortion*smooth_block_scale + rate*lambda (rate is approximate LZ bits and distortion is scaled MS error multiplied against the smooth block MSE weighting factor). 
+ // Larger values push the postprocessor towards optimizing more for lower rate, and smaller values more for distortion. 0=minimal distortion. + float m_lambda; + + // m_lookback_window_size: The number of bytes the encoder can look back from each block to find matches. The larger this value, the slower the encoder but the higher the quality per LZ compressed bit. + uint32_t m_lookback_window_size; + + // m_max_allowed_rms_increase_ratio: How much the RMS error of a block is allowed to increase before a trial is rejected. 1.0=no increase allowed, 1.05=5% increase allowed, etc. + float m_max_allowed_rms_increase_ratio; + + float m_max_smooth_block_std_dev; + float m_smooth_block_max_mse_scale; + + uint32_t m_color_weights[4]; + + bool m_try_two_matches; + bool m_allow_relative_movement; + bool m_skip_zero_mse_blocks; + bool m_debug_output; + + reduce_entropy_params() { clear(); } + + void clear() + { + m_lookback_window_size = 256; + m_lambda = 1.0f; + m_max_allowed_rms_increase_ratio = 10.0f; + m_max_smooth_block_std_dev = 18.0f; + m_smooth_block_max_mse_scale = 10.0f; + m_color_weights[0] = 1; + m_color_weights[1] = 1; + m_color_weights[2] = 1; + m_color_weights[3] = 1; + m_try_two_matches = false; + m_allow_relative_movement = false; + m_skip_zero_mse_blocks = false; + m_debug_output = false; + } + + void print() + { + printf("lambda: %f\n", m_lambda); + printf("Lookback window size: %u\n", m_lookback_window_size); + printf("Max allowed RMS increase ratio: %f\n", m_max_allowed_rms_increase_ratio); + printf("Max smooth block std dev: %f\n", m_max_smooth_block_std_dev); + printf("Smooth block max MSE scale: %f\n", m_smooth_block_max_mse_scale); + printf("Color weights: %u %u %u %u\n", m_color_weights[0], m_color_weights[1], m_color_weights[2], m_color_weights[3]); + printf("Try two matches: %u\n", m_try_two_matches); + printf("Allow relative movement: %u\n", m_allow_relative_movement); + printf("Skip zero MSE blocks: %u\n", m_skip_zero_mse_blocks); + } + }; + + 
typedef bool (*pUnpack_block_func)(const void* pBlock, color_rgba* pPixels, uint32_t block_index, void* pUser_data); + + // BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations + bool reduce_entropy(void* pBlocks, uint32_t num_blocks, + uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps, + const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified, + pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data, + std::vector* pBlock_mse_scales = nullptr); + +} // namespace ert diff --git a/libkram/bc7enc/rdo_bc_encoder.cpp b/libkram/bc7enc/rdo_bc_encoder.cpp new file mode 100644 index 00000000..44d39333 --- /dev/null +++ b/libkram/bc7enc/rdo_bc_encoder.cpp @@ -0,0 +1,1270 @@ +// rdo_bc_encoder.cpp +#include "rdo_bc_encoder.h" + +#define RGBCX_IMPLEMENTATION +#include "rgbcx.h" + +#define DECODE_BC4_TO_GRAYSCALE (0) + +#ifdef _MSC_VER +#pragma warning (disable: 4127) // conditional expression is constant +#endif + +using namespace utils; + +namespace rdo_bc +{ + static const char* get_dxgi_format_string(DXGI_FORMAT fmt) + { + switch (fmt) + { + case DXGI_FORMAT_BC1_UNORM: return "BC1_UNORM"; + case DXGI_FORMAT_BC4_UNORM: return "BC4_UNORM"; + case DXGI_FORMAT_BC3_UNORM: return "BC3_UNORM"; + case DXGI_FORMAT_BC5_UNORM: return "BC5_UNORM"; + case DXGI_FORMAT_BC7_UNORM: return "BC7_UNORM"; + default: break; + } + return "?"; + } + + static std::vector compute_block_mse_scales(const image_u8& source_image, uint32_t blocks_x, uint32_t blocks_y, uint32_t total_blocks, bool rdo_debug_output) + { + const float ULTRASMOOTH_BLOCK_STD_DEV_THRESHOLD = 2.9f; + const float DARK_THRESHOLD = 13.0f; + const float BRIGHT_THRESHOLD = 222.0f; + const float ULTRAMOOTH_BLOCK_MSE_SCALE = 120.0f; + const uint32_t ULTRASMOOTH_REGION_TOO_SMALL_THRESHOLD = 64; + + image_u8 ultrasmooth_blocks_vis(blocks_x, blocks_y); + + for (uint32_t 
by = 0; by < blocks_y; by++) + { + for (uint32_t bx = 0; bx < blocks_x; bx++) + { + color_quad_u8 block_pixels[16]; + source_image.get_block(bx, by, 4, 4, block_pixels); + + tracked_stat y_stats; + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + { + int l = block_pixels[x + y * 4].get_luma(); + y_stats.update(l); + } + + float max_std_dev = compute_block_max_std_dev((color_quad_u8*)block_pixels, 4, 4, 3); + + float yl = max_std_dev / ULTRASMOOTH_BLOCK_STD_DEV_THRESHOLD; + + yl = clamp(yl, 0.0f, 1.0f); + yl *= yl; + + float y_avg = y_stats.get_mean(); + + if ((y_avg < DARK_THRESHOLD) || (y_avg >= BRIGHT_THRESHOLD)) + yl = 1.0f; + + int k = std::min((int)(yl * 255.0f + .5f), 255); + + ultrasmooth_blocks_vis.fill_box(bx, by, 1, 1, color_quad_u8((uint8_t)k, 255)); + } + } + + for (int pass = 0; pass < 1; pass++) + { + image_u8 next_vis(ultrasmooth_blocks_vis); + + for (int y = 0; y < (int)blocks_y; y++) + { + for (int x = 0; x < (int)blocks_x; x++) + { + int m = 0; + + for (int dy = -1; dy <= 1; dy++) + for (int dx = -1; dx <= 1; dx++) + { + if (ultrasmooth_blocks_vis.get_clamped(x + dx, y + dy).r == 255) + m = std::max(m, ultrasmooth_blocks_vis.get_clamped(x + dx, y + dy).r); + } + + next_vis(x, y).set((uint8_t)m, 255); + } + } + + ultrasmooth_blocks_vis.swap(next_vis); + } + + for (uint32_t pass = 0; pass < 32; pass++) + { + image_u8 next_vis(ultrasmooth_blocks_vis); + for (int y = 0; y < (int)blocks_y; y++) + { + for (int x = 0; x < (int)blocks_x; x++) + { + if (ultrasmooth_blocks_vis.get_clamped(x, y).r < 255) + { + int m = 0; + + for (int dy = -1; dy <= 1; dy++) + for (int dx = -1; dx <= 1; dx++) + if (ultrasmooth_blocks_vis.get_clamped(x + dx, y + dy).r == 255) + m++; + + if (m >= 5) + next_vis.set_pixel_clipped(x, y, color_quad_u8(255, 255, 255, 255)); + } + } + } + ultrasmooth_blocks_vis.swap(next_vis); + } + + image_u8 orig_ultrasmooth_blocks_vis(ultrasmooth_blocks_vis); + + if (rdo_debug_output) + { + 
save_png("ultrasmooth_block_mask_pre_filter.png", ultrasmooth_blocks_vis, false); + } + + for (uint32_t by = 0; by < blocks_y; by++) + { + for (uint32_t bx = 0; bx < blocks_x; bx++) + { + const bool is_ultrasmooth = ultrasmooth_blocks_vis(bx, by).r == 0; + if (!is_ultrasmooth) + continue; + + std::vector filled_pixels; + filled_pixels.reserve(256); + + uint32_t total_set_pixels = ultrasmooth_blocks_vis.flood_fill(bx, by, color_quad_u8(255, 255, 255, 255), color_quad_u8(0, 0, 0, 255), &filled_pixels); + + if (total_set_pixels < ULTRASMOOTH_REGION_TOO_SMALL_THRESHOLD) + { + for (uint32_t i = 0; i < filled_pixels.size(); i++) + orig_ultrasmooth_blocks_vis(filled_pixels[i].m_x, filled_pixels[i].m_y) = color_quad_u8(255, 255, 255, 255); + } + + } // bx + } // by + + ultrasmooth_blocks_vis = orig_ultrasmooth_blocks_vis; + + if (rdo_debug_output) + { + save_png("ultrasmooth_block_mask.png", ultrasmooth_blocks_vis, false); + } + + std::vector block_mse_scales(total_blocks); + + uint32_t total_ultrasmooth_blocks = 0; + for (uint32_t by = 0; by < blocks_y; by++) + { + for (uint32_t bx = 0; bx < blocks_x; bx++) + { + const bool is_ultrasmooth = ultrasmooth_blocks_vis(bx, by).r == 0; + + block_mse_scales[bx + by * blocks_x] = is_ultrasmooth ? 
ULTRAMOOTH_BLOCK_MSE_SCALE : -1.0f; + + total_ultrasmooth_blocks += is_ultrasmooth; + } + } + + if (rdo_debug_output) + printf("Total ultrasmooth blocks: %3.2f%%\n", total_ultrasmooth_blocks * 100.0f / total_blocks); + + return block_mse_scales; + } + + rdo_bc_encoder::rdo_bc_encoder() : + m_pOrig_source_image(nullptr), + m_orig_width(0), + m_orig_height(0), + m_blocks_x(0), + m_blocks_y(0), + m_total_blocks(0), + m_bytes_per_block(0), + m_pixel_format_bpp(0), + m_total_texels(0), + m_has_alpha(false) + { + } + + void rdo_bc_encoder::clear() + { + m_pOrig_source_image = nullptr; + + m_source_image.clear(); + + m_params.clear(); + + m_orig_width = 0; + m_orig_height = 0; + m_blocks_x = 0; + m_blocks_y = 0; + m_total_blocks = 0; + m_bytes_per_block = 0; + m_pixel_format_bpp = 0; + m_total_texels = 0; + m_has_alpha = false; + + m_packed_image8.clear(); + m_packed_image16.clear(); + + m_prerdo_packed_image8.clear(); + m_prerdo_packed_image16.clear(); + + m_bc7enc_pack_params.clear(); +#if SUPPORT_BC7E + memset(&m_bc7e_pack_params, 0, sizeof(m_bc7e_pack_params)); +#endif + } + + bool rdo_bc_encoder::init(const utils::image_u8& src_image, rdo_bc_params& params) + { + clear(); + + m_pOrig_source_image = &src_image; + m_params = params; + + init_encoders(); + + if (!init_source_image()) + return false; + + return true; + } + + bool rdo_bc_encoder::encode() + { + if (!m_packed_image8.size() && !m_packed_image16.size()) + return false; + + if (!init_encoder_params()) + return false; + + if (!encode_texture()) + return false; + + if (!postprocess_rdo()) + return false; + + return true; + } + + void rdo_bc_encoder::init_encoders() + { + rgbcx::init(m_params.m_bc1_mode); + bc7enc_compress_block_init(); +#if SUPPORT_BC7E + ispc::bc7e_compress_block_init(); +#endif + } + + bool rdo_bc_encoder::init_encoder_params() + { + bc7enc_compress_block_params_init(&m_bc7enc_pack_params); + if (!m_params.m_perceptual) + 
bc7enc_compress_block_params_init_linear_weights(&m_bc7enc_pack_params); + m_bc7enc_pack_params.m_max_partitions = m_params.m_bc7enc_max_partitions_to_scan; + m_bc7enc_pack_params.m_uber_level = std::min(BC7ENC_MAX_UBER_LEVEL, m_params.m_bc7_uber_level); + + if (m_params.m_bc7enc_mode6_only) + m_bc7enc_pack_params.m_mode_mask = 1 << 6; + + if ((m_params.m_dxgi_format == DXGI_FORMAT_BC7_UNORM) && (m_params.m_rdo_lambda > 0.0f)) + { + // Slam off perceptual in RDO mode - we don't support it (too slow). + m_params.m_perceptual = false; + m_bc7enc_pack_params.m_perceptual = false; + bc7enc_compress_block_params_init_linear_weights(&m_bc7enc_pack_params); + } + + if ((m_params.m_dxgi_format == DXGI_FORMAT_BC7_UNORM) && (m_params.m_bc7enc_reduce_entropy)) + { + // Configure the BC7 encoder with some decent parameters for later RDO post-processing. + // Textures with alpha are harder for BC7 to handle, so we use more conservative defaults. + + m_bc7enc_pack_params.m_mode17_partition_estimation_filterbank = false; + + if (m_params.m_bc7enc_rdo_bc7_weight_modes) + { + // Weight modes 5 and especially 6 more highly than the other modes. + if (m_has_alpha) + { + m_bc7enc_pack_params.m_mode5_error_weight = .7f; + m_bc7enc_pack_params.m_mode6_error_weight = .6f; + } + else + { + m_bc7enc_pack_params.m_mode6_error_weight = .4f; + } + } + + if (m_params.m_bc7enc_rdo_bc7_weight_low_frequency_partitions) + { + // Slightly prefer the lower frequency partition patterns. + m_bc7enc_pack_params.m_low_frequency_partition_weight = .9999f; + } + + if (m_params.m_bc7enc_rdo_bc7_quant_mode6_endpoints) + { + // As a good default, don't quantize mode 6 endpoints if the texture has alpha. This isn't required, but helps mask textures. + //if (!has_alpha) + m_bc7enc_pack_params.m_quant_mode6_endpoints = true; + } + + if (m_params.m_bc7enc_rdo_bc7_pbit1_weighting) + { + // Favor p-bit 0 vs. 
1, to slightly lower the entropy of output blocks with p-bits + m_bc7enc_pack_params.m_pbit1_weight = 1.3f; + } + } + +#if SUPPORT_BC7E + // Now initialize the BC7 compressor's parameters. + + memset(&m_bc7e_pack_params, 0, sizeof(m_bc7e_pack_params)); + switch (m_params.m_bc7_uber_level) + { + case 0: + ispc::bc7e_compress_block_params_init_ultrafast(&m_bc7e_pack_params, m_params.m_perceptual); + break; + case 1: + ispc::bc7e_compress_block_params_init_veryfast(&m_bc7e_pack_params, m_params.m_perceptual); + break; + case 2: + ispc::bc7e_compress_block_params_init_fast(&m_bc7e_pack_params, m_params.m_perceptual); + break; + case 3: + ispc::bc7e_compress_block_params_init_basic(&m_bc7e_pack_params, m_params.m_perceptual); + break; + case 4: + ispc::bc7e_compress_block_params_init_slow(&m_bc7e_pack_params, m_params.m_perceptual); + break; + case 5: + ispc::bc7e_compress_block_params_init_veryslow(&m_bc7e_pack_params, m_params.m_perceptual); + break; + case 6: + default: + ispc::bc7e_compress_block_params_init_slowest(&m_bc7e_pack_params, m_params.m_perceptual); + break; + } +#endif + + if (m_params.m_status_output) + { + if (m_params.m_dxgi_format == DXGI_FORMAT_BC7_UNORM) + { + if ((SUPPORT_BC7E) && (m_params.m_use_bc7e)) + printf("bc7e.ispc uber level: %u, perceptual: %u\n", m_params.m_bc7_uber_level, m_params.m_perceptual); + else + { + printf("\nbc7enc parameters:\n"); + m_bc7enc_pack_params.print(); + } + } + else + { + printf("BC1 level: %u, use 3-color mode: %u, use 3-color mode for black: %u, bc1_mode: %u\n", + m_params.m_bc1_quality_level, m_params.m_use_bc1_3color_mode, m_params.m_use_bc1_3color_mode_for_black, (int)m_params.m_bc1_mode); + } + + if ((m_params.m_dxgi_format == DXGI_FORMAT_BC3_UNORM) || (m_params.m_dxgi_format == DXGI_FORMAT_BC4_UNORM) || (m_params.m_dxgi_format == DXGI_FORMAT_BC5_UNORM)) + { + printf("Use high quality BC4 block encoder: %u, BC4 block radius: %u, use 6 value mode: %u, use 8 value mode: %u\n", + m_params.m_use_hq_bc345, 
m_params.m_bc345_search_rad, (m_params.m_bc345_mode_mask & 2) != 0, (m_params.m_bc345_mode_mask & 1) != 0); + } + + printf("\nrdo_bc_params:\n"); + printf(" Perceptual: %u\n", m_params.m_perceptual); + printf(" Y Flip: %u\n", m_params.m_y_flip); + printf(" DXGI format: 0x%X %s\n", m_params.m_dxgi_format, get_dxgi_format_string(m_params.m_dxgi_format)); + + printf("BC1-5 parameters:\n"); + printf(" BC45 channels: %u %u\n", m_params.m_bc45_channel0, m_params.m_bc45_channel1); + printf(" BC1 approximation mode: %u\n", (int)m_params.m_bc1_mode); + printf(" Use BC1 3-color mode: %u\n", m_params.m_use_bc1_3color_mode); + printf(" Use BC1 3-color mode for black: %u\n", m_params.m_use_bc1_3color_mode_for_black); + printf(" BC1 quality level: %u\n", m_params.m_bc1_quality_level); + printf(" Use HQ BC345: %u\n", m_params.m_use_hq_bc345); + printf(" BC345 search radius: %u\n", m_params.m_bc345_search_rad); + printf(" BC345 mode mask: 0x%X\n", m_params.m_bc345_mode_mask); + + printf("BC7 parameters:\n"); + printf(" Use bc7e: %u\n", m_params.m_use_bc7e); + printf(" BC7 uber level: %u\n", m_params.m_bc7_uber_level); + + printf("RDO parameters:\n"); + printf(" Lambda: %f\n", m_params.m_rdo_lambda); + printf(" Lookback window size: %u\n", m_params.m_lookback_window_size); + printf(" Custom lookback window size: %u\n", m_params.m_custom_lookback_window_size); + printf(" Try 2 matches: %u\n", m_params.m_rdo_try_2_matches); + printf(" Smooth block error scale: %f\n", m_params.m_rdo_smooth_block_error_scale); + printf(" Custom RDO smooth block error scale: %u\n", m_params.m_custom_rdo_smooth_block_error_scale); + printf(" Max smooth block std dev: %f\n", m_params.m_rdo_max_smooth_block_std_dev); + printf(" Allow relative movement: %u\n", m_params.m_rdo_allow_relative_movement); + printf(" Ultrasmooth block handling: %u\n", m_params.m_rdo_ultrasmooth_block_handling); + printf(" Multithreading: %u, max threads: %u\n", m_params.m_rdo_multithreading, m_params.m_rdo_max_threads); + + 
printf("bc7enc parameters:\n"); + printf(" Mode 6 only: %u\n", m_params.m_bc7enc_mode6_only); + printf(" Max partitions to scan: %u\n", m_params.m_bc7enc_max_partitions_to_scan); + printf(" Quant mode 6 endpoints: %u\n", m_params.m_bc7enc_rdo_bc7_quant_mode6_endpoints); + printf(" Weight modes: %u\n", m_params.m_bc7enc_rdo_bc7_weight_modes); + printf(" Weight low freq partitions: %u\n", m_params.m_bc7enc_rdo_bc7_weight_low_frequency_partitions); + printf(" P-bit1 weighting: %u\n", m_params.m_bc7enc_rdo_bc7_pbit1_weighting); + printf(" Reduce entropy mode: %u\n", m_params.m_bc7enc_reduce_entropy); + printf("\n"); + } + + return true; + } + + bool rdo_bc_encoder::init_source_image() + { + switch (m_params.m_dxgi_format) + { + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC4_UNORM: + m_pixel_format_bpp = 4; + break; + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC7_UNORM: + m_pixel_format_bpp = 8; + break; + default: + return false; + } + + m_bytes_per_block = (16 * m_pixel_format_bpp) / 8; + assert((m_bytes_per_block == 8) || (m_bytes_per_block == 16)); + + m_source_image = *m_pOrig_source_image; + + m_orig_width = m_source_image.width(); + m_orig_height = m_source_image.height(); + + if (m_params.m_y_flip) + { + utils::image_u8 temp; + temp.init(m_orig_width, m_orig_height); + + for (uint32_t y = 0; y < m_orig_height; y++) + for (uint32_t x = 0; x < m_orig_width; x++) + temp(x, (m_orig_height - 1) - y) = m_source_image(x, y); + + temp.swap(m_source_image); + } + + m_source_image.crop_dup_borders((m_source_image.width() + 3) & ~3, (m_source_image.height() + 3) & ~3); + + m_blocks_x = m_source_image.width() / 4; + m_blocks_y = m_source_image.height() / 4; + m_total_blocks = m_blocks_x * m_blocks_y; + m_total_texels = m_total_blocks * 16; + + bool has_alpha = false; + for (int by = 0; by < ((int)m_blocks_y) && !has_alpha; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + color_quad_u8 pixels[16]; + 
m_source_image.get_block(bx, by, 4, 4, pixels); + + for (uint32_t i = 0; i < 16; i++) + { + if (pixels[i].m_c[3] < 255) + { + has_alpha = true; + break; + } + } + } + } + + if (m_pixel_format_bpp == 8) + m_packed_image16.resize(m_total_blocks); + else + m_packed_image8.resize(m_total_blocks); + + return true; + } + + bool rdo_bc_encoder::encode_texture() + { + clock_t start_t = clock(); + + uint32_t bc7_mode_hist[8]; + memset(bc7_mode_hist, 0, sizeof(bc7_mode_hist)); + +#if SUPPORT_BC7E + if ((m_params.m_dxgi_format == DXGI_FORMAT_BC7_UNORM) && (m_params.m_use_bc7e)) + { + if (m_params.m_status_output) + printf("Using bc7e: "); + +#pragma omp parallel for + for (int32_t by = 0; by < static_cast(m_blocks_y); by++) + { + // Process 64 blocks at a time, for efficient SIMD processing. + // Ideally, N >= 8 (or more) and (N % 8) == 0. + const int N = 64; + + for (uint32_t bx = 0; bx < m_blocks_x; bx += N) + { + const uint32_t num_blocks_to_process = std::min(m_blocks_x - bx, N); + + color_quad_u8 pixels[16 * N]; + + // Extract num_blocks_to_process 4x4 pixel blocks from the source image and put them into the pixels[] array. + for (uint32_t b = 0; b < num_blocks_to_process; b++) + m_source_image.get_block(bx + b, by, 4, 4, pixels + b * 16); + + // Compress the blocks to BC7. + // Note: If you've used Intel's ispc_texcomp, the input pixels are different. BC7E requires a pointer to an array of 16 pixels for each block. 
+ block16* pBlock = &m_packed_image16[bx + by * m_blocks_x]; + ispc::bc7e_compress_blocks(num_blocks_to_process, reinterpret_cast(pBlock), reinterpret_cast(pixels), &m_bc7e_pack_params); + } + + if (m_params.m_status_output) + { + if ((by & 63) == 0) + printf("."); + } + } + + for (int by = 0; by < (int)m_blocks_y; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + block16* pBlock = &m_packed_image16[bx + by * m_blocks_x]; + + uint32_t mode = ((uint8_t*)pBlock)[0]; + for (uint32_t m = 0; m <= 7; m++) + { + if (mode & (1 << m)) + { + bc7_mode_hist[m]++; + break; + } + } + } + } + } + else +#endif + { +#pragma omp parallel for + for (int by = 0; by < (int)m_blocks_y; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + color_quad_u8 pixels[16]; + + m_source_image.get_block(bx, by, 4, 4, pixels); + + switch (m_params.m_dxgi_format) + { + case DXGI_FORMAT_BC1_UNORM: + { + block8* pBlock = &m_packed_image8[bx + by * m_blocks_x]; + + rgbcx::encode_bc1(m_params.m_bc1_quality_level, pBlock, &pixels[0].m_c[0], m_params.m_use_bc1_3color_mode, m_params.m_use_bc1_3color_mode_for_black); + break; + } + case DXGI_FORMAT_BC3_UNORM: + { + block16* pBlock = &m_packed_image16[bx + by * m_blocks_x]; + + if (m_params.m_use_hq_bc345) + rgbcx::encode_bc3_hq(m_params.m_bc1_quality_level, pBlock, &pixels[0].m_c[0], m_params.m_bc345_search_rad, m_params.m_bc345_mode_mask); + else + rgbcx::encode_bc3(m_params.m_bc1_quality_level, pBlock, &pixels[0].m_c[0]); + break; + } + case DXGI_FORMAT_BC4_UNORM: + { + block8* pBlock = &m_packed_image8[bx + by * m_blocks_x]; + + if (m_params.m_use_hq_bc345) + rgbcx::encode_bc4_hq(pBlock, &pixels[0].m_c[m_params.m_bc45_channel0], 4, m_params.m_bc345_search_rad, m_params.m_bc345_mode_mask); + else + rgbcx::encode_bc4(pBlock, &pixels[0].m_c[m_params.m_bc45_channel0], 4); + break; + } + case DXGI_FORMAT_BC5_UNORM: + { + block16* pBlock = &m_packed_image16[bx + by * m_blocks_x]; + + if (m_params.m_use_hq_bc345) + 
rgbcx::encode_bc5_hq(pBlock, &pixels[0].m_c[0], m_params.m_bc45_channel0, m_params.m_bc45_channel1, 4, m_params.m_bc345_search_rad, m_params.m_bc345_mode_mask); + else + rgbcx::encode_bc5(pBlock, &pixels[0].m_c[0], m_params.m_bc45_channel0, m_params.m_bc45_channel1, 4); + break; + } + case DXGI_FORMAT_BC7_UNORM: + { + block16* pBlock = &m_packed_image16[bx + by * m_blocks_x]; + + bc7enc_compress_block(pBlock, pixels, &m_bc7enc_pack_params); + +#pragma omp critical + { + uint32_t mode = ((uint8_t*)pBlock)[0]; + for (uint32_t m = 0; m <= 7; m++) + { + if (mode & (1 << m)) + { + bc7_mode_hist[m]++; + break; + } + } + } + + break; + } + default: + { + assert(0); + break; + } + } + } + + if (m_params.m_status_output) + { + if ((by & 127) == 0) + printf("."); + } + } + } + + clock_t end_t = clock(); + + if (m_params.m_status_output) + { + printf("\nTotal encoding time: %f secs\n", (double)(end_t - start_t) / CLOCKS_PER_SEC); + + if (m_params.m_dxgi_format == DXGI_FORMAT_BC7_UNORM) + { + printf("BC7 mode histogram:\n"); + for (uint32_t i = 0; i < 8; i++) + printf("%u: %u\n", i, bc7_mode_hist[i]); + } + } + + return true; + } + + bool rdo_bc_encoder::postprocess_rdo() + { + m_prerdo_packed_image8 = m_packed_image8; + m_prerdo_packed_image16 = m_packed_image16; + + // Post-process the data with Rate Distortion Optimization + if (m_params.m_rdo_lambda <= 0.0f) + return true; + + const uint32_t MIN_RDO_MULTITHREADING_BLOCKS = 4096; + const int rdo_total_threads = (m_params.m_rdo_multithreading && (m_params.m_rdo_max_threads > 1) && (m_total_blocks >= MIN_RDO_MULTITHREADING_BLOCKS)) ? m_params.m_rdo_max_threads : 1; + + if (m_params.m_status_output) + printf("rdo_total_threads: %u\n", rdo_total_threads); + + int blocks_remaining = m_total_blocks, cur_block_index = 0; + std::vector blocks_to_do(rdo_total_threads), first_block_index(rdo_total_threads); + for (int p = 0; p < rdo_total_threads; p++) + { + const int num_blocks = (p == (rdo_total_threads - 1)) ? 
blocks_remaining : (m_total_blocks / rdo_total_threads); + + blocks_to_do[p] = num_blocks; + first_block_index[p] = cur_block_index; + + cur_block_index += num_blocks; + blocks_remaining -= num_blocks; + } + + assert(!blocks_remaining && cur_block_index == (int)m_total_blocks); + + ert::reduce_entropy_params ert_p; + + ert_p.m_lambda = m_params.m_rdo_lambda; + ert_p.m_lookback_window_size = m_params.m_lookback_window_size; + ert_p.m_smooth_block_max_mse_scale = m_params.m_rdo_smooth_block_error_scale; + ert_p.m_max_smooth_block_std_dev = m_params.m_rdo_max_smooth_block_std_dev; + ert_p.m_debug_output = m_params.m_rdo_debug_output; + ert_p.m_try_two_matches = m_params.m_rdo_try_2_matches; + ert_p.m_allow_relative_movement = m_params.m_rdo_allow_relative_movement; + ert_p.m_skip_zero_mse_blocks = false; + + std::vector block_rgb_mse_scales(compute_block_mse_scales(m_source_image, m_blocks_x, m_blocks_y, m_total_blocks, m_params.m_rdo_debug_output)); + + std::vector block_pixels(m_total_blocks * 16); + + for (uint32_t by = 0; by < m_blocks_y; by++) + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + m_source_image.get_block(bx, by, 4, 4, (color_quad_u8*)&block_pixels[(bx + by * m_blocks_x) * 16]); + + unpacker_funcs block_unpackers; + block_unpackers.m_allow_3color_mode = m_params.m_use_bc1_3color_mode; + block_unpackers.m_use_bc1_3color_mode_for_black = m_params.m_use_bc1_3color_mode_for_black; + block_unpackers.m_mode = m_params.m_bc1_mode; + + if (m_params.m_dxgi_format == DXGI_FORMAT_BC7_UNORM) + { + ert_p.m_lookback_window_size = std::max(16U, m_params.m_lookback_window_size); + + // BC7 RDO + const uint32_t NUM_COMPONENTS = 4; + + if (!m_params.m_custom_rdo_smooth_block_error_scale) + { + // Attempt to compute a decent conservative smooth block MSE max scaling factor. + // No single smooth block scale setting can work for all textures (unless it's ridiuclously large, killing efficiency). 
+ ert_p.m_smooth_block_max_mse_scale = lerp(15.0f, 50.0f, std::min(1.0f, ert_p.m_lambda / 4.0f)); + + if (m_params.m_status_output) + printf("Using an automatically computed smooth block error scale of %f (use -zb# to override)\n", ert_p.m_smooth_block_max_mse_scale); + } + + for (uint32_t by = 0; by < m_blocks_y; by++) + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + float& s = block_rgb_mse_scales[bx + by * m_blocks_x]; + if (s > 0.0f) + s = std::max(ert_p.m_smooth_block_max_mse_scale, s * std::min(ert_p.m_lambda, 3.0f)); + } + + if (m_params.m_status_output) + { + printf("\nERT parameters:\n"); + ert_p.print(); + printf("\n"); + } + + uint32_t total_modified = 0; + + clock_t rdo_start_t = clock(); + +#pragma omp parallel for + for (int p = 0; p < rdo_total_threads; p++) + { + const int first_block_to_encode = first_block_index[p]; + const int num_blocks_to_encode = blocks_to_do[p]; + if (!num_blocks_to_encode) + continue; + + uint32_t total_modified_local = 0; + + std::vector local_block_rgb_mse_scales(num_blocks_to_encode); + for (int i = 0; i < num_blocks_to_encode; i++) + local_block_rgb_mse_scales[i] = block_rgb_mse_scales[first_block_to_encode + i]; + + ert::reduce_entropy(&m_packed_image16[first_block_to_encode], num_blocks_to_encode, + 16, 16, 4, 4, NUM_COMPONENTS, + (ert::color_rgba*)&block_pixels[16 * first_block_to_encode], ert_p, total_modified_local, + unpacker_funcs::unpack_bc7_block, &block_unpackers, + m_params.m_rdo_ultrasmooth_block_handling ? 
&local_block_rgb_mse_scales : nullptr); + +#pragma omp critical + { + total_modified += total_modified_local; + } + } // p + + clock_t rdo_end_t = clock(); + + if (m_params.m_status_output) + { + printf("Total RDO time: %f secs\n", (double)(rdo_end_t - rdo_start_t) / CLOCKS_PER_SEC); + + printf("Total blocks modified: %u %3.2f%%\n", total_modified, total_modified * 100.0f / m_total_blocks); + + uint32_t bc7_mode_hist[8]; + memset(bc7_mode_hist, 0, sizeof(bc7_mode_hist)); + + for (int by = 0; by < (int)m_blocks_y; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + block16* pBlock = &m_packed_image16[bx + by * m_blocks_x]; + + const uint32_t mode_byte = ((uint8_t*)pBlock)[0]; + + uint32_t m; + for (m = 0; m <= 7; m++) + { + if (mode_byte & (1 << m)) + { + bc7_mode_hist[m]++; + break; + } + } + assert(m != 8); + } + } + + printf("BC7 mode histogram:\n"); + for (uint32_t i = 0; i < 8; i++) + printf("%u: %u\n", i, bc7_mode_hist[i]); + } + } + else if (m_params.m_dxgi_format == DXGI_FORMAT_BC5_UNORM) + { + // BC5 RDO - One BC4 block for R followed by one BC4 block for G + + ert_p.m_lookback_window_size = std::max(16U, m_params.m_lookback_window_size); + + std::vector block_pixels_r(m_total_blocks * 16), block_pixels_g(m_total_blocks * 16); + + for (uint32_t by = 0; by < m_blocks_y; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + color_quad_u8 orig_block[16]; + m_source_image.get_block(bx, by, 4, 4, orig_block); + + color_quad_u8* pDst_block_r = (color_quad_u8*)&block_pixels_r[(bx + by * m_blocks_x) * 16]; + color_quad_u8* pDst_block_g = (color_quad_u8*)&block_pixels_g[(bx + by * m_blocks_x) * 16]; + + for (uint32_t i = 0; i < 16; i++) + { + pDst_block_r[i].set(orig_block[i].r, 0, 0, 0); + pDst_block_g[i].set(orig_block[i].g, 0, 0, 0); + } + } + } + + const uint32_t NUM_COMPONENTS = 1; + + ert_p.m_color_weights[1] = 0; + ert_p.m_color_weights[2] = 0; + ert_p.m_color_weights[3] = 0; + + if (!m_params.m_custom_rdo_smooth_block_error_scale) + { + 
// Attempt to compute a decent conservative smooth block MSE max scaling factor. + // No single smooth block scale setting can work for all textures (unless it's ridiuclously large, killing efficiency). + ert_p.m_smooth_block_max_mse_scale = lerp(10.0f, 30.0f, std::min(1.0f, ert_p.m_lambda / 4.0f)); + + if (m_params.m_status_output) + printf("Using an automatically computed smooth block error scale of %f (use -zb# to override)\n", ert_p.m_smooth_block_max_mse_scale); + } + + if (m_params.m_status_output) + { + printf("\nERT parameters:\n"); + ert_p.print(); + printf("\n"); + } + + uint32_t total_modified_r = 0, total_modified_g = 0; + + clock_t rdo_start_t = clock(); + +#pragma omp parallel for + for (int p = 0; p < rdo_total_threads; p++) + { + const int first_block_to_encode = first_block_index[p]; + const int num_blocks_to_encode = blocks_to_do[p]; + if (!num_blocks_to_encode) + continue; + + uint32_t total_modified_local_r = 0, total_modified_local_g = 0; + + ert::reduce_entropy(&m_packed_image16[first_block_to_encode], num_blocks_to_encode, + 2 * sizeof(rgbcx::bc4_block), sizeof(rgbcx::bc4_block), 4, 4, NUM_COMPONENTS, + (ert::color_rgba*)&block_pixels_r[16 * first_block_to_encode], ert_p, total_modified_local_r, + unpacker_funcs::unpack_bc4_block, &block_unpackers); + + ert::reduce_entropy((uint8_t*)&m_packed_image16[first_block_to_encode] + sizeof(rgbcx::bc4_block), num_blocks_to_encode, + 2 * sizeof(rgbcx::bc4_block), sizeof(rgbcx::bc4_block), 4, 4, NUM_COMPONENTS, + (ert::color_rgba*)&block_pixels_g[16 * first_block_to_encode], ert_p, total_modified_local_g, + unpacker_funcs::unpack_bc4_block, &block_unpackers); + +#pragma omp critical + { + total_modified_r += total_modified_local_r; + total_modified_g += total_modified_local_g; + } + } // p + + clock_t rdo_end_t = clock(); + + if (m_params.m_status_output) + { + printf("Total RDO time: %f secs\n", (double)(rdo_end_t - rdo_start_t) / CLOCKS_PER_SEC); + + printf("Total blocks modified R: %u %3.2f%%\n", 
total_modified_r, total_modified_r * 100.0f / m_total_blocks); + printf("Total blocks modified G: %u %3.2f%%\n", total_modified_g, total_modified_g * 100.0f / m_total_blocks); + } + } + else if (m_params.m_dxgi_format == DXGI_FORMAT_BC4_UNORM) + { + // BC4 RDO - One BC4 block for R + + const uint32_t NUM_COMPONENTS = 1; + + ert_p.m_color_weights[1] = 0; + ert_p.m_color_weights[2] = 0; + ert_p.m_color_weights[3] = 0; + + if (!m_params.m_custom_rdo_smooth_block_error_scale) + { + // Attempt to compute a decent conservative smooth block MSE max scaling factor. + // No single smooth block scale setting can work for all textures (unless it's ridiuclously large, killing efficiency). + ert_p.m_smooth_block_max_mse_scale = lerp(10.0f, 30.0f, std::min(1.0f, ert_p.m_lambda / 4.0f)); + + if (m_params.m_status_output) + printf("Using an automatically computed smooth block error scale of %f (use -zb# to override)\n", ert_p.m_smooth_block_max_mse_scale); + } + + if (m_params.m_status_output) + { + printf("\nERT parameters:\n"); + ert_p.print(); + printf("\n"); + } + + uint32_t total_modified = 0; + + clock_t rdo_start_t = clock(); + +#pragma omp parallel for + for (int p = 0; p < rdo_total_threads; p++) + { + const int first_block_to_encode = first_block_index[p]; + const int num_blocks_to_encode = blocks_to_do[p]; + if (!num_blocks_to_encode) + continue; + + uint32_t total_modified_local = 0; + + ert::reduce_entropy(&m_packed_image8[first_block_to_encode], num_blocks_to_encode, + sizeof(rgbcx::bc4_block), sizeof(rgbcx::bc4_block), 4, 4, NUM_COMPONENTS, + (ert::color_rgba*)&block_pixels[16 * first_block_to_encode], ert_p, total_modified_local, + unpacker_funcs::unpack_bc4_block, &block_unpackers); + +#pragma omp critical + { + total_modified += total_modified_local; + } + } // p + + clock_t rdo_end_t = clock(); + + if (m_params.m_status_output) + { + printf("Total RDO time: %f secs\n", (double)(rdo_end_t - rdo_start_t) / CLOCKS_PER_SEC); + + printf("Total blocks modified: %u 
%3.2f%%\n", total_modified, total_modified * 100.0f / m_total_blocks); + } + } + else if (m_params.m_dxgi_format == DXGI_FORMAT_BC1_UNORM) + { + // BC1 RDO - One BC1 block + const uint32_t NUM_COMPONENTS = 3; + + ert_p.m_color_weights[3] = 0; + + if (!m_params.m_custom_rdo_smooth_block_error_scale) + { + // This is just a hack - no single setting can work for all textures. + ert_p.m_smooth_block_max_mse_scale = lerp(15.0f, 50.0f, std::min(1.0f, ert_p.m_lambda / 8.0f)); + + if (m_params.m_status_output) + printf("Using an automatically computed smooth block error scale of %f (use -zb# to override)\n", ert_p.m_smooth_block_max_mse_scale); + } + + for (uint32_t by = 0; by < m_blocks_y; by++) + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + float& s = block_rgb_mse_scales[bx + by * m_blocks_x]; + if (s > 0.0f) + s = std::max(ert_p.m_smooth_block_max_mse_scale, s * std::min(ert_p.m_lambda, 3.0f)); + } + + printf("\nERT parameters:\n"); + ert_p.print(); + printf("\n"); + + uint32_t total_modified = 0; + + clock_t rdo_start_t = clock(); + +#pragma omp parallel for + for (int p = 0; p < rdo_total_threads; p++) + { + const int first_block_to_encode = first_block_index[p]; + const int num_blocks_to_encode = blocks_to_do[p]; + if (!num_blocks_to_encode) + continue; + + uint32_t total_modified_local = 0; + + std::vector local_block_rgb_mse_scales(num_blocks_to_encode); + for (int i = 0; i < num_blocks_to_encode; i++) + local_block_rgb_mse_scales[i] = block_rgb_mse_scales[first_block_to_encode + i]; + + ert::reduce_entropy(&m_packed_image8[first_block_to_encode], num_blocks_to_encode, + sizeof(rgbcx::bc1_block), sizeof(rgbcx::bc1_block), 4, 4, NUM_COMPONENTS, + (ert::color_rgba*)&block_pixels[16 * first_block_to_encode], ert_p, total_modified_local, + unpacker_funcs::unpack_bc1_block, &block_unpackers, + m_params.m_rdo_ultrasmooth_block_handling ? 
&local_block_rgb_mse_scales : nullptr); + +#pragma omp critical + { + total_modified += total_modified_local; + } + } // p + + clock_t rdo_end_t = clock(); + + if (m_params.m_status_output) + { + printf("Total RDO time: %f secs\n", (double)(rdo_end_t - rdo_start_t) / CLOCKS_PER_SEC); + + printf("Total blocks modified: %u %3.2f%%\n", + total_modified, total_modified * 100.0f / m_total_blocks); + } + } + else if (m_params.m_dxgi_format == DXGI_FORMAT_BC3_UNORM) + { + // BC3 RDO - One BC4 block followed by one BC1 block + + ert_p.m_lookback_window_size = std::max(16U, m_params.m_lookback_window_size); + + std::vector block_pixels_a(m_total_blocks * 16); + + for (uint32_t by = 0; by < m_blocks_y; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + color_quad_u8 orig_block[16]; + m_source_image.get_block(bx, by, 4, 4, orig_block); + + color_quad_u8* pDst_block_a = (color_quad_u8*)&block_pixels_a[(bx + by * m_blocks_x) * 16]; + for (uint32_t i = 0; i < 16; i++) + pDst_block_a[i].set(orig_block[i].a, 0, 0, 0); + } + } + + ert_p.m_color_weights[3] = 0; + + ert::reduce_entropy_params ert_alpha_p(ert_p); + ert_alpha_p.m_color_weights[1] = 0; + ert_alpha_p.m_color_weights[2] = 0; + ert_alpha_p.m_color_weights[3] = 0; + + if (!m_params.m_custom_rdo_smooth_block_error_scale) + { + // This is just a hack - no single setting can work for all textures. 
+ ert_p.m_smooth_block_max_mse_scale = lerp(15.0f, 50.0f, std::min(1.0f, ert_p.m_lambda / 8.0f)); + + if (m_params.m_status_output) + printf("Using an automatically computed smooth block error scale of %f (use -zb# to override) for RGB\n", ert_p.m_smooth_block_max_mse_scale); + + ert_alpha_p.m_smooth_block_max_mse_scale = lerp(10.0f, 30.0f, std::min(1.0f, ert_alpha_p.m_lambda / 4.0f)); + + if (m_params.m_status_output) + printf("Using an automatically computed smooth block error scale of %f for Alpha\n", ert_alpha_p.m_smooth_block_max_mse_scale); + } + + for (uint32_t by = 0; by < m_blocks_y; by++) + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + float& s = block_rgb_mse_scales[bx + by * m_blocks_x]; + if (s > 0.0f) + s = std::max(ert_p.m_smooth_block_max_mse_scale, s * std::min(ert_p.m_lambda, 3.0f)); + } + + if (m_params.m_status_output) + { + printf("\nERT RGB parameters:\n"); + ert_p.print(); + + printf("\nERT Alpha parameters:\n"); + ert_alpha_p.print(); + printf("\n"); + } + + uint32_t total_modified_rgb = 0, total_modified_alpha = 0; + + block_unpackers.m_allow_3color_mode = false; + block_unpackers.m_use_bc1_3color_mode_for_black = false; + + clock_t rdo_start_t = clock(); + +#pragma omp parallel for + for (int p = 0; p < rdo_total_threads; p++) + { + const int first_block_to_encode = first_block_index[p]; + const int num_blocks_to_encode = blocks_to_do[p]; + if (!num_blocks_to_encode) + continue; + + uint32_t total_modified_local_rgb = 0, total_modified_local_alpha = 0; + + ert::reduce_entropy((uint8_t*)&m_packed_image16[first_block_to_encode], num_blocks_to_encode, + sizeof(rgbcx::bc1_block) * 2, sizeof(rgbcx::bc4_block), 4, 4, 1, + (ert::color_rgba*)&block_pixels_a[16 * first_block_to_encode], ert_alpha_p, total_modified_local_alpha, + unpacker_funcs::unpack_bc4_block, &block_unpackers); + + std::vector local_block_rgb_mse_scales(num_blocks_to_encode); + for (int i = 0; i < num_blocks_to_encode; i++) + local_block_rgb_mse_scales[i] = 
block_rgb_mse_scales[first_block_to_encode + i];
+
+                ert::reduce_entropy((uint8_t*)&m_packed_image16[first_block_to_encode] + sizeof(rgbcx::bc1_block), num_blocks_to_encode,
+                    sizeof(rgbcx::bc1_block) * 2, sizeof(rgbcx::bc1_block), 4, 4, 3,
+                    (ert::color_rgba*)&block_pixels[16 * first_block_to_encode], ert_p, total_modified_local_rgb,
+                    unpacker_funcs::unpack_bc1_block, &block_unpackers,
+                    m_params.m_rdo_ultrasmooth_block_handling ? &local_block_rgb_mse_scales : nullptr);
+
+#pragma omp critical
+                {
+                    total_modified_rgb += total_modified_local_rgb;
+                    total_modified_alpha += total_modified_local_alpha;
+                }
+            } // p
+
+            clock_t rdo_end_t = clock();
+
+            if (m_params.m_status_output)
+            {
+                printf("Total RDO time: %f secs\n", (double)(rdo_end_t - rdo_start_t) / CLOCKS_PER_SEC);
+
+                printf("Total RGB blocks modified: %u %3.2f%%\n", total_modified_rgb, total_modified_rgb * 100.0f / m_total_blocks);
+                printf("Total Alpha blocks modified: %u %3.2f%%\n", total_modified_alpha, total_modified_alpha * 100.0f / m_total_blocks);
+            }
+        }
+
+        return true;
+    }
+
+    // Decodes every packed block into 4x4 RGBA texels and writes them to unpacked_image,
+    // which is re-initialized to (get_blocks_x() * 4) x (get_blocks_y() * 4) pixels.
+    //
+    // Handles the BC1, BC3, BC4, BC5 and BC7 UNORM formats. Any other m_dxgi_format hits
+    // assert(0); with asserts compiled out the block keeps its initial value (0, 0, 0, 255).
+    // BC7 blocks are decoded with both bc7decomp and bc7decomp_ref and compared with memcmp().
+    //
+    // Returns false if any BC7 block failed to decode or the two BC7 decoders disagreed,
+    // true otherwise. BC1/BC3 sanity problems only produce warnings on stderr.
+    bool rdo_bc_encoder::unpack_blocks(image_u8& unpacked_image) const
+    {
+        unpacked_image.init(get_blocks_x() * 4, get_blocks_y() * 4);
+
+        // NOTE(review): the three flags below are written (only ever set to true) from inside
+        // the OpenMP-parallel loop without synchronization - confirm this is acceptable.
+        bool bc1_punchthrough_flag = false;
+        bool used_bc1_transparent_texels_for_black = false;
+
+        bool unpack_failed = false;
+
+#pragma omp parallel for
+        for (int by = 0; by < (int)get_blocks_y(); by++)
+        {
+            for (uint32_t bx = 0; bx < get_blocks_x(); bx++)
+            {
+                // Blocks are stored row-major, get_bytes_per_block() bytes each.
+                const void* pBlock = (const uint8_t*)get_blocks() + (bx + by * get_blocks_x()) * get_bytes_per_block();
+
+                // Start from opaque black so channels a decoder does not write stay defined.
+                color_quad_u8 unpacked_pixels[16];
+                for (uint32_t i = 0; i < 16; i++)
+                    unpacked_pixels[i].set(0, 0, 0, 255);
+
+                switch (m_params.m_dxgi_format)
+                {
+                case DXGI_FORMAT_BC1_UNORM:
+                {
+                    const bool used_punchthrough = rgbcx::unpack_bc1(pBlock, unpacked_pixels, true, m_params.m_bc1_mode);
+
+                    if (used_punchthrough)
+                    {
+                        bc1_punchthrough_flag = true;
+
+                        const rgbcx::bc1_block* pBC1_block = (const rgbcx::bc1_block*)pBlock;
+
+                        // Selector 3 in a punchthrough block is the transparent/black texel.
+                        for (uint32_t y = 0; y < 4; y++)
+                            for (uint32_t x = 0; x < 4; x++)
+                                if (pBC1_block->get_selector(x, y) == 3)
+                                    used_bc1_transparent_texels_for_black = true;
+                    }
+
+                    break;
+                }
+                case DXGI_FORMAT_BC3_UNORM:
+                {
+                    // A false return is recorded in the same flag and reported as 3-color mode below.
+                    if (!rgbcx::unpack_bc3(pBlock, unpacked_pixels, m_params.m_bc1_mode))
+                        bc1_punchthrough_flag = true;
+                    break;
+                }
+                case DXGI_FORMAT_BC4_UNORM:
+                {
+                    rgbcx::unpack_bc4(pBlock, &unpacked_pixels[0][0], 4);
+
+#if DECODE_BC4_TO_GRAYSCALE
+                    for (uint32_t i = 0; i < 16; i++)
+                    {
+                        unpacked_pixels[i][1] = unpacked_pixels[i][0];
+                        unpacked_pixels[i][2] = unpacked_pixels[i][0];
+                    }
+#endif
+                    break;
+                }
+                case DXGI_FORMAT_BC5_UNORM:
+                {
+                    rgbcx::unpack_bc5(pBlock, &unpacked_pixels[0][0], 0, 1, 4);
+                    break;
+                }
+                case DXGI_FORMAT_BC7_UNORM:
+                {
+                    if (!bc7decomp::unpack_bc7((const uint8_t*)pBlock, (bc7decomp::color_rgba*)unpacked_pixels))
+                    {
+                        fprintf(stderr, "bc7decomp::unpack_bc7() failed!\n");
+                        unpack_failed = true;
+                    }
+
+                    // Now unpack the block using the non-SSE reference decoder, to make sure we get the same exact unpacked bits.
+                    color_quad_u8 unpacked_pixels_ref[16];
+                    if (!bc7decomp_ref::unpack_bc7((const uint8_t*)pBlock, (bc7decomp::color_rgba*)unpacked_pixels_ref))
+                    {
+                        fprintf(stderr, "bc7decomp::unpack_bc7_ref() failed!\n");
+                        unpack_failed = true;
+                    }
+
+                    if (memcmp(unpacked_pixels, unpacked_pixels_ref, sizeof(unpacked_pixels)) != 0)
+                    {
+                        fprintf(stderr, "BC7 unpack verification failed!\n");
+                        unpack_failed = true;
+                    }
+
+                    break;
+                }
+                default:
+                    assert(0);
+                    break;
+                }
+
+                unpacked_image.set_block(bx, by, 4, 4, unpacked_pixels);
+            } // bx
+        } // by
+
+        if (unpack_failed)
+            return false;
+
+        // Sanity check the BC1/BC3 output
+        if (m_params.m_dxgi_format == DXGI_FORMAT_BC3_UNORM)
+        {
+            if (bc1_punchthrough_flag)
+                fprintf(stderr, "WARNING: BC3 mode selected, but rgbcx::unpack_bc3() returned one or more blocks using 3-color mode!\n");
+        }
+        else if (m_params.m_dxgi_format == DXGI_FORMAT_BC1_UNORM)
+        {
+            if ((bc1_punchthrough_flag) && (!m_params.m_use_bc1_3color_mode))
+                fprintf(stderr, "WARNING: BC1 output used 3-color mode, when this was disabled!\n");
+
+            // Compare against the encoder setting: testing the local flag against its own
+            // negation is always false, which would make this warning unreachable.
+            if ((used_bc1_transparent_texels_for_black) && (!m_params.m_use_bc1_3color_mode_for_black))
+                fprintf(stderr, "WARNING: BC1 output used the transparent selector for black, when this was disabled!\n");
+        }
+
+        if (m_params.m_status_output)
+        {
+            if ((m_params.m_dxgi_format == DXGI_FORMAT_BC1_UNORM) || (m_params.m_dxgi_format == DXGI_FORMAT_BC3_UNORM))
+                printf("Output used 3-color mode: %u, output used transparent texels for black: %u\n", bc1_punchthrough_flag, used_bc1_transparent_texels_for_black);
+        }
+
+        return true;
+    }
+
+} // namespace rdo_bc
diff --git a/libkram/bc7enc/rdo_bc_encoder.h b/libkram/bc7enc/rdo_bc_encoder.h
new file mode 100644
index 00000000..469211e1
--- /dev/null
+++ b/libkram/bc7enc/rdo_bc_encoder.h
@@ -0,0 +1,269 @@
+// rdo_bc_encoder.h
+#pragma once
+
+#ifndef SUPPORT_BC7E
+#define SUPPORT_BC7E 0
+#endif
+
+#include "utils.h"
+#include "ert.h"
+
+#include "bc7decomp.h"
+#include "rgbcx.h"
+
+#include "bc7enc.h"
+
+#if SUPPORT_BC7E
+#include "bc7e_ispc.h"
+#endif
+
+//#include "dds_defs.h"
+
+// Block-compressed texture format identifiers used by the encoder.
+// NOTE(review): the numeric values appear to mirror Direct3D's DXGI_FORMAT enumeration - confirm against dxgiformat.h.
+// TODO: code below doesn't handle srgb case
+enum DXGI_FORMAT
+{
+    DXGI_FORMAT_BC1_UNORM = 71,
+    DXGI_FORMAT_BC1_UNORM_SRGB = 72,
+    DXGI_FORMAT_BC2_UNORM = 74,
+    DXGI_FORMAT_BC2_UNORM_SRGB = 75,
+    DXGI_FORMAT_BC3_UNORM = 77,
+    DXGI_FORMAT_BC3_UNORM_SRGB = 78,
+    DXGI_FORMAT_BC4_UNORM = 80,
+    DXGI_FORMAT_BC4_SNORM = 81,
+    DXGI_FORMAT_BC5_UNORM = 83,
+    DXGI_FORMAT_BC5_SNORM = 84,
+    DXGI_FORMAT_BC6H_UF16 = 95,
+    DXGI_FORMAT_BC6H_SF16 = 96,
+    DXGI_FORMAT_BC7_UNORM = 98,
+    DXGI_FORMAT_BC7_UNORM_SRGB = 99,
+};
+
+namespace rdo_bc
+{
+
+    // Settings for rdo_bc_encoder. The constructor calls clear(), which assigns
+    // the default value of every member.
+    struct rdo_bc_params
+    {
+        rdo_bc_params()
+        {
+            clear();
+        }
+
+        // Resets every member to its default value.
+        void clear()
+        {
+            m_bc7_uber_level = 6; // BC7ENC_MAX_UBER_LEVEL;
+            m_bc7enc_max_partitions_to_scan = BC7ENC_MAX_PARTITIONS;
+            m_perceptual = false;
+            m_y_flip = false;
+            m_bc45_channel0 = 0;
+            m_bc45_channel1 = 1;
+
+            m_bc1_mode = rgbcx::bc1_approx_mode::cBC1Ideal;
+            m_use_bc1_3color_mode = true;
+
+            // We're just turning this on by default now, like NVDXT.EXE used to do back in the old original Xbox days.
+            m_use_bc1_3color_mode_for_black = true; // false;
+
+            m_bc1_quality_level = rgbcx::MAX_LEVEL;
+
+            m_dxgi_format = DXGI_FORMAT_BC7_UNORM;
+
+            m_rdo_lambda = 0.0f;
+            m_rdo_debug_output = false;
+            m_rdo_smooth_block_error_scale = 15.0f;
+            m_custom_rdo_smooth_block_error_scale = false;
+            m_lookback_window_size = 128;
+            m_custom_lookback_window_size = false;
+            m_bc7enc_rdo_bc7_quant_mode6_endpoints = true;
+            m_bc7enc_rdo_bc7_weight_modes = true;
+            m_bc7enc_rdo_bc7_weight_low_frequency_partitions = true;
+            m_bc7enc_rdo_bc7_pbit1_weighting = true;
+            m_rdo_max_smooth_block_std_dev = 18.0f;
+            m_rdo_allow_relative_movement = false;
+            m_rdo_try_2_matches = true;
+            m_rdo_ultrasmooth_block_handling = true;
+
+            m_use_hq_bc345 = true;
+            m_bc345_search_rad = 5;
+            m_bc345_mode_mask = rgbcx::BC4_USE_ALL_MODES;
+
+            m_bc7enc_mode6_only = false;
+            m_rdo_multithreading = true;
+
+            m_bc7enc_reduce_entropy = false;
+
+            m_use_bc7e = false;
+
+#if SUPPORT_BC7E
+            // By default, if they've compiled in BC7E.ispc, then use that. In a rate distortion sense it's better overall.
+            // https://richg42.blogspot.com/2021/02/average-rate-distortion-curves-for.html
+            m_use_bc7e = true;
+#endif
+
+            m_status_output = false;
+
+            m_rdo_max_threads = 128;
+        }
+
+        int m_bc7_uber_level;
+        int m_bc7enc_max_partitions_to_scan;
+        bool m_perceptual;
+        bool m_y_flip;
+        uint32_t m_bc45_channel0;
+        uint32_t m_bc45_channel1;
+
+        rgbcx::bc1_approx_mode m_bc1_mode; // forwarded to rgbcx::unpack_bc1()/unpack_bc3() when decoding
+        bool m_use_bc1_3color_mode; // when false, unpack_blocks() warns if BC1 output used 3-color mode
+
+        bool m_use_bc1_3color_mode_for_black; // defaults to true, see clear()
+
+        int m_bc1_quality_level;
+
+        DXGI_FORMAT m_dxgi_format; // output block format, defaults to DXGI_FORMAT_BC7_UNORM
+
+        float m_rdo_lambda;
+        bool m_rdo_debug_output;
+        float m_rdo_smooth_block_error_scale;
+        bool m_custom_rdo_smooth_block_error_scale; // when false, the BC1/BC3 RDO pass derives the smooth block scale from lambda
+        uint32_t m_lookback_window_size; // the BC3 RDO pass uses max(16, this value)
+        bool m_custom_lookback_window_size;
+        bool m_bc7enc_rdo_bc7_quant_mode6_endpoints;
+        bool m_bc7enc_rdo_bc7_weight_modes;
+        bool m_bc7enc_rdo_bc7_weight_low_frequency_partitions;
+        bool m_bc7enc_rdo_bc7_pbit1_weighting;
+        float m_rdo_max_smooth_block_std_dev;
+        bool m_rdo_allow_relative_movement;
+        bool m_rdo_try_2_matches;
+        bool m_rdo_ultrasmooth_block_handling; // when true, per-block MSE scales are passed to ert::reduce_entropy()
+
+        bool m_use_hq_bc345;
+        int m_bc345_search_rad;
+        uint32_t m_bc345_mode_mask;
+
+        bool m_bc7enc_mode6_only;
+        bool m_rdo_multithreading;
+
+        bool m_bc7enc_reduce_entropy;
+
+        bool m_use_bc7e;
+        bool m_status_output; // gates the informational printf() output
+
+        uint32_t m_rdo_max_threads;
+    };
+
+    // Block-compression encoder. Owns a copy of the parameters and source image plus the
+    // packed output blocks, and exposes them through the read-only accessors below.
+    // NOTE(review): presumably init() must succeed before encode() is called - confirm in rdo_bc_encoder.cpp.
+    class rdo_bc_encoder
+    {
+    public:
+        rdo_bc_encoder();
+
+        void clear();
+
+        bool init(const utils::image_u8& src_image, rdo_bc_params& params);
+        bool encode();
+
+        const rdo_bc_params &get_params() const { return m_params; }
+
+        const utils::image_u8* get_orig_source_image() const { return m_pOrig_source_image; }
+        const utils::image_u8& get_source_image() const { return m_source_image; }
+
+        // Both accessors return the *8 array's data when it is non-empty, otherwise the *16 array's data.
+        const void* get_prerdo_blocks() const { return m_prerdo_packed_image8.size() ? (void*)m_prerdo_packed_image8.data() : (void*)m_prerdo_packed_image16.data(); }
+        const void* get_blocks() const { return m_packed_image8.size() ? (void*)m_packed_image8.data() : (void*)m_packed_image16.data(); }
+
+        // Decodes the packed blocks into unpacked_image; returns false on a BC7 decode/verification failure.
+        bool unpack_blocks(utils::image_u8& unpacked_image) const;
+
+        DXGI_FORMAT get_pixel_format() const { return m_params.m_dxgi_format; }
+
+        uint32_t get_orig_width() const { return m_orig_width; }
+        uint32_t get_orig_height() const { return m_orig_height; }
+        uint32_t get_blocks_x() const { return m_blocks_x; }
+        uint32_t get_blocks_y() const { return m_blocks_y; }
+        uint32_t get_total_blocks() const { return m_total_blocks; }
+        uint32_t get_total_blocks_size_in_bytes() const { return m_total_blocks * m_bytes_per_block; } // 32-bit product
+        uint32_t get_bytes_per_block() const { return m_bytes_per_block; }
+        uint32_t get_pixel_format_bpp() const { return m_pixel_format_bpp; }
+        uint32_t get_total_texels() const { return m_total_texels; }
+        bool get_has_alpha() const { return m_has_alpha; }
+
+    private:
+        const utils::image_u8* m_pOrig_source_image;
+        utils::image_u8 m_source_image;
+        rdo_bc_params m_params;
+
+        uint32_t m_orig_width, m_orig_height;
+        uint32_t m_blocks_x, m_blocks_y, m_total_blocks, m_bytes_per_block, m_pixel_format_bpp;
+        uint32_t m_total_texels;
+        bool m_has_alpha;
+
+        utils::block8_vec m_packed_image8;
+        utils::block16_vec m_packed_image16;
+
+        utils::block8_vec m_prerdo_packed_image8;
+        utils::block16_vec m_prerdo_packed_image16;
+
+        bc7enc_compress_block_params m_bc7enc_pack_params;
+#if SUPPORT_BC7E
+        ispc::bc7e_compress_block_params m_bc7e_pack_params;
+#endif
+
+        void init_encoders();
+        bool init_source_image();
+        bool init_encoder_params();
+        bool encode_texture();
+
+        // Block decode callbacks handed to ert::reduce_entropy() together with a user-data pointer.
+        struct unpacker_funcs
+        {
+            rgbcx::bc1_approx_mode m_mode;
+            bool m_allow_3color_mode;
+            bool m_use_bc1_3color_mode_for_black;
+
+            // Decodes one BC1 block. pUser_data must point at an unpacker_funcs instance.
+            // Returns false when the block uses 3-color mode while m_allow_3color_mode is false, or when it
+            // uses selector 3 while m_use_bc1_3color_mode_for_black is false; returns true otherwise.
+            static bool unpack_bc1_block(const void* pBlock, ert::color_rgba* pPixels, uint32_t block_index, void* pUser_data)
+            {
+                (void)block_index;
+                const unpacker_funcs* pState = (const unpacker_funcs*)pUser_data;
+
+                bool used_3color_mode = rgbcx::unpack_bc1(pBlock, pPixels, true, pState->m_mode);
+
+                if (used_3color_mode)
+                {
+                    if (!pState->m_allow_3color_mode)
+                        return false;
+
+                    if (!pState->m_use_bc1_3color_mode_for_black)
+                    {
+                        rgbcx::bc1_block* pBC1_block = (rgbcx::bc1_block*)pBlock;
+
+                        for (uint32_t y = 0; y < 4; y++)
+                        {
+                            for (uint32_t x = 0; x < 4; x++)
+                            {
+                                if (pBC1_block->get_selector(x, y) == 3)
+                                    return false;
+                            } // x
+                        } // y
+                    }
+                }
+
+                return true;
+            }
+
+            // Zeroes all 16 output pixels, then decodes one BC4 block into them with a 4-byte stride. Always returns true.
+            // TODO: Enforce 6/8 color constraints
+            static bool unpack_bc4_block(const void* pBlock, ert::color_rgba* pPixels, uint32_t block_index, void* pUser_data)
+            {
+                (void)block_index;
+                (void)pUser_data;
+                memset(pPixels, 0, sizeof(ert::color_rgba) * 16);
+                rgbcx::unpack_bc4(pBlock, (uint8_t*)pPixels, 4);
+                return true;
+            }
+
+            // Decodes one BC7 block and returns bc7decomp::unpack_bc7()'s result.
+            static bool unpack_bc7_block(const void* pBlock, ert::color_rgba* pPixels, uint32_t block_index, void* pUser_data)
+            {
+                (void)block_index;
+                (void)pUser_data;
+                return bc7decomp::unpack_bc7(pBlock, (bc7decomp::color_rgba*)pPixels);
+            }
+        };
+
+        bool postprocess_rdo();
+    };
+
+} // namespace rdo_bc
diff --git a/libkram/bc7enc/rgbcx.cpp b/libkram/bc7enc/rgbcx.cpp
new file mode 100644
index 00000000..b0c40880
--- /dev/null
+++ b/libkram/bc7enc/rgbcx.cpp
@@ -0,0 +1,3083 @@
+// rgbcx.cpp - see license at end of rgbcx.h
+#include "rgbcx.h"
+#include
+#include
+#include
+
+namespace rgbcx
+{
+    //const uint8_t g_bc1_to_linear[4] = { 0, 3, 1, 2 };
+
+    const uint32_t NUM_UNIQUE_TOTAL_ORDERINGS4 = 969;
+
+#ifdef _MSC_VER
+#pragma region
+#endif
+    // All total orderings for 16 pixels 2-bit selectors.
+    // BC1 selector order 0, 2, 3, 1 (i.e. the selectors are reordered into linear order).
+ static uint8_t g_unique_total_orders4[NUM_UNIQUE_TOTAL_ORDERINGS4][4] = + { + {0,8,2,6},{4,3,9,0},{4,8,1,3},{12,0,3,1},{11,3,2,0},{6,4,6,0},{7,5,0,4},{6,0,8,2},{1,0,0,15},{3,0,8,5},{1,1,13,1},{13,1,2,0},{0,14,1,1},{0,15,1,0},{0,13,0,3},{16,0,0,0},{4,3,4,5},{8,6,0,2},{0,10,0,6},{10,0,4,2},{7,2,1,6},{4,7,5,0},{1,4,7,4},{0,14,2,0},{2,7,2,5},{9,0,5,2},{9,2,2,3},{10,0,5,1},{2,3,7,4},{4,9,0,3},{1,5,0,10},{1,1,6,8}, + {6,6,4,0},{11,5,0,0},{11,2,0,3},{4,0,10,2},{2,3,10,1},{1,13,1,1},{0,14,0,2},{2,3,3,8},{12,3,1,0},{14,0,0,2},{9,1,3,3},{6,4,0,6},{1,1,5,9},{5,9,0,2},{2,10,1,3},{12,0,0,4},{4,6,6,0},{0,6,4,6},{3,7,4,2},{0,13,3,0},{3,10,0,3},{10,2,1,3},{1,12,1,2},{2,0,13,1},{11,0,5,0},{12,1,3,0},{6,4,5,1},{10,4,2,0},{3,6,1,6},{7,3,6,0},{10,4,0,2},{10,0,2,4}, + {0,5,9,2},{0,9,3,4},{6,4,2,4},{3,4,7,2},{3,3,5,5},{4,2,9,1},{6,2,8,0},{3,5,3,5},{4,10,1,1},{10,1,3,2},{5,7,0,4},{5,3,7,1},{6,8,1,1},{8,8,0,0},{11,1,0,4},{14,1,0,1},{9,3,2,2},{8,2,1,5},{0,0,2,14},{3,3,9,1},{10,1,5,0},{8,3,1,4},{1,5,8,2},{6,1,9,0},{3,2,1,10},{3,11,1,1},{7,6,3,0},{9,0,3,4},{5,2,5,4},{0,2,3,11},{15,0,0,1},{0,6,6,4}, + {3,4,9,0},{4,7,0,5},{0,4,4,8},{0,13,2,1},{2,4,1,9},{3,2,5,6},{10,6,0,0},{3,5,6,2},{8,0,4,4},{1,3,6,6},{7,7,0,2},{6,1,4,5},{0,11,1,4},{2,2,8,4},{0,1,2,13},{15,0,1,0},{7,2,6,1},{8,1,7,0},{1,8,4,3},{2,13,1,0},{1,0,7,8},{14,2,0,0},{1,8,1,6},{9,3,3,1},{0,0,7,9},{4,4,1,7},{9,0,6,1},{10,2,4,0},{1,7,3,5},{0,3,8,5},{5,2,4,5},{1,2,5,8}, + {0,8,7,1},{10,3,2,1},{12,0,4,0},{2,1,4,9},{5,2,2,7},{1,9,3,3},{15,1,0,0},{6,3,4,3},{9,5,0,2},{1,6,9,0},{6,6,0,4},{13,2,1,0},{5,1,8,2},{0,5,11,0},{7,1,0,8},{1,2,12,1},{0,3,3,10},{7,4,2,3},{5,1,4,6},{7,0,3,6},{3,12,0,1},{3,4,5,4},{1,10,0,5},{7,4,3,2},{10,5,0,1},{13,3,0,0},{2,5,4,5},{3,10,1,2},{5,1,2,8},{14,0,1,1},{1,5,4,6},{1,4,5,6}, + 
{2,3,11,0},{11,0,4,1},{11,2,2,1},{5,3,8,0},{1,3,10,2},{0,1,13,2},{3,1,4,8},{4,2,4,6},{1,5,6,4},{2,1,11,2},{1,2,9,4},{4,7,3,2},{6,2,5,3},{7,2,2,5},{8,1,4,3},{3,2,8,3},{12,1,0,3},{7,8,1,0},{7,0,2,7},{5,10,0,1},{0,2,14,0},{2,9,3,2},{7,0,0,9},{11,1,4,0},{10,4,1,1},{2,2,9,3},{5,7,2,2},{1,3,1,11},{13,2,0,1},{4,2,8,2},{2,3,1,10},{4,2,5,5}, + {7,0,7,2},{10,0,0,6},{0,8,5,3},{4,4,0,8},{12,4,0,0},{0,1,14,1},{8,0,1,7},{5,1,5,5},{11,0,3,2},{0,4,1,11},{0,8,8,0},{0,2,5,9},{7,3,2,4},{7,8,0,1},{1,0,3,12},{7,4,5,0},{1,6,7,2},{7,6,1,2},{9,6,1,0},{12,2,0,2},{4,1,6,5},{4,0,1,11},{8,4,4,0},{13,0,1,2},{8,6,2,0},{4,12,0,0},{2,7,5,2},{2,0,5,9},{5,4,5,2},{3,8,5,0},{7,3,3,3},{4,4,8,0}, + {2,1,3,10},{5,0,1,10},{6,4,3,3},{4,9,1,2},{1,4,0,11},{11,3,1,1},{4,0,12,0},{13,0,0,3},{6,1,6,3},{9,0,4,3},{8,0,0,8},{8,4,0,4},{0,12,1,3},{0,4,10,2},{3,4,8,1},{1,3,8,4},{9,2,5,0},{5,7,4,0},{1,0,11,4},{4,10,0,2},{1,3,12,0},{6,9,0,1},{5,0,9,2},{5,9,2,0},{13,1,0,2},{9,3,4,0},{9,4,0,3},{3,1,12,0},{2,4,3,7},{1,2,13,0},{2,2,4,8},{6,8,0,2}, + {9,2,1,4},{9,5,1,1},{2,0,4,10},{5,4,0,7},{0,0,6,10},{1,2,0,13},{4,7,2,3},{6,5,5,0},{3,3,1,9},{1,6,1,8},{12,2,1,1},{4,4,5,3},{1,0,6,9},{0,6,10,0},{4,8,3,1},{4,3,2,7},{2,1,7,6},{1,9,1,5},{3,1,3,9},{8,7,1,0},{1,2,3,10},{14,1,1,0},{5,4,4,3},{3,7,0,6},{7,4,1,4},{3,7,5,1},{1,1,0,14},{0,10,3,3},{0,4,3,9},{1,7,7,1},{2,0,10,4},{5,8,0,3}, + {6,7,3,0},{0,8,4,4},{5,7,3,1},{7,9,0,0},{7,6,2,1},{0,4,5,7},{6,3,5,2},{1,2,1,12},{5,2,0,9},{8,5,0,3},{4,6,1,5},{1,1,7,7},{10,5,1,0},{1,2,8,5},{1,8,2,5},{5,1,0,10},{6,9,1,0},{13,0,2,1},{8,3,5,0},{6,3,6,1},{2,11,3,0},{3,7,3,3},{1,5,2,8},{7,5,2,2},{0,6,7,3},{13,1,1,1},{5,3,4,4},{7,2,7,0},{5,8,3,0},{3,13,0,0},{0,7,9,0},{8,0,3,5}, + 
{1,3,7,5},{4,0,2,10},{12,0,1,3},{1,7,6,2},{3,9,0,4},{7,2,0,7},{0,1,7,8},{2,1,8,5},{0,13,1,2},{0,8,1,7},{5,0,11,0},{5,6,2,3},{0,3,0,13},{2,3,4,7},{5,6,3,2},{4,2,10,0},{3,3,7,3},{7,2,5,2},{1,1,11,3},{12,3,0,1},{5,1,1,9},{1,15,0,0},{9,7,0,0},{9,1,2,4},{0,7,3,6},{3,0,13,0},{3,0,11,2},{0,6,5,5},{8,2,2,4},{6,10,0,0},{4,8,4,0},{0,0,3,13}, + {0,4,12,0},{7,1,6,2},{3,5,0,8},{8,0,6,2},{6,2,3,5},{2,10,0,4},{4,11,0,1},{6,1,5,4},{5,1,3,7},{0,11,3,2},{4,6,0,6},{2,6,0,8},{3,1,7,5},{2,14,0,0},{2,9,2,3},{0,3,4,9},{11,0,1,4},{13,0,3,0},{8,3,0,5},{0,5,3,8},{5,11,0,0},{0,1,4,11},{2,1,9,4},{3,4,4,5},{7,1,2,6},{12,2,2,0},{9,4,1,2},{6,0,2,8},{4,6,2,4},{11,2,3,0},{3,2,2,9},{10,3,1,2}, + {1,1,2,12},{0,5,2,9},{0,1,11,4},{6,2,4,4},{2,8,2,4},{0,9,4,3},{11,0,2,3},{0,2,11,3},{6,0,7,3},{0,3,6,7},{4,5,5,2},{1,2,6,7},{7,5,1,3},{9,0,2,5},{2,6,4,4},{4,1,9,2},{4,8,2,2},{1,12,3,0},{0,9,6,1},{0,10,6,0},{3,1,5,7},{2,13,0,1},{2,2,1,11},{3,6,0,7},{5,6,5,0},{5,5,4,2},{4,0,3,9},{3,4,1,8},{0,11,2,3},{2,12,1,1},{7,1,3,5},{7,0,9,0}, + {8,0,8,0},{1,0,2,13},{3,3,10,0},{2,4,4,6},{2,3,8,3},{1,10,5,0},{7,3,0,6},{2,9,0,5},{1,4,6,5},{6,6,3,1},{5,6,0,5},{6,3,0,7},{3,10,2,1},{2,5,5,4},{3,8,4,1},{1,14,0,1},{10,3,3,0},{3,5,7,1},{1,1,3,11},{2,4,0,10},{9,3,1,3},{5,10,1,0},{3,0,6,7},{3,1,9,3},{11,2,1,2},{5,3,3,5},{0,5,1,10},{4,1,11,0},{10,2,0,4},{7,6,0,3},{2,7,0,7},{4,2,2,8}, + {6,1,7,2},{4,9,2,1},{0,0,8,8},{3,7,2,4},{9,6,0,1},{0,12,4,0},{6,7,1,2},{0,7,2,7},{1,0,10,5},{0,0,14,2},{2,7,3,4},{5,0,0,11},{7,7,1,1},{6,2,7,1},{4,5,3,4},{3,5,1,7},{5,9,1,1},{6,2,1,7},{3,2,0,11},{0,11,0,5},{3,11,2,0},{10,1,4,1},{7,0,4,5},{11,4,0,1},{10,3,0,3},{0,2,4,10},{0,15,0,1},{0,11,5,0},{6,7,2,1},{1,12,2,1},{4,1,3,8},{1,0,13,2}, + 
{1,8,5,2},{7,0,1,8},{3,12,1,0},{9,2,4,1},{1,7,4,4},{11,4,1,0},{4,3,8,1},{2,8,4,2},{1,11,3,1},{1,1,4,10},{4,10,2,0},{8,2,5,1},{1,0,9,6},{5,3,2,6},{0,9,7,0},{10,2,2,2},{5,8,1,2},{8,7,0,1},{0,3,12,1},{1,0,1,14},{4,8,0,4},{3,8,0,5},{4,6,5,1},{0,9,5,2},{10,2,3,1},{2,3,9,2},{1,0,12,3},{11,3,0,2},{4,5,2,5},{0,2,12,2},{9,1,0,6},{9,2,0,5}, + {1,2,7,6},{4,7,4,1},{0,12,2,2},{0,0,0,16},{2,8,3,3},{3,6,2,5},{0,6,3,7},{7,5,4,0},{3,3,3,7},{3,3,0,10},{5,0,6,5},{0,0,10,6},{8,5,3,0},{8,1,5,2},{6,0,9,1},{11,1,2,2},{2,11,2,1},{9,5,2,0},{3,0,4,9},{2,2,12,0},{2,6,6,2},{2,1,13,0},{6,0,5,5},{2,0,14,0},{2,11,1,2},{4,4,7,1},{2,0,11,3},{3,1,1,11},{2,9,4,1},{3,7,6,0},{14,0,2,0},{1,10,4,1}, + {8,0,7,1},{3,6,5,2},{0,3,11,2},{2,5,6,3},{11,1,3,1},{6,5,3,2},{3,8,1,4},{0,2,7,7},{2,10,2,2},{1,6,2,7},{11,0,0,5},{12,1,1,2},{12,1,2,1},{0,7,1,8},{0,3,9,4},{0,2,1,13},{7,1,4,4},{10,1,0,5},{4,0,8,4},{5,2,7,2},{0,2,0,14},{4,3,7,2},{2,7,1,6},{1,2,2,11},{6,3,3,4},{1,14,1,0},{2,4,6,4},{5,3,6,2},{5,3,5,3},{8,4,1,3},{1,3,0,12},{3,5,2,6}, + {1,8,7,0},{0,7,4,5},{2,1,6,7},{4,11,1,0},{7,2,4,3},{6,1,3,6},{4,5,4,3},{2,11,0,3},{1,5,7,3},{12,0,2,2},{5,0,4,7},{1,13,0,2},{7,7,2,0},{4,1,7,4},{4,5,0,7},{5,0,5,6},{6,5,4,1},{2,4,2,8},{1,10,1,4},{6,3,1,6},{3,3,8,2},{0,7,7,2},{4,4,2,6},{1,1,8,6},{1,12,0,3},{2,1,12,1},{1,9,2,4},{1,11,0,4},{2,5,2,7},{10,0,3,3},{4,6,3,3},{3,7,1,5}, + {1,9,0,6},{7,1,7,1},{1,6,5,4},{9,2,3,2},{6,2,2,6},{2,2,2,10},{8,3,3,2},{0,1,8,7},{2,0,8,6},{0,3,1,12},{9,4,2,1},{9,4,3,0},{6,2,6,2},{1,8,0,7},{5,1,10,0},{0,5,5,6},{8,2,4,2},{2,3,2,9},{6,0,3,7},{2,2,6,6},{2,6,2,6},{1,13,2,0},{9,3,0,4},{7,3,5,1},{6,5,2,3},{5,2,6,3},{2,0,12,2},{5,7,1,3},{8,1,3,4},{3,1,10,2},{1,0,15,0},{0,8,0,8}, + {5,0,7,4},{4,4,6,2},{0,1,0,15},{10,0,1,5},{7,3,4,2},{4,9,3,0},{2,5,7,2},{3,4,2,7},{8,3,2,3},{5,1,6,4},{0,10,2,4},{6,6,1,3},{6,0,0,10},{4,4,3,5},{1,3,9,3},{7,5,3,1},{3,0,7,6},{1,8,6,1},{4,3,0,9},{3,11,0,2},{6,0,6,4},{0,1,3,12},{0,4,2,10},{5,5,6,0},{4,1,4,7},{8,1,6,1},{5,6,4,1},{8,4,2,2},{4,3,1,8},{3,0,2,11},{1,11,4,0},{0,8,3,5}, 
+ {5,1,7,3},{7,0,8,1},{4,3,5,4},{4,6,4,2},{3,2,4,7},{1,6,3,6},{0,7,8,1},{3,0,1,12},{9,1,4,2},{7,4,0,5},{1,7,0,8},{5,4,1,6},{9,1,5,1},{1,1,9,5},{4,1,1,10},{5,3,0,8},{2,2,5,7},{4,0,0,12},{9,0,7,0},{3,4,0,9},{0,2,6,8},{8,2,0,6},{3,2,6,5},{4,2,6,4},{3,6,4,3},{2,8,6,0},{5,0,3,8},{0,4,0,12},{0,16,0,0},{0,9,2,5},{4,0,11,1},{1,6,4,5}, + {0,1,6,9},{3,4,6,3},{3,0,10,3},{7,0,6,3},{1,4,9,2},{1,5,3,7},{8,5,2,1},{0,12,0,4},{7,2,3,4},{0,5,6,5},{11,1,1,3},{6,5,0,5},{2,1,5,8},{1,4,11,0},{9,1,1,5},{0,0,13,3},{5,8,2,1},{2,12,0,2},{3,3,6,4},{4,1,10,1},{4,0,5,7},{8,1,0,7},{5,1,9,1},{4,3,3,6},{0,2,2,12},{6,3,2,5},{0,0,12,4},{1,5,1,9},{2,6,5,3},{3,6,3,4},{2,12,2,0},{1,6,8,1}, + {10,1,1,4},{1,3,4,8},{7,4,4,1},{1,11,1,3},{1,2,10,3},{3,9,3,1},{8,5,1,2},{2,10,4,0},{4,2,0,10},{2,7,6,1},{8,2,3,3},{1,5,5,5},{3,1,0,12},{3,10,3,0},{8,0,5,3},{0,6,8,2},{0,3,13,0},{0,0,16,0},{1,9,4,2},{4,1,8,3},{1,6,6,3},{0,10,5,1},{0,1,12,3},{4,0,6,6},{3,8,3,2},{0,5,4,7},{1,0,14,1},{0,4,6,6},{3,9,1,3},{3,5,8,0},{3,6,6,1},{5,4,7,0}, + {3,0,12,1},{8,6,1,1},{2,9,5,0},{6,1,1,8},{4,1,2,9},{3,9,4,0},{5,2,9,0},{0,12,3,1},{1,4,10,1},{4,0,7,5},{3,1,2,10},{5,4,2,5},{5,5,5,1},{4,2,3,7},{1,7,5,3},{2,8,0,6},{8,1,2,5},{3,8,2,3},{6,1,2,7},{3,9,2,2},{9,0,0,7},{0,8,6,2},{8,4,3,1},{0,2,8,6},{6,5,1,4},{2,3,5,6},{2,10,3,1},{0,7,0,9},{4,2,7,3},{2,4,8,2},{7,1,1,7},{2,4,7,3}, + {2,4,10,0},{0,1,10,5},{4,7,1,4},{0,10,4,2},{9,0,1,6},{1,9,6,0},{3,3,4,6},{4,5,7,0},{5,5,2,4},{2,8,1,5},{2,3,6,5},{0,1,1,14},{3,2,3,8},{10,1,2,3},{9,1,6,0},{3,4,3,6},{2,2,0,12},{0,0,9,7},{4,0,9,3},{7,0,5,4},{4,5,6,1},{2,5,1,8},{2,5,9,0},{3,5,4,4},{1,3,11,1},{7,1,5,3},{3,2,7,4},{1,4,2,9},{1,11,2,2},{2,2,3,9},{5,0,10,1},{3,2,11,0}, + {1,10,3,2},{8,3,4,1},{3,6,7,0},{0,7,5,4},{1,3,3,9},{2,2,10,2},{1,9,5,1},{0,5,0,11},{3,0,3,10},{0,4,8,4},{2,7,7,0},{2,0,2,12},{1,2,11,2},{6,3,7,0},{0,6,2,8},{0,10,1,5},{0,9,0,7},{6,4,4,2},{6,0,1,9},{1,5,10,0},{5,4,6,1},{5,5,3,3},{0,0,4,12},{0,3,2,11},{1,4,1,10},{3,0,9,4},{5,5,0,6},{1,7,8,0},{2,0,3,11},{6,4,1,5},{10,0,6,0},{0,6,0,10}, + 
{0,4,11,1},{3,1,6,6},{2,5,8,1},{0,2,10,4},{3,1,11,1},{6,6,2,2},{1,1,10,4},{2,1,2,11},{6,1,8,1},{0,2,13,1},{0,7,6,3},{6,8,2,0},{3,0,0,13},{4,4,4,4},{6,2,0,8},{7,3,1,5},{0,11,4,1},{6,7,0,3},{2,6,3,5},{5,2,1,8},{7,1,8,0},{5,5,1,5},{1,8,3,4},{8,2,6,0},{6,0,10,0},{5,6,1,4},{1,4,4,7},{2,7,4,3},{1,4,8,3},{5,4,3,4},{1,10,2,3},{2,9,1,4}, + {2,2,11,1},{2,5,0,9},{0,0,1,15},{0,0,11,5},{0,4,7,5},{0,1,15,0},{2,1,0,13},{0,3,10,3},{8,0,2,6},{3,3,2,8},{3,5,5,3},{1,7,1,7},{1,3,2,10},{4,0,4,8},{2,0,9,5},{1,1,1,13},{2,2,7,5},{2,1,10,3},{4,2,1,9},{4,3,6,3},{1,3,5,7},{2,5,3,6},{1,0,8,7},{5,0,2,9},{2,8,5,1},{1,6,0,9},{0,0,5,11},{0,4,9,3},{2,0,7,7},{1,7,2,6},{2,1,1,12},{2,4,9,1}, + {0,5,7,4},{6,0,4,6},{3,2,10,1},{0,6,1,9},{2,6,1,7},{0,5,8,3},{4,1,0,11},{1,2,4,9},{4,1,5,6},{6,1,0,9},{1,4,3,8},{4,5,1,6},{1,0,5,10},{5,3,1,7},{0,9,1,6},{2,0,1,13},{2,0,6,8},{8,1,1,6},{1,5,9,1},{0,6,9,1},{0,3,5,8},{0,2,9,5},{5,2,8,1},{1,1,14,0},{3,2,9,2},{5,0,8,3},{0,5,10,1},{5,2,3,6},{2,6,7,1},{2,3,0,11},{0,1,9,6},{1,0,4,11}, + {3,0,5,8},{0,0,15,1},{2,4,5,5},{0,3,7,6},{2,0,0,14},{1,1,12,2},{2,6,8,0},{3,1,8,4},{0,1,5,10} + }; + + // All total orderings for 16 pixels [0,2] 2-bit selectors. + // BC1 selector order: 0, 1, 2 + // Note this is different from g_unique_total_orders4[], which reorders the selectors into linear order. 
+ const uint32_t NUM_UNIQUE_TOTAL_ORDERINGS3 = 153; + static uint8_t g_unique_total_orders3[NUM_UNIQUE_TOTAL_ORDERINGS3][3] = + { + {6,0,10},{3,6,7},{3,0,13},{13,3,0},{12,4,0},{9,1,6},{2,13,1},{4,7,5},{7,5,4},{9,6,1},{7,4,5},{8,6,2},{16,0,0},{10,6,0},{2,7,7}, + {0,0,16},{0,3,13},{1,15,0},{0,2,14},{1,4,11},{15,1,0},{1,12,3},{9,2,5},{14,1,1},{8,2,6},{3,3,10},{4,2,10},{14,0,2},{0,14,2},{1,7,8},{6,6,4}, + {11,5,0},{6,4,6},{11,3,2},{4,3,9},{7,1,8},{10,4,2},{12,1,3},{11,0,5},{9,3,4},{1,0,15},{9,0,7},{2,6,8},{12,2,2},{6,2,8},{6,8,2},{15,0,1}, + {4,8,4},{0,4,12},{8,5,3},{5,9,2},{11,2,3},{12,3,1},{6,3,7},{1,1,14},{2,9,5},{1,8,7},{4,10,2},{7,7,2},{13,1,2},{0,15,1},{3,2,11},{7,0,9}, + {4,4,8},{3,8,5},{0,5,11},{13,2,1},{1,10,5},{4,11,1},{3,10,3},{5,10,1},{10,2,4},{0,6,10},{14,2,0},{11,4,1},{3,12,1},{1,13,2},{1,5,10},{5,11,0}, + {12,0,4},{8,1,7},{6,10,0},{3,13,0},{7,2,7},{0,7,9},{5,8,3},{0,12,4},{11,1,4},{13,0,3},{0,16,0},{5,7,4},{10,3,3},{10,0,6},{0,13,3},{4,6,6}, + {2,8,6},{2,5,9},{7,8,1},{2,1,13},{2,0,14},{7,3,6},{5,1,10},{3,11,2},{5,4,7},{8,3,5},{10,5,1},{6,9,1},{1,3,12},{4,5,7},{2,2,12},{4,1,11}, + {0,8,8},{4,12,0},{6,5,5},{8,7,1},{5,5,6},{3,7,6},{7,9,0},{4,9,3},{0,10,6},{8,0,8},{5,3,8},{10,1,5},{6,1,9},{7,6,3},{9,5,2},{0,1,15}, + {9,7,0},{2,14,0},{3,4,9},{8,4,4},{9,4,3},{0,9,7},{1,9,6},{3,9,4},{5,2,9},{2,3,11},{5,6,5},{1,14,1},{6,7,3},{2,4,10},{2,12,2},{8,8,0}, + {2,10,4},{4,0,12},{0,11,5},{2,11,3},{1,11,4},{3,5,8},{5,0,11},{3,1,12},{1,2,13},{1,6,9} + }; + + // For each total ordering, this table indicates which other total orderings are likely to improve quality using a least squares pass. Each array is sorted by usefulness. 
+ static uint16_t g_best_total_orderings4[NUM_UNIQUE_TOTAL_ORDERINGS4][MAX_TOTAL_ORDERINGS4] = + { +#if RGBCX_USE_SMALLER_TABLES + #include "rgbcx_table4_small.h" +#else + #include "rgbcx_table4.h" +#endif + }; + + static uint8_t g_best_total_orderings3[NUM_UNIQUE_TOTAL_ORDERINGS3][32] = + { + { 12,1,3,5,27,2,4,38,8,7,16,18,6,10,41,79,40,23,46,9,20,88,22,37,14,19,24,126,99,119,35,11 }, + { 7,64,116,14,94,30,8,42,1,108,47,55,137,10,134,95,96,115,69,32,63,29,90,113,11,148,16,103,19,9,34,25 }, + { 12,1,0,5,3,7,4,27,8,6,38,40,41,16,18,46,9,10,20,23,79,62,14,22,88,99,37,126,92,19,120,11 }, + { 16,88,27,18,46,48,126,107,79,19,59,38,37,65,23,66,0,2,3,43,12,151,28,25,5,87,72,40,1,20,52,92 }, + { 79,48,88,16,27,65,18,38,46,19,37,4,72,33,126,41,52,0,12,92,5,1,2,107,3,77,23,91,43,51,22,74 }, + { 1,8,41,122,10,22,2,0,87,24,37,120,38,7,39,4,5,3,9,92,62,59,23,16,104,11,27,79,19,26,25,32 }, + { 2,76,99,28,40,86,93,21,138,60,6,0,17,128,145,119,98,144,141,82,147,54,67,75,5,12,27,132,146,1,38,14 }, + { 47,7,64,90,1,118,116,85,57,14,30,94,50,45,137,134,8,42,69,139,55,68,58,108,95,29,10,115,0,32,2,11 }, + { 49,8,10,30,124,11,32,113,130,58,125,9,100,53,104,115,131,103,24,7,1,39,45,36,139,0,137,22,90,44,114,105 }, + { 9,38,72,125,49,41,84,11,13,5,27,0,16,92,8,2,65,105,10,18,48,29,127,131,36,14,1,46,111,79,130,12 }, + { 130,8,10,100,104,131,49,32,53,39,30,36,113,24,11,22,124,44,83,58,7,103,1,4,9,125,5,0,91,33,115,74 }, + { 114,11,58,8,120,49,9,124,142,111,41,30,10,0,97,130,62,84,38,5,72,125,92,127,100,27,139,113,13,132,32,1 }, + { 60,46,28,27,40,20,0,17,18,2,126,16,6,38,86,23,79,54,1,93,5,88,41,14,21,111,7,48,3,84,72,62 }, + { 72,92,38,65,84,48,41,79,27,16,29,111,88,5,18,46,1,0,152,14,37,19,77,42,132,7,22,13,119,56,12,2 }, + { 7,55,1,95,29,56,64,116,143,8,14,30,47,94,152,90,65,67,10,133,42,72,146,84,16,48,6,0,25,108,77,21 }, + { 27,23,20,5,0,79,38,2,3,1,59,46,4,41,33,86,37,87,88,92,7,126,43,8,22,152,151,150,149,148,147,146 }, + { 
12,0,1,2,7,6,3,5,28,4,8,14,60,40,17,19,21,86,126,93,10,18,9,29,48,99,65,25,84,119,72,41 }, + { 60,40,99,2,54,12,0,1,19,28,98,93,6,138,21,5,27,17,151,14,76,46,16,18,38,29,86,144,107,7,25,41 }, + { 12,0,1,2,3,5,6,7,4,28,8,60,14,40,16,17,21,10,19,9,86,38,126,41,93,27,29,48,62,84,79,99 }, + { 0,1,2,10,5,8,3,25,4,29,32,34,63,7,77,26,16,48,65,56,14,22,129,103,72,24,18,152,140,53,96,42 }, + { 46,126,18,54,12,16,1,0,5,2,27,98,20,23,6,3,88,48,28,7,19,8,4,60,151,38,37,21,79,14,65,40 }, + { 76,6,141,86,119,2,138,67,28,145,0,93,17,1,40,60,146,99,147,14,21,144,132,7,5,29,55,27,16,75,19,12 }, + { 71,5,51,39,22,80,0,43,10,122,8,62,41,24,104,87,35,37,2,91,33,120,36,38,1,131,9,100,130,66,3,4 }, + { 126,18,46,27,20,16,88,23,12,79,54,59,48,0,73,1,37,151,5,19,28,38,2,66,60,3,65,98,14,26,6,43 }, + { 22,10,8,5,0,71,35,80,104,39,24,51,100,1,62,32,2,130,11,41,7,9,53,43,49,83,122,120,30,44,37,38 }, + { 1,34,14,129,53,63,42,26,121,148,7,44,96,10,0,24,100,32,64,116,140,22,5,19,29,103,135,108,8,61,39,83 }, + { 1,7,34,63,44,25,135,14,24,108,22,0,83,94,5,129,35,101,47,121,2,19,42,53,6,110,103,8,148,10,16,123 }, + { 12,28,16,60,18,1,6,21,14,0,86,19,2,48,93,17,38,29,7,5,65,126,46,72,41,79,84,119,40,56,54,88 }, + { 0,2,12,27,5,46,38,40,41,79,88,99,3,23,1,62,20,4,22,37,92,35,18,8,16,24,10,60,7,120,98,54 }, + { 1,7,14,56,8,0,84,67,10,2,133,72,42,111,5,30,21,4,9,3,25,94,16,116,47,11,65,18,132,90,55,64 }, + { 30,8,124,139,45,11,58,90,113,137,7,115,10,32,1,49,94,85,9,47,108,103,0,97,63,14,50,114,53,106,100,25 }, + { 65,38,48,27,16,79,72,18,88,19,46,77,84,92,37,41,0,29,1,14,12,111,2,5,31,36,87,74,105,40,28,51 }, + { 10,8,30,113,130,100,53,32,115,103,104,7,1,121,39,49,131,44,24,36,63,137,34,45,22,90,108,83,26,11,94,139 }, + { 51,52,43,33,5,74,16,37,71,91,38,3,36,87,48,22,4,0,122,41,39,18,66,27,79,24,65,88,59,23,62,92 }, + { 1,7,63,53,108,121,94,44,103,100,14,10,129,47,32,26,24,25,148,42,135,22,0,61,83,8,39,104,5,64,115,34 }, + { 
1,8,10,7,5,0,80,32,62,2,24,44,53,83,9,41,30,22,100,11,14,25,120,4,26,6,3,16,122,34,19,35 }, + { 74,4,36,48,33,91,39,79,22,16,65,5,131,38,24,71,27,52,0,105,51,18,88,104,3,31,10,37,72,19,41,130 }, + { 59,43,38,79,23,27,92,51,0,16,46,5,18,88,41,37,66,3,87,20,48,2,122,4,22,12,1,126,19,65,33,24 }, + { 12,28,1,27,0,16,2,46,65,60,21,3,5,18,6,19,48,14,4,7,79,88,86,29,22,72,93,40,23,8,17,41 }, + { 22,91,39,33,24,71,5,131,36,10,51,0,130,8,104,2,35,125,9,43,52,49,83,80,100,41,122,3,37,38,4,16 }, + { 12,0,1,2,5,3,4,8,7,27,18,38,10,6,16,46,9,20,41,23,126,79,22,14,19,99,88,54,37,48,62,35 }, + { 12,27,1,2,3,0,46,4,38,16,8,28,7,79,18,5,84,6,88,10,14,21,23,20,40,22,60,19,9,29,72,65 }, + { 1,14,7,55,95,29,8,94,30,56,10,108,77,116,152,64,32,48,63,42,143,148,16,25,137,65,11,0,115,9,19,72 }, + { 37,79,66,38,16,52,48,59,43,27,87,33,41,4,23,51,3,5,88,18,92,46,73,122,22,71,20,0,65,19,2,120 }, + { 24,32,83,22,53,1,8,10,7,30,35,5,103,0,100,101,121,113,34,123,63,2,44,25,71,115,80,14,26,108,51,39 }, + { 97,45,111,58,85,139,0,90,47,7,120,106,142,30,50,132,41,62,84,1,119,114,14,56,117,8,38,29,2,64,116,5 }, + { 12,28,16,18,1,60,6,14,2,21,0,86,126,19,48,93,7,27,17,29,5,65,54,38,72,79,84,88,119,145,8,111 }, + { 118,47,64,116,57,85,7,14,50,1,42,0,45,68,86,69,2,111,134,28,90,55,16,29,56,48,84,144,60,30,112,41 }, + { 12,1,2,0,7,6,28,5,3,4,8,14,60,21,18,40,17,86,10,9,16,29,19,93,126,79,38,84,72,27,111,119 }, + { 11,8,49,130,10,125,9,124,100,114,131,30,58,104,32,39,24,113,36,105,0,41,22,120,5,53,111,38,142,44,83,35 }, + { 50,70,47,118,85,57,106,0,45,7,64,90,81,14,2,134,28,62,86,55,69,1,78,119,68,56,18,67,16,60,29,21 }, + { 43,37,33,87,51,41,66,5,122,38,22,59,92,0,23,91,27,16,71,79,18,52,120,4,3,24,46,20,73,39,62,36 }, + { 79,48,4,16,27,88,43,33,18,38,65,37,46,3,19,51,52,22,66,87,74,5,41,91,23,59,0,71,122,72,20,92 }, + { 32,100,10,8,30,104,24,44,39,113,83,103,1,7,22,53,115,63,135,121,26,35,34,5,0,108,137,90,91,45,2,130 }, + { 
0,1,2,5,16,12,6,7,14,3,19,18,29,20,4,21,40,8,17,35,23,48,126,22,25,56,26,10,98,27,38,65 }, + { 143,67,56,146,1,7,133,55,64,141,134,69,6,47,14,29,84,21,111,147,57,16,95,72,118,132,50,0,2,18,119,42 }, + { 1,7,67,14,133,111,8,84,0,21,2,47,64,132,55,10,95,147,119,42,16,5,72,56,4,3,6,29,9,25,18,30 }, + { 68,57,69,112,144,86,102,2,134,55,0,70,118,64,75,47,14,28,93,143,67,7,50,149,1,21,29,56,119,95,60,78 }, + { 58,97,114,30,124,45,11,139,8,90,0,142,7,10,41,113,84,62,49,111,85,1,9,5,137,120,32,14,2,117,47,38 }, + { 23,66,18,79,38,20,43,27,16,88,46,59,126,37,87,12,73,92,3,5,48,0,19,54,2,51,28,1,41,65,122,22 }, + { 0,12,2,27,5,40,46,38,1,41,3,79,88,23,99,4,20,62,22,54,92,18,8,37,16,35,10,7,19,120,144,24 }, + { 1,14,25,26,0,7,44,34,129,42,24,5,135,22,19,148,6,96,83,2,29,16,63,35,101,64,140,136,116,110,3,10 }, + { 12,1,2,27,3,4,38,5,7,8,18,16,46,6,0,40,41,10,79,23,88,9,20,22,14,19,37,92,48,126,28,21 }, + { 7,1,10,32,108,103,94,47,8,53,25,14,34,115,100,129,121,130,148,42,64,116,63,26,44,0,24,30,113,4,104,22 }, + { 47,134,7,14,55,69,64,95,1,29,85,118,56,116,45,57,102,143,50,90,42,30,16,94,0,8,67,75,133,2,18,48 }, + { 12,1,2,0,7,6,28,8,14,5,3,4,40,21,17,18,60,86,16,93,126,10,9,29,99,38,119,25,19,54,27,84 }, + { 59,16,27,18,23,88,79,37,46,66,38,20,73,126,3,43,48,87,92,51,41,12,19,5,52,107,65,0,151,122,54,2 }, + { 1,21,147,7,119,14,76,132,55,0,86,145,2,6,69,67,16,143,111,138,17,28,29,60,18,93,8,19,40,56,84,5 }, + { 144,86,112,2,68,102,69,0,149,93,75,28,57,55,145,60,21,67,99,134,143,40,146,119,82,110,62,6,29,26,78,14 }, + { 102,57,55,69,143,75,146,67,56,68,134,2,29,141,0,21,6,14,133,118,64,1,7,95,47,84,111,28,147,82,72,119 }, + { 0,70,57,119,50,145,2,86,28,118,69,78,149,47,60,68,67,55,93,81,134,21,14,62,64,7,5,1,132,85,41,16 }, + { 51,5,43,71,122,87,41,37,91,39,0,22,33,36,38,24,66,120,62,2,80,16,92,10,59,4,27,23,35,79,8,3 }, + { 12,1,2,0,7,6,28,5,8,14,3,21,40,4,60,17,86,18,16,93,10,9,126,119,99,29,19,41,38,27,25,92 }, + { 
27,18,46,126,23,16,88,79,20,151,59,73,48,38,0,54,12,2,37,1,19,5,28,60,66,41,3,109,86,65,40,6 }, + { 48,79,4,33,16,74,65,38,88,27,91,52,18,36,22,19,46,0,37,3,51,5,71,39,72,43,24,41,92,87,2,10 }, + { 86,2,144,93,28,112,141,6,102,21,99,60,75,0,68,82,69,146,67,149,55,40,145,76,111,147,56,119,110,143,26,132 }, + { 6,138,2,99,86,17,40,93,28,21,145,141,0,60,119,147,128,76,67,54,1,12,5,27,144,14,38,98,146,41,29,19 }, + { 1,8,0,10,2,29,7,5,3,56,4,25,14,152,63,32,65,72,96,42,34,108,48,9,26,16,84,103,67,148,22,129 }, + { 149,145,0,86,2,28,93,144,62,60,119,101,21,41,5,35,78,99,26,40,12,68,57,67,110,120,69,18,55,76,132,70 }, + { 12,28,16,1,48,19,6,60,2,14,18,21,0,27,46,65,86,29,5,7,72,93,40,3,17,84,56,88,126,4,38,8 }, + { 1,8,5,10,7,24,2,62,0,41,22,122,120,9,4,3,32,87,11,37,38,83,100,44,25,104,16,26,39,80,14,6 }, + { 0,119,62,86,145,149,28,132,93,2,120,67,60,41,35,5,144,21,123,38,111,81,84,56,12,44,24,50,92,55,40,22 }, + { 2,93,99,28,40,144,60,0,86,150,76,21,149,98,6,25,1,61,82,26,12,5,54,141,7,18,145,16,27,138,110,38 }, + { 24,8,10,22,32,35,100,5,1,53,0,7,71,80,30,123,83,104,51,11,2,39,44,113,9,62,25,103,34,101,43,41 }, + { 12,1,2,0,7,6,28,5,40,60,8,16,3,18,14,4,86,21,17,93,41,10,9,99,27,119,38,19,126,22,48,145 }, + { 45,47,50,7,85,90,97,1,64,139,116,118,30,58,14,106,70,111,0,57,94,42,137,142,29,120,8,56,18,134,84,41 }, + { 12,0,2,5,27,38,1,46,41,40,79,144,3,22,88,23,28,60,99,62,6,24,26,7,4,16,10,35,37,18,14,20 }, + { 37,38,59,92,0,5,23,51,79,41,27,22,2,3,87,16,46,4,1,43,20,33,18,88,24,71,8,10,48,19,126,122 }, + { 12,28,16,60,1,18,6,21,19,14,48,0,2,86,93,5,46,29,17,27,65,7,3,72,38,126,119,40,84,37,56,4 }, + { 0,2,5,1,16,6,27,28,18,38,60,7,14,21,46,40,86,41,19,48,93,8,3,79,22,4,10,37,62,23,24,111 }, + { 85,7,90,30,47,139,45,50,94,58,137,1,8,64,14,116,118,115,113,11,124,108,0,10,97,57,32,70,42,106,29,114 }, + { 33,36,22,71,51,5,91,39,0,52,43,24,131,74,16,37,38,122,41,3,87,48,4,104,35,80,10,2,105,62,27,18 }, + { 
12,1,27,2,0,16,3,28,46,18,4,6,5,72,21,79,38,7,14,60,88,8,65,19,48,29,23,40,22,20,86,126 }, + { 0,12,2,27,5,38,46,41,1,40,79,3,88,23,22,99,20,37,62,4,18,6,16,35,60,28,24,7,92,8,14,10 }, + { 7,47,1,30,137,8,116,94,90,64,14,115,108,118,57,10,148,113,42,85,32,11,63,50,103,45,124,134,55,9,69,34 }, + { 55,7,1,29,56,143,64,47,67,133,14,146,95,72,84,8,116,111,6,134,141,21,65,0,69,30,16,45,85,42,50,10 }, + { 14,1,42,8,10,29,108,63,55,148,95,32,7,19,25,115,103,34,56,129,77,0,16,152,94,30,113,26,2,5,48,4 }, + { 111,120,142,97,58,0,41,45,62,132,114,84,139,30,5,8,38,2,7,85,119,90,117,1,124,11,56,47,28,27,35,72 }, + { 1,0,14,2,6,5,16,19,7,29,42,18,3,25,12,35,21,8,26,17,40,4,20,48,109,99,22,96,55,101,10,61 }, + { 12,0,1,5,3,2,4,7,27,8,38,6,40,18,16,10,20,46,9,41,23,22,79,14,62,19,37,126,88,11,92,48 }, + { 10,8,104,39,24,32,22,83,44,100,30,130,53,91,113,5,11,1,35,33,7,49,0,2,103,71,36,124,9,80,131,34 }, + { 1,7,0,14,8,34,5,25,35,26,6,63,10,123,2,16,103,19,44,32,135,121,108,80,62,30,115,94,149,144,53,18 }, + { 75,68,146,141,102,67,2,21,6,57,69,143,0,55,82,86,28,144,147,29,93,112,56,119,133,14,76,60,84,134,111,145 }, + { 10,32,115,7,8,53,1,108,30,113,94,137,100,63,90,34,130,103,121,47,44,25,104,39,24,26,85,14,49,36,22,131 }, + { 39,24,10,22,8,130,91,104,83,49,5,33,100,11,0,35,32,131,71,36,9,44,53,2,80,51,30,1,41,7,43,62 }, + { 38,36,65,105,27,72,31,79,41,131,5,48,125,39,0,16,92,46,22,13,18,84,24,37,88,2,33,74,91,71,130,49 }, + { 0,106,62,50,45,119,85,81,132,28,2,86,41,47,38,60,35,117,5,29,7,30,145,90,55,70,14,111,18,67,93,56 }, + { 0,2,5,1,3,25,19,26,4,34,29,10,22,16,8,7,24,14,48,65,53,18,6,77,44,56,72,61,121,21,136,40 }, + { 7,1,94,8,47,115,10,32,113,103,30,108,137,63,14,64,116,148,129,42,90,25,34,118,53,57,11,49,85,9,96,50 }, + { 14,0,1,26,19,5,42,2,25,24,29,22,6,44,61,16,7,96,136,3,140,34,35,55,135,18,48,77,83,4,8,10 }, + { 1,7,14,0,25,6,34,5,26,16,63,2,19,8,35,101,108,29,94,10,18,42,123,144,129,47,61,21,3,62,149,4 }, + { 
12,0,2,1,28,5,6,120,7,60,40,16,18,86,27,14,21,93,8,62,41,38,3,17,4,119,99,48,19,126,10,9 }, + { 86,144,93,2,28,149,0,60,99,112,110,145,40,21,102,26,75,62,69,1,12,101,119,25,76,67,7,68,55,5,6,14 }, + { 8,30,10,32,113,49,115,137,124,103,45,90,7,139,11,1,58,53,130,94,108,100,9,63,85,125,34,47,0,24,44,104 }, + { 120,142,111,41,58,114,97,0,11,62,84,124,5,30,8,38,132,127,27,139,92,10,72,45,49,9,28,2,29,56,16,1 }, + { 8,113,30,137,7,32,10,90,94,115,1,103,108,63,47,85,49,53,11,45,34,50,14,25,9,124,100,130,139,121,42,26 }, + { 64,7,14,47,134,55,1,42,95,69,116,90,94,30,8,29,56,137,45,108,85,10,57,16,102,143,118,19,63,32,11,50 }, + { 62,132,0,119,120,41,111,86,35,28,5,84,56,38,2,93,145,60,67,12,92,27,29,72,55,117,21,24,133,149,22,45 }, + { 57,68,69,118,134,64,50,47,55,14,7,2,102,144,0,112,70,86,85,1,95,29,116,143,42,75,16,56,28,45,21,48 }, + { 0,12,2,1,5,28,6,40,60,27,7,38,16,14,86,18,93,41,62,46,99,35,8,23,3,17,22,21,10,19,79,20 }, + { 12,1,2,27,16,3,38,111,4,0,18,5,7,46,40,8,79,6,14,28,88,10,48,41,19,84,21,9,22,23,20,72 }, + { 53,103,32,7,1,100,22,63,71,44,10,115,108,24,92,104,26,30,122,94,8,39,83,34,137,135,90,91,121,5,87,47 }, + { 87,37,41,0,22,38,2,92,1,24,4,8,3,59,10,5,39,23,71,79,122,27,16,46,33,7,91,20,18,51,9,120 }, + { 1,7,8,10,0,5,35,32,53,44,14,30,2,80,25,34,6,62,26,103,16,19,63,9,149,24,121,41,22,11,113,83 }, + { 11,58,8,30,124,49,10,113,9,114,139,45,97,32,7,137,90,1,0,130,115,125,100,24,5,94,53,41,14,13,35,38 }, + { 125,105,9,36,131,49,8,130,39,11,10,5,22,38,41,104,0,31,13,24,27,16,2,72,65,91,48,32,84,18,100,74 }, + { 12,1,0,2,6,3,7,5,4,8,14,28,16,60,18,10,21,17,19,9,40,27,86,93,29,38,54,11,25,48,46,41 }, + { 84,41,38,72,92,29,111,5,65,120,79,0,27,56,48,14,132,16,119,22,86,88,46,28,62,12,1,2,93,18,24,127 }, + { 99,28,40,60,2,93,138,0,98,17,86,54,76,12,27,1,21,144,128,38,5,14,46,18,25,16,109,6,41,145,7,29 }, + { 1,63,10,32,148,14,103,34,42,7,8,108,116,53,64,96,25,121,26,94,140,0,29,19,55,24,100,136,5,4,44,115 }, + { 
131,100,130,49,10,8,36,104,39,0,48,41,11,38,4,24,27,22,16,44,79,5,33,2,53,9,125,74,91,120,32,83 }, + { 36,39,131,74,4,91,22,33,125,104,130,48,10,24,16,5,49,8,100,105,79,0,9,65,71,2,18,83,31,11,19,44 }, + { 0,12,2,1,6,5,7,28,40,60,16,14,18,62,86,27,93,8,17,38,21,41,35,99,3,19,10,23,22,4,9,48 }, + { 1,7,67,14,21,147,111,55,132,119,0,8,2,76,64,16,47,84,6,18,86,95,145,10,42,29,133,5,56,134,17,72 }, + { 69,55,47,134,102,143,7,57,118,95,14,64,29,56,1,50,75,67,146,2,0,133,68,16,21,6,141,85,116,18,72,65 }, + { 1,44,7,24,83,63,34,103,22,121,53,32,25,35,0,115,108,5,14,8,10,101,94,30,2,123,110,26,137,47,90,19 }, + { 14,1,25,42,34,0,26,96,19,29,140,5,53,10,2,121,3,24,44,22,55,77,129,7,63,16,8,4,6,61,100,48 }, + { 30,90,7,8,137,94,85,1,47,113,115,108,45,139,124,11,10,32,50,58,103,14,63,64,9,116,49,42,25,148,0,53 }, + { 40,99,2,60,28,17,0,54,93,98,86,138,6,12,21,76,1,5,27,144,128,38,19,46,14,41,145,7,16,67,3,109 }, + { 45,58,30,139,90,7,85,137,97,8,124,47,1,11,106,114,50,94,0,113,10,115,14,32,9,64,108,41,49,29,62,116 }, + { 14,42,10,1,63,96,32,25,34,8,129,29,0,103,55,19,26,53,77,5,95,2,4,7,3,16,148,56,18,24,121,108 }, + { 21,2,75,86,6,76,144,28,119,99,93,147,141,67,102,145,60,132,146,128,0,82,40,138,55,111,143,17,133,112,69,14 }, + { 111,120,41,62,84,132,0,5,38,119,56,92,72,142,27,28,29,35,58,80,2,86,65,79,12,14,1,24,145,16,21,48 }, + { 146,67,141,69,133,21,6,143,57,55,111,147,56,1,14,132,7,2,134,102,0,119,29,84,76,64,86,72,28,68,47,75 }, + { 12,1,0,5,27,3,7,4,38,8,6,41,16,40,46,10,18,79,2,9,23,86,20,22,62,14,37,88,92,19,24,11 }, + { 0,12,2,1,27,5,38,28,60,6,40,7,16,46,18,14,41,99,93,62,3,79,86,23,149,8,22,35,88,17,19,10 }, + { 141,6,21,67,147,102,146,2,76,119,132,69,55,111,86,75,28,133,143,0,1,145,14,128,56,99,17,60,29,93,84,68 }, + { 21,76,1,119,86,145,2,0,14,7,6,138,146,55,17,28,132,93,67,40,60,143,29,147,111,16,69,141,5,56,19,133 }, + { 1,8,108,14,7,116,64,42,10,63,94,32,115,103,113,96,30,34,55,47,95,148,29,140,129,25,134,53,69,26,19,11 }, + { 
12,1,3,5,4,2,0,7,8,38,27,16,18,6,10,20,41,40,79,46,9,23,22,88,92,37,14,24,62,19,48,99 }, + { 1,14,7,0,6,25,5,16,19,2,42,26,29,35,61,8,18,129,101,21,3,110,34,148,96,10,17,4,22,40,12,20 }, + { 0,2,5,1,3,19,22,26,16,24,29,7,14,6,4,25,18,44,8,48,12,61,20,21,10,35,65,56,23,40,17,107 }, + { 1,7,8,29,56,0,10,14,2,42,72,5,4,65,3,30,84,94,67,9,25,133,111,11,32,108,16,63,21,96,26,48 } + }; +#ifdef _MSC_VER +#pragma endregion +#endif + + static inline uint32_t iabs(int32_t i) { return (i < 0) ? static_cast(-i) : static_cast(i); } + //static inline uint64_t iabs(int64_t i) { return (i < 0) ? static_cast(-i) : static_cast(i); } + + static inline uint8_t to_5(uint32_t v) { v = v * 31 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + static inline uint8_t to_6(uint32_t v) { v = v * 63 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + + template inline T square(T a) { return a * a; } + + static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } + + template inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); } + static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } + + static inline int squarei(int a) { return a * a; } + //static inline int absi(int a) { return (a < 0) ? 
-a : a; } + + template inline F lerp(F a, F b, F s) { return a + (b - a) * s; } + + static const uint32_t TOTAL_ORDER_4_0_16 = 15; + static const uint32_t TOTAL_ORDER_4_1_16 = 700; + static const uint32_t TOTAL_ORDER_4_2_16 = 753; + static const uint32_t TOTAL_ORDER_4_3_16 = 515; + static uint16_t g_total_ordering4_hash[4096]; + static float g_selector_factors4[NUM_UNIQUE_TOTAL_ORDERINGS4][3]; + + static const uint32_t TOTAL_ORDER_3_0_16 = 12; + static const uint32_t TOTAL_ORDER_3_1_16 = 15; + static const uint32_t TOTAL_ORDER_3_2_16 = 89; + static uint16_t g_total_ordering3_hash[256]; + static float g_selector_factors3[NUM_UNIQUE_TOTAL_ORDERINGS3][3]; + + struct hist4 + { + uint8_t m_hist[4]; + + hist4() + { + memset(m_hist, 0, sizeof(m_hist)); + } + + hist4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) + { + m_hist[0] = (uint8_t)i; + m_hist[1] = (uint8_t)j; + m_hist[2] = (uint8_t)k; + m_hist[3] = (uint8_t)l; + } + + inline bool operator== (const hist4& h) const + { + if (m_hist[0] != h.m_hist[0]) return false; + if (m_hist[1] != h.m_hist[1]) return false; + if (m_hist[2] != h.m_hist[2]) return false; + if (m_hist[3] != h.m_hist[3]) return false; + return true; + } + + inline bool any_16() const + { + return (m_hist[0] == 16) || (m_hist[1] == 16) || (m_hist[2] == 16) || (m_hist[3] == 16); + } + + inline uint32_t lookup_total_ordering_index() const + { + if (m_hist[0] == 16) + return TOTAL_ORDER_4_0_16; + else if (m_hist[1] == 16) + return TOTAL_ORDER_4_1_16; + else if (m_hist[2] == 16) + return TOTAL_ORDER_4_2_16; + else if (m_hist[3] == 16) + return TOTAL_ORDER_4_3_16; + + // Must sum to 16, so m_hist[3] isn't needed. 
+ return g_total_ordering4_hash[m_hist[0] | (m_hist[1] << 4) | (m_hist[2] << 8)]; + } + }; + + struct hist3 + { + uint8_t m_hist[3]; + + hist3() + { + memset(m_hist, 0, sizeof(m_hist)); + } + + hist3(uint32_t i, uint32_t j, uint32_t k) + { + m_hist[0] = (uint8_t)i; + m_hist[1] = (uint8_t)j; + m_hist[2] = (uint8_t)k; + } + + inline bool operator== (const hist3& h) const + { + if (m_hist[0] != h.m_hist[0]) return false; + if (m_hist[1] != h.m_hist[1]) return false; + if (m_hist[2] != h.m_hist[2]) return false; + return true; + } + + inline bool any_16() const + { + return (m_hist[0] == 16) || (m_hist[1] == 16) || (m_hist[2] == 16); + } + + inline uint32_t lookup_total_ordering_index() const + { + if (m_hist[0] == 16) + return TOTAL_ORDER_3_0_16; + else if (m_hist[1] == 16) + return TOTAL_ORDER_3_1_16; + else if (m_hist[2] == 16) + return TOTAL_ORDER_3_2_16; + + // Must sum to 16, so m_hist[2] isn't needed. + return g_total_ordering3_hash[m_hist[0] | (m_hist[1] << 4)]; + } + }; + + struct bc1_match_entry + { + uint8_t m_hi; + uint8_t m_lo; + uint8_t m_e; + }; + + static bc1_approx_mode g_bc1_approx_mode; + static bc1_match_entry g_bc1_match5_equals_1[256], g_bc1_match6_equals_1[256]; + static bc1_match_entry g_bc1_match5_half[256], g_bc1_match6_half[256]; + + static inline int scale_5_to_8(int v) { return (v << 3) | (v >> 2); } + static inline int scale_6_to_8(int v) { return (v << 2) | (v >> 4); } + + // v0, v1 = unexpanded DXT1 endpoint values (5/6-bits) + // c0, c1 = expanded DXT1 endpoint values (8-bits) + static inline int interp_5_6_ideal(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 2 + c1) / 3; } + static inline int interp_5_6_ideal_round(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 2 + c1 + 1) / 3; } + static inline int interp_half_5_6_ideal(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 + c1) / 2; } + + static inline int interp_5_nv(int v0, int v1) { assert(v0 < 32 && v1 < 32); return ((2 * v0 + v1) * 22) / 8; } + 
static inline int interp_6_nv(int c0, int c1) { assert(c0 < 256 && c1 < 256); const int gdiff = c1 - c0; return (256 * c0 + (gdiff / 4) + 128 + gdiff * 80) / 256; } + + static inline int interp_half_5_nv(int v0, int v1) { assert(v0 < 32 && v1 < 32); return ((v0 + v1) * 33) / 8; } + static inline int interp_half_6_nv(int c0, int c1) { assert(c0 < 256 && c1 < 256); const int gdiff = c1 - c0; return (256 * c0 + gdiff / 4 + 128 + gdiff * 128) / 256; } + + static inline int interp_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 43 + c1 * 21 + 32) >> 6; } + static inline int interp_half_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 + c1 + 1) >> 1; } + + static inline int interp_5(int v0, int v1, int c0, int c1, bc1_approx_mode mode) + { + assert(scale_5_to_8(v0) == c0 && scale_5_to_8(v1) == c1); + switch (mode) + { + case bc1_approx_mode::cBC1NVidia: return interp_5_nv(v0, v1); + case bc1_approx_mode::cBC1AMD: return interp_5_6_amd(c0, c1); + default: + case bc1_approx_mode::cBC1Ideal: return interp_5_6_ideal(c0, c1); + case bc1_approx_mode::cBC1IdealRound4: return interp_5_6_ideal_round(c0, c1); + } + } + + static inline int interp_6(int v0, int v1, int c0, int c1, bc1_approx_mode mode) + { + (void)v0; (void)v1; + assert(scale_6_to_8(v0) == c0 && scale_6_to_8(v1) == c1); + switch (mode) + { + case bc1_approx_mode::cBC1NVidia: return interp_6_nv(c0, c1); + case bc1_approx_mode::cBC1AMD: return interp_5_6_amd(c0, c1); + default: + case bc1_approx_mode::cBC1Ideal: return interp_5_6_ideal(c0, c1); + case bc1_approx_mode::cBC1IdealRound4: return interp_5_6_ideal_round(c0, c1); + } + } + + static inline int interp_half_5(int v0, int v1, int c0, int c1, bc1_approx_mode mode) + { + assert(scale_5_to_8(v0) == c0 && scale_5_to_8(v1) == c1); + switch (mode) + { + case bc1_approx_mode::cBC1NVidia: return interp_half_5_nv(v0, v1); + case bc1_approx_mode::cBC1AMD: return interp_half_5_6_amd(c0, c1); + case bc1_approx_mode::cBC1Ideal: + case 
bc1_approx_mode::cBC1IdealRound4: + default: + return interp_half_5_6_ideal(c0, c1); + } + } + + static inline int interp_half_6(int v0, int v1, int c0, int c1, bc1_approx_mode mode) + { + (void)v0; (void)v1; + assert(scale_6_to_8(v0) == c0 && scale_6_to_8(v1) == c1); + switch (mode) + { + case bc1_approx_mode::cBC1NVidia: return interp_half_6_nv(c0, c1); + case bc1_approx_mode::cBC1AMD: return interp_half_5_6_amd(c0, c1); + case bc1_approx_mode::cBC1Ideal: + case bc1_approx_mode::cBC1IdealRound4: + default: + return interp_half_5_6_ideal(c0, c1); + } + } + + static void prepare_bc1_single_color_table_half(bc1_match_entry* pTable, const uint8_t* pExpand, int size, bc1_approx_mode mode) + { + for (int i = 0; i < 256; i++) + { + int lowest_e = 256; + for (int lo = 0; lo < size; lo++) + { + const int lo_e = pExpand[lo]; + + for (int hi = 0; hi < size; hi++) + { + const int hi_e = pExpand[hi]; + + const int v = (size == 32) ? interp_half_5(hi, lo, hi_e, lo_e, mode) : interp_half_6(hi, lo, hi_e, lo_e, mode); + + int e = iabs(v - i); + + // We only need to factor in 3% error in BC1 ideal mode. + if ((mode == bc1_approx_mode::cBC1Ideal) || (mode == bc1_approx_mode::cBC1IdealRound4)) + e += (iabs(hi_e - lo_e) * 3) / 100; + + // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation. + if ((e < lowest_e) || ((e == lowest_e) && (lo == hi))) + { + pTable[i].m_hi = static_cast(hi); + pTable[i].m_lo = static_cast(lo); + + assert(e <= UINT8_MAX); + pTable[i].m_e = static_cast(e); + + lowest_e = e; + } + + } // hi + } // lo + } + } + + static void prepare_bc1_single_color_table(bc1_match_entry* pTable, const uint8_t* pExpand, int size, bc1_approx_mode mode) + { + for (int i = 0; i < 256; i++) + { + int lowest_e = 256; + for (int lo = 0; lo < size; lo++) + { + const int lo_e = pExpand[lo]; + + for (int hi = 0; hi < size; hi++) + { + const int hi_e = pExpand[hi]; + + const int v = (size == 32) ? 
interp_5(hi, lo, hi_e, lo_e, mode) : interp_6(hi, lo, hi_e, lo_e, mode); + + int e = iabs(v - i); + + if ((mode == bc1_approx_mode::cBC1Ideal) || (mode == bc1_approx_mode::cBC1IdealRound4)) + e += (iabs(hi_e - lo_e) * 3) / 100; + + // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation. + if ((e < lowest_e) || ((e == lowest_e) && (lo == hi))) + { + pTable[i].m_hi = static_cast(hi); + pTable[i].m_lo = static_cast(lo); + + assert(e <= UINT8_MAX); + pTable[i].m_e = static_cast(e); + + lowest_e = e; + } + + } // hi + } // lo + } + } + + // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w)) + // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier. + static const uint32_t g_weight_vals4[4] = { 0x000009, 0x010204, 0x040201, 0x090000 }; + + // multiplier is 4 for 3-color + static const uint32_t g_weight_vals3[3] = { 0x000004, 0x040000, 0x010101 }; + + static inline void compute_selector_factors4(const hist4& h, float& iz00, float& iz10, float& iz11) + { + uint32_t weight_accum = 0; + for (uint32_t sel = 0; sel < 4; sel++) + weight_accum += g_weight_vals4[sel] * h.m_hist[sel]; + + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + det = 0.0f; + else + det = (3.0f / 255.0f) / det; + + iz00 = z11 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + } + + static inline void compute_selector_factors3(const hist3& h, float& iz00, float& iz10, float& iz11) + { + uint32_t weight_accum = 0; + for (uint32_t sel = 0; sel < 3; sel++) + weight_accum += g_weight_vals3[sel] * h.m_hist[sel]; + + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + det = 0.0f; + else + det = 
(2.0f / 255.0f) / det; + + iz00 = z11 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + } + + static bool g_initialized; + + void init(bc1_approx_mode mode) + { + g_bc1_approx_mode = mode; + + uint8_t bc1_expand5[32]; + for (int i = 0; i < 32; i++) + bc1_expand5[i] = static_cast((i << 3) | (i >> 2)); + prepare_bc1_single_color_table(g_bc1_match5_equals_1, bc1_expand5, 32, mode); + prepare_bc1_single_color_table_half(g_bc1_match5_half, bc1_expand5, 32, mode); + + uint8_t bc1_expand6[64]; + for (int i = 0; i < 64; i++) + bc1_expand6[i] = static_cast((i << 2) | (i >> 4)); + prepare_bc1_single_color_table(g_bc1_match6_equals_1, bc1_expand6, 64, mode); + prepare_bc1_single_color_table_half(g_bc1_match6_half, bc1_expand6, 64, mode); + + for (uint32_t i = 0; i < NUM_UNIQUE_TOTAL_ORDERINGS4; i++) + { + hist4 h; + h.m_hist[0] = (uint8_t)g_unique_total_orders4[i][0]; + h.m_hist[1] = (uint8_t)g_unique_total_orders4[i][1]; + h.m_hist[2] = (uint8_t)g_unique_total_orders4[i][2]; + h.m_hist[3] = (uint8_t)g_unique_total_orders4[i][3]; + + if (!h.any_16()) + { + const uint32_t index = h.m_hist[0] | (h.m_hist[1] << 4) | (h.m_hist[2] << 8); + assert(index < 4096); + g_total_ordering4_hash[index] = (uint16_t)i; + } + + compute_selector_factors4(h, g_selector_factors4[i][0], g_selector_factors4[i][1], g_selector_factors4[i][2]); + } + + for (uint32_t i = 0; i < NUM_UNIQUE_TOTAL_ORDERINGS3; i++) + { + hist3 h; + h.m_hist[0] = (uint8_t)g_unique_total_orders3[i][0]; + h.m_hist[1] = (uint8_t)g_unique_total_orders3[i][1]; + h.m_hist[2] = (uint8_t)g_unique_total_orders3[i][2]; + + if (!h.any_16()) + { + const uint32_t index = h.m_hist[0] | (h.m_hist[1] << 4); + assert(index < 256); + g_total_ordering3_hash[index] = (uint16_t)i; + } + + compute_selector_factors3(h, g_selector_factors3[i][0], g_selector_factors3[i][1], g_selector_factors3[i][2]); + } + + g_initialized = true; + } + + void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb, bool allow_3color) + { + 
bc1_block* pDst_block = static_cast(pDst); + + uint32_t mask = 0xAA; + int max16 = -1, min16 = 0; + + if (allow_3color) + { + const uint32_t err4 = g_bc1_match5_equals_1[fr].m_e + g_bc1_match6_equals_1[fg].m_e + g_bc1_match5_equals_1[fb].m_e; + const uint32_t err3 = g_bc1_match5_half[fr].m_e + g_bc1_match6_half[fg].m_e + g_bc1_match5_half[fb].m_e; + + if (err3 < err4) + { + max16 = (g_bc1_match5_half[fr].m_hi << 11) | (g_bc1_match6_half[fg].m_hi << 5) | g_bc1_match5_half[fb].m_hi; + min16 = (g_bc1_match5_half[fr].m_lo << 11) | (g_bc1_match6_half[fg].m_lo << 5) | g_bc1_match5_half[fb].m_lo; + + if (max16 > min16) + std::swap(max16, min16); + } + } + + if (max16 == -1) + { + max16 = (g_bc1_match5_equals_1[fr].m_hi << 11) | (g_bc1_match6_equals_1[fg].m_hi << 5) | g_bc1_match5_equals_1[fb].m_hi; + min16 = (g_bc1_match5_equals_1[fr].m_lo << 11) | (g_bc1_match6_equals_1[fg].m_lo << 5) | g_bc1_match5_equals_1[fb].m_lo; + + if (min16 == max16) + { + // Always forbid 3 color blocks + // This is to guarantee that BC3 blocks never use punchthrough alpha (3 color) mode, which isn't supported on some (all?) GPU's. 
+ mask = 0; + + // Make l > h + if (min16 > 0) + min16--; + else + { + // l = h = 0 + assert(min16 == max16 && max16 == 0); + + max16 = 1; + min16 = 0; + mask = 0x55; + } + + assert(max16 > min16); + } + + if (max16 < min16) + { + std::swap(max16, min16); + mask ^= 0x55; + } + } + + pDst_block->set_low_color(static_cast(max16)); + pDst_block->set_high_color(static_cast(min16)); + pDst_block->m_selectors[0] = static_cast(mask); + pDst_block->m_selectors[1] = static_cast(mask); + pDst_block->m_selectors[2] = static_cast(mask); + pDst_block->m_selectors[3] = static_cast(mask); + } + + static const float g_midpoint5[32] = { .015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f, .370588f, .403922f, .435294f, .466667f, .5f, .533333f, .564706f, .596078f, .629412f, .662745f, .694118f, .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f }; + static const float g_midpoint6[64] = { .007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f, .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f, .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f, .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f, .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f, .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f }; + + struct vec3F { float c[3]; }; + + static inline void compute_least_squares_endpoints4_rgb( + vec3F* pXl, vec3F* pXh, + int total_r, int total_g, int total_b, + float iz00, float iz10, float iz11, + uint32_t s, const uint32_t r_sum[17], const uint32_t g_sum[17], const uint32_t b_sum[17]) + { + const float iz01 = iz10; + + const uint32_t f1 = 
g_unique_total_orders4[s][0]; + const uint32_t f2 = g_unique_total_orders4[s][0] + g_unique_total_orders4[s][1]; + const uint32_t f3 = g_unique_total_orders4[s][0] + g_unique_total_orders4[s][1] + g_unique_total_orders4[s][2]; + uint32_t uq00_r = (r_sum[f2] - r_sum[f1]) + (r_sum[f3] - r_sum[f2]) * 2 + (r_sum[16] - r_sum[f3]) * 3; + uint32_t uq00_g = (g_sum[f2] - g_sum[f1]) + (g_sum[f3] - g_sum[f2]) * 2 + (g_sum[16] - g_sum[f3]) * 3; + uint32_t uq00_b = (b_sum[f2] - b_sum[f1]) + (b_sum[f3] - b_sum[f2]) * 2 + (b_sum[16] - b_sum[f3]) * 3; + + float q10_r = (float)(total_r * 3 - uq00_r); + float q10_g = (float)(total_g * 3 - uq00_g); + float q10_b = (float)(total_b * 3 - uq00_b); + + pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; + pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; + + pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; + pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; + + pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; + pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; + } + + static inline bool compute_least_squares_endpoints4_rgb(const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh, int total_r, int total_g, int total_b) + { + uint32_t uq00_r = 0, uq00_g = 0, uq00_b = 0; + uint32_t weight_accum = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint8_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; + const uint8_t sel = pSelectors[i]; + + weight_accum += g_weight_vals4[sel]; + uq00_r += sel * r; + uq00_g += sel * g; + uq00_b += sel * b; + } + + int q10_r = total_r * 3 - uq00_r; + int q10_g = total_g * 3 - uq00_g; + int q10_b = total_b * 3 - uq00_b; + + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + return false; + + det = (3.0f / 255.0f) / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 
* det; + iz11 = z00 * det; + + pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; + pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; + + pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; + pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; + + pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; + pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; + + return true; + } + + static inline void compute_least_squares_endpoints3_rgb( + vec3F* pXl, vec3F* pXh, + int total_r, int total_g, int total_b, + float iz00, float iz10, float iz11, + uint32_t s, const uint32_t r_sum[17], const uint32_t g_sum[17], const uint32_t b_sum[17]) + { + const float iz01 = iz10; + + // Compensates for BC1 3-color ordering, which is selector 0, 2, 1 + const uint32_t f1 = g_unique_total_orders3[s][0]; + const uint32_t f2 = g_unique_total_orders3[s][0] + g_unique_total_orders3[s][2]; + uint32_t uq00_r = (r_sum[16] - r_sum[f2]) * 2 + (r_sum[f2] - r_sum[f1]); + uint32_t uq00_g = (g_sum[16] - g_sum[f2]) * 2 + (g_sum[f2] - g_sum[f1]); + uint32_t uq00_b = (b_sum[16] - b_sum[f2]) * 2 + (b_sum[f2] - b_sum[f1]); + + float q10_r = (float)(total_r * 2 - uq00_r); + float q10_g = (float)(total_g * 2 - uq00_g); + float q10_b = (float)(total_b * 2 - uq00_b); + + pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; + pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; + + pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; + pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; + + pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; + pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; + } + + static inline bool compute_least_squares_endpoints3_rgb(bool use_black, const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh) + { + int uq00_r = 0, uq00_g = 0, uq00_b = 0; + uint32_t weight_accum = 0; + int total_r = 0, total_g = 0, total_b = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint8_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; + if (use_black) + { + if ((r | g | b) < 4) + continue; + } + + const 
uint8_t sel = pSelectors[i]; + assert(sel <= 3); + if (sel == 3) + continue; + + weight_accum += g_weight_vals3[sel]; + + static const uint8_t s_tran[3] = { 0, 2, 1 }; + const uint8_t tsel = s_tran[sel]; + uq00_r += tsel * r; + uq00_g += tsel * g; + uq00_b += tsel * b; + + total_r += r; + total_g += g; + total_b += b; + } + + int q10_r = total_r * 2 - uq00_r; + int q10_g = total_g * 2 - uq00_g; + int q10_b = total_b * 2 - uq00_b; + + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + return false; + + det = (2.0f / 255.0f) / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; + pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; + + pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; + pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; + + pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; + pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; + + return true; + } + + static inline void bc1_get_block_colors4(uint32_t block_r[4], uint32_t block_g[4], uint32_t block_b[4], uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb) + { + block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); + block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); + + if (g_bc1_approx_mode == bc1_approx_mode::cBC1Ideal) + { + block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; + block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; + } + else if (g_bc1_approx_mode == bc1_approx_mode::cBC1IdealRound4) + 
{ + block_r[1] = (block_r[0] * 2 + block_r[3] + 1) / 3; block_g[1] = (block_g[0] * 2 + block_g[3] + 1) / 3; block_b[1] = (block_b[0] * 2 + block_b[3] + 1) / 3; + block_r[2] = (block_r[3] * 2 + block_r[0] + 1) / 3; block_g[2] = (block_g[3] * 2 + block_g[0] + 1) / 3; block_b[2] = (block_b[3] * 2 + block_b[0] + 1) / 3; + } + else if (g_bc1_approx_mode == bc1_approx_mode::cBC1AMD) + { + block_r[1] = interp_5_6_amd(block_r[0], block_r[3]); block_g[1] = interp_5_6_amd(block_g[0], block_g[3]); block_b[1] = interp_5_6_amd(block_b[0], block_b[3]); + block_r[2] = interp_5_6_amd(block_r[3], block_r[0]); block_g[2] = interp_5_6_amd(block_g[3], block_g[0]); block_b[2] = interp_5_6_amd(block_b[3], block_b[0]); + } + else + { + block_r[1] = interp_5_nv(lr, hr); block_g[1] = interp_6_nv(block_g[0], block_g[3]); block_b[1] = interp_5_nv(lb, hb); + block_r[2] = interp_5_nv(hr, lr); block_g[2] = interp_6_nv(block_g[3], block_g[0]); block_b[2] = interp_5_nv(hb, lb); + } + } + + static inline void bc1_get_block_colors3(uint32_t block_r[3], uint32_t block_g[3], uint32_t block_b[3], uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb) + { + block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); + block_r[1] = (hr << 3) | (hr >> 2); block_g[1] = (hg << 2) | (hg >> 4); block_b[1] = (hb << 3) | (hb >> 2); + + if ((g_bc1_approx_mode == bc1_approx_mode::cBC1Ideal) || (g_bc1_approx_mode == bc1_approx_mode::cBC1IdealRound4)) + { + block_r[2] = (block_r[0] + block_r[1]) / 2; block_g[2] = (block_g[0] + block_g[1]) / 2; block_b[2] = (block_b[0] + block_b[1]) / 2; + } + else if (g_bc1_approx_mode == bc1_approx_mode::cBC1AMD) + { + block_r[2] = interp_half_5_6_amd(block_r[0], block_r[1]); block_g[2] = interp_half_5_6_amd(block_g[0], block_g[1]); block_b[2] = interp_half_5_6_amd(block_b[0], block_b[1]); + } + else + { + block_r[2] = interp_half_5_nv(lr, hr); block_g[2] = interp_half_6_nv(block_g[0], block_g[1]); block_b[2] 
= interp_half_5_nv(lb, hb); + } + } + + static inline void bc1_find_sels4_noerr(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], const uint8_t* pForce_selectors) + { + if (pForce_selectors) + { + memcpy(sels, pForce_selectors, 16); + return; + } + + uint32_t block_r[4], block_g[4], block_b[4]; + bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; + + int dots[4]; + for (uint32_t i = 0; i < 4; i++) + dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; + + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; + + ar *= 2; ag *= 2; ab *= 2; + + static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; + + for (uint32_t i = 0; i < 16; i += 4) + { + const int d0 = pSrc_pixels[i + 0].r * ar + pSrc_pixels[i + 0].g * ag + pSrc_pixels[i + 0].b * ab; + const int d1 = pSrc_pixels[i + 1].r * ar + pSrc_pixels[i + 1].g * ag + pSrc_pixels[i + 1].b * ab; + const int d2 = pSrc_pixels[i + 2].r * ar + pSrc_pixels[i + 2].g * ag + pSrc_pixels[i + 2].b * ab; + const int d3 = pSrc_pixels[i + 3].r * ar + pSrc_pixels[i + 3].g * ag + pSrc_pixels[i + 3].b * ab; + + sels[i + 0] = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; + sels[i + 1] = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; + sels[i + 2] = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; + sels[i + 3] = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; + } + } + + static inline uint32_t bc1_find_sels4_fasterr(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) + { + uint32_t block_r[4], block_g[4], block_b[4]; + bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; + + int dots[4]; + for (uint32_t i = 0; i < 4; 
i++) + dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; + + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; + + ar *= 2; ag *= 2; ab *= 2; + + static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; + + uint32_t total_err = 0; + + for (uint32_t i = 0; i < 16; i += 4) + { + const int d0 = pSrc_pixels[i + 0].r * ar + pSrc_pixels[i + 0].g * ag + pSrc_pixels[i + 0].b * ab; + const int d1 = pSrc_pixels[i + 1].r * ar + pSrc_pixels[i + 1].g * ag + pSrc_pixels[i + 1].b * ab; + const int d2 = pSrc_pixels[i + 2].r * ar + pSrc_pixels[i + 2].g * ag + pSrc_pixels[i + 2].b * ab; + const int d3 = pSrc_pixels[i + 3].r * ar + pSrc_pixels[i + 3].g * ag + pSrc_pixels[i + 3].b * ab; + + uint8_t sel0 = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; + uint8_t sel1 = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; + uint8_t sel2 = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; + uint8_t sel3 = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; + + sels[i + 0] = sel0; + sels[i + 1] = sel1; + sels[i + 2] = sel2; + sels[i + 3] = sel3; + + total_err += squarei(pSrc_pixels[i + 0].r - block_r[sel0]) + squarei(pSrc_pixels[i + 0].g - block_g[sel0]) + squarei(pSrc_pixels[i + 0].b - block_b[sel0]); + total_err += squarei(pSrc_pixels[i + 1].r - block_r[sel1]) + squarei(pSrc_pixels[i + 1].g - block_g[sel1]) + squarei(pSrc_pixels[i + 1].b - block_b[sel1]); + total_err += squarei(pSrc_pixels[i + 2].r - block_r[sel2]) + squarei(pSrc_pixels[i + 2].g - block_g[sel2]) + squarei(pSrc_pixels[i + 2].b - block_b[sel2]); + total_err += squarei(pSrc_pixels[i + 3].r - block_r[sel3]) + squarei(pSrc_pixels[i + 3].g - block_g[sel3]) + squarei(pSrc_pixels[i + 3].b - block_b[sel3]); + + if (total_err >= cur_err) + break; + } + + return total_err; + } + + static inline uint32_t bc1_find_sels4_check2_err(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) + { + uint32_t block_r[4], block_g[4], 
block_b[4]; + bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + int dr = block_r[3] - block_r[0], dg = block_g[3] - block_g[0], db = block_b[3] - block_b[0]; + + const float f = 4.0f / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f); + + uint32_t total_err = 0; + + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r; + const int g = pSrc_pixels[i].g; + const int b = pSrc_pixels[i].b; + + int sel = (int)((float)((r - (int)block_r[0]) * dr + (g - (int)block_g[0]) * dg + (b - (int)block_b[0]) * db) * f + .5f); + sel = clampi(sel, 1, 3); + + uint32_t err0 = squarei((int)block_r[sel - 1] - (int)r) + squarei((int)block_g[sel - 1] - (int)g) + squarei((int)block_b[sel - 1] - (int)b); + uint32_t err1 = squarei((int)block_r[sel] - (int)r) + squarei((int)block_g[sel] - (int)g) + squarei((int)block_b[sel] - (int)b); + + int best_sel = sel; + uint32_t best_err = err1; + if (err0 == err1) + { + // Prefer non-interpolation + if ((best_sel - 1) == 0) + best_sel = 0; + } + else if (err0 < best_err) + { + best_sel = sel - 1; + best_err = err0; + } + + total_err += best_err; + + if (total_err >= cur_err) + break; + + sels[i] = (uint8_t)best_sel; + } + return total_err; + } + + static inline uint32_t bc1_find_sels4_fullerr(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) + { + uint32_t block_r[4], block_g[4], block_b[4]; + bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + uint32_t total_err = 0; + + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r; + const int g = pSrc_pixels[i].g; + const int b = pSrc_pixels[i].b; + + uint32_t best_err = squarei((int)block_r[0] - (int)r) + squarei((int)block_g[0] - (int)g) + squarei((int)block_b[0] - (int)b); + uint8_t best_sel = 0; + + for (uint32_t j = 1; (j < 4) && best_err; j++) + { + uint32_t err = squarei((int)block_r[j] - (int)r) + 
squarei((int)block_g[j] - (int)g) + squarei((int)block_b[j] - (int)b); + if ((err < best_err) || ((err == best_err) && (j == 3))) + { + best_err = err; + best_sel = (uint8_t)j; + } + } + + total_err += best_err; + + if (total_err >= cur_err) + break; + + sels[i] = (uint8_t)best_sel; + } + return total_err; + } + + static inline uint32_t bc1_find_sels4(uint32_t flags, const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err, const uint8_t* pForce_selectors) + { + uint32_t err; + + if (pForce_selectors) + { + memcpy(sels, pForce_selectors, 16); + + uint32_t block_r[4], block_g[4], block_b[4]; + bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + err = 0; + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r; + const int g = pSrc_pixels[i].g; + const int b = pSrc_pixels[i].b; + + const uint32_t sel = pForce_selectors[i]; + assert(sel <= 3); + + err += squarei((int)block_r[sel] - (int)r) + squarei((int)block_g[sel] - (int)g) + squarei((int)block_b[sel] - (int)b); + } + } + else + { + if (flags & cEncodeBC1UseFasterMSEEval) + err = bc1_find_sels4_fasterr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err); + else if (flags & cEncodeBC1UseFullMSEEval) + err = bc1_find_sels4_fullerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err); + else + err = bc1_find_sels4_check2_err(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err); + } + + return err; + } + + static inline uint32_t bc1_find_sels3_fullerr(bool use_black, const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err, const uint8_t* pForce_selectors) + { + uint32_t block_r[4], block_g[4], block_b[4]; + bc1_get_block_colors3(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + uint32_t total_err = 0; + + if (pForce_selectors) + { + memcpy(sels, pForce_selectors, 16); + + //uint32_t block_r[4], block_g[4], 
block_b[4]; + //bc1_get_block_colors3(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); + + block_r[3] = 0; block_g[3] = 0; block_b[3] = 0; + + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r; + const int g = pSrc_pixels[i].g; + const int b = pSrc_pixels[i].b; + + const uint32_t sel = pForce_selectors[i]; + assert(sel <= 3); + + total_err += squarei((int)block_r[sel] - (int)r) + squarei((int)block_g[sel] - (int)g) + squarei((int)block_b[sel] - (int)b); + } + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r; + const int g = pSrc_pixels[i].g; + const int b = pSrc_pixels[i].b; + + uint32_t best_err = squarei((int)block_r[0] - (int)r) + squarei((int)block_g[0] - (int)g) + squarei((int)block_b[0] - (int)b); + uint32_t best_sel = 0; + + uint32_t err1 = squarei((int)block_r[1] - (int)r) + squarei((int)block_g[1] - (int)g) + squarei((int)block_b[1] - (int)b); + if (err1 < best_err) + { + best_err = err1; + best_sel = 1; + } + + uint32_t err2 = squarei((int)block_r[2] - (int)r) + squarei((int)block_g[2] - (int)g) + squarei((int)block_b[2] - (int)b); + if (err2 < best_err) + { + best_err = err2; + best_sel = 2; + } + + if (use_black) + { + uint32_t err3 = squarei(r) + squarei(g) + squarei(b); + if (err3 < best_err) + { + best_err = err3; + best_sel = 3; + } + } + + total_err += best_err; + if (total_err >= cur_err) + return total_err; + + sels[i] = (uint8_t)best_sel; + } + } + + return total_err; + } + + static inline void precise_round_565(const vec3F& xl, const vec3F& xh, + int& trial_lr, int& trial_lg, int& trial_lb, + int& trial_hr, int& trial_hg, int& trial_hb) + { + trial_lr = (int)(xl.c[0] * 31.0f); + trial_lg = (int)(xl.c[1] * 63.0f); + trial_lb = (int)(xl.c[2] * 31.0f); + + trial_hr = (int)(xh.c[0] * 31.0f); + trial_hg = (int)(xh.c[1] * 63.0f); + trial_hb = (int)(xh.c[2] * 31.0f); + + if ((uint32_t)(trial_lr | trial_lb | trial_hr | trial_hb) > 31U) + { + trial_lr = ((uint32_t)trial_lr > 31U) ? 
(~trial_lr >> 31) & 31 : trial_lr; + trial_hr = ((uint32_t)trial_hr > 31U) ? (~trial_hr >> 31) & 31 : trial_hr; + + trial_lb = ((uint32_t)trial_lb > 31U) ? (~trial_lb >> 31) & 31 : trial_lb; + trial_hb = ((uint32_t)trial_hb > 31U) ? (~trial_hb >> 31) & 31 : trial_hb; + } + + if ((uint32_t)(trial_lg | trial_hg) > 63U) + { + trial_lg = ((uint32_t)trial_lg > 63U) ? (~trial_lg >> 31) & 63 : trial_lg; + trial_hg = ((uint32_t)trial_hg > 63U) ? (~trial_hg >> 31) & 63 : trial_hg; + } + + trial_lr = (trial_lr + (xl.c[0] > g_midpoint5[trial_lr])) & 31; + trial_lg = (trial_lg + (xl.c[1] > g_midpoint6[trial_lg])) & 63; + trial_lb = (trial_lb + (xl.c[2] > g_midpoint5[trial_lb])) & 31; + + trial_hr = (trial_hr + (xh.c[0] > g_midpoint5[trial_hr])) & 31; + trial_hg = (trial_hg + (xh.c[1] > g_midpoint6[trial_hg])) & 63; + trial_hb = (trial_hb + (xh.c[2] > g_midpoint5[trial_hb])) & 31; + } + + static inline void precise_round_565_noscale(vec3F xl, vec3F xh, + int& trial_lr, int& trial_lg, int& trial_lb, + int& trial_hr, int& trial_hg, int& trial_hb) + { + xl.c[0] *= 1.0f / 255.0f; + xl.c[1] *= 1.0f / 255.0f; + xl.c[2] *= 1.0f / 255.0f; + + xh.c[0] *= 1.0f / 255.0f; + xh.c[1] *= 1.0f / 255.0f; + xh.c[2] *= 1.0f / 255.0f; + + precise_round_565(xl, xh, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb); + } + + static inline void bc1_encode4(bc1_block* pDst_block, int lr, int lg, int lb, int hr, int hg, int hb, const uint8_t sels[16]) + { + uint32_t lc16 = bc1_block::pack_unscaled_color(lr, lg, lb); + uint32_t hc16 = bc1_block::pack_unscaled_color(hr, hg, hb); + + // Always forbid 3 color blocks + if (lc16 == hc16) + { + uint8_t mask = 0; + + // Make l > h + if (hc16 > 0) + hc16--; + else + { + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); + + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } + + assert(lc16 > hc16); + pDst_block->set_low_color(static_cast(lc16)); + pDst_block->set_high_color(static_cast(hc16)); + + pDst_block->m_selectors[0] = mask; + 
pDst_block->m_selectors[1] = mask; + pDst_block->m_selectors[2] = mask; + pDst_block->m_selectors[3] = mask; + } + else + { + uint8_t invert_mask = 0; + if (lc16 < hc16) + { + std::swap(lc16, hc16); + invert_mask = 0x55; + } + + assert(lc16 > hc16); + pDst_block->set_low_color((uint16_t)lc16); + pDst_block->set_high_color((uint16_t)hc16); + + uint32_t packed_sels = 0; + static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); + + pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; + pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; + pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; + pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask; + } + } + + static inline void bc1_encode3(bc1_block* pDst_block, int lr, int lg, int lb, int hr, int hg, int hb, const uint8_t sels[16]) + { + uint32_t lc16 = bc1_block::pack_unscaled_color(lr, lg, lb); + uint32_t hc16 = bc1_block::pack_unscaled_color(hr, hg, hb); + + bool invert_flag = false; + if (lc16 > hc16) + { + std::swap(lc16, hc16); + invert_flag = true; + } + + assert(lc16 <= hc16); + + pDst_block->set_low_color((uint16_t)lc16); + pDst_block->set_high_color((uint16_t)hc16); + + uint32_t packed_sels = 0; + + if (invert_flag) + { + static const uint8_t s_sel_trans_inv[4] = { 1, 0, 2, 3 }; + + for (uint32_t i = 0; i < 16; i++) + packed_sels |= ((uint32_t)s_sel_trans_inv[sels[i]] << (i * 2)); + } + else + { + for (uint32_t i = 0; i < 16; i++) + packed_sels |= ((uint32_t)sels[i] << (i * 2)); + } + + pDst_block->m_selectors[0] = (uint8_t)packed_sels; + pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8); + pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16); + pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24); + } + + struct bc1_encode_results + { + int lr, lg, lb; + int hr, hg, hb; + uint8_t sels[16]; + bool m_3color; + }; + + static bool 
try_3color_block_useblack(const color32* pSrc_pixels, uint32_t flags, uint32_t& cur_err, bc1_encode_results& results, const uint8_t* pForce_selectors) + { + int total_r = 0, total_g = 0, total_b = 0; + int max_r = 0, max_g = 0, max_b = 0; + int min_r = 255, min_g = 255, min_b = 255; + int total_pixels = 0; + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + if ((r | g | b) < 4) + continue; + + max_r = std::max(max_r, r); max_g = std::max(max_g, g); max_b = std::max(max_b, b); + min_r = std::min(min_r, r); min_g = std::min(min_g, g); min_b = std::min(min_b, b); + total_r += r; total_g += g; total_b += b; + + total_pixels++; + } + + if (!total_pixels) + return false; + + int half_total_pixels = total_pixels >> 1; + int avg_r = (total_r + half_total_pixels) / total_pixels; + int avg_g = (total_g + half_total_pixels) / total_pixels; + int avg_b = (total_b + half_total_pixels) / total_pixels; + + uint32_t low_c = 0, high_c = 0; + + int icov[6] = { 0, 0, 0, 0, 0, 0 }; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r; + int g = (int)pSrc_pixels[i].g; + int b = (int)pSrc_pixels[i].b; + + if ((r | g | b) < 4) + continue; + + r -= avg_r; + g -= avg_g; + b -= avg_b; + + icov[0] += r * r; + icov[1] += r * g; + icov[2] += r * b; + icov[3] += g * g; + icov[4] += g * b; + icov[5] += b * b; + } + + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = (float)(icov[i]) * (1.0f / 255.0f); + + float xr = (float)(max_r - min_r); + float xg = (float)(max_g - min_g); + float xb = (float)(max_b - min_b); + + if (icov[2] < 0) + xr = -xr; + + if (icov[4] < 0) + xg = -xg; + + for (uint32_t power_iter = 0; power_iter < 4; power_iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + xr = r; xg = g; xb = b; + } + + float k = maximum(fabsf(xr), fabsf(xg), fabsf(xb)); + int saxis_r = 
306, saxis_g = 601, saxis_b = 117; + if (k >= 2) + { + float m = 1024.0f / k; + saxis_r = (int)(xr * m); + saxis_g = (int)(xg * m); + saxis_b = (int)(xb * m); + } + + int low_dot = INT_MAX, high_dot = INT_MIN; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r, g = (int)pSrc_pixels[i].g, b = (int)pSrc_pixels[i].b; + + if ((r | g | b) < 4) + continue; + + int dot = r * saxis_r + g * saxis_g + b * saxis_b; + if (dot < low_dot) + { + low_dot = dot; + low_c = i; + } + if (dot > high_dot) + { + high_dot = dot; + high_c = i; + } + } + + int lr = to_5(pSrc_pixels[low_c].r); + int lg = to_6(pSrc_pixels[low_c].g); + int lb = to_5(pSrc_pixels[low_c].b); + + int hr = to_5(pSrc_pixels[high_c].r); + int hg = to_6(pSrc_pixels[high_c].g); + int hb = to_5(pSrc_pixels[high_c].b); + + uint8_t trial_sels[16]; + uint32_t trial_err = bc1_find_sels3_fullerr(true, pSrc_pixels, lr, lg, lb, hr, hg, hb, trial_sels, UINT32_MAX, pForce_selectors); + + if (trial_err) + { + const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 
2 : 1; + for (uint32_t trials = 0; trials < total_ls_passes; trials++) + { + vec3F xl, xh; + int lr2, lg2, lb2, hr2, hg2, hb2; + if (!compute_least_squares_endpoints3_rgb(true, pSrc_pixels, trial_sels, &xl, &xh)) + { + lr2 = g_bc1_match5_half[avg_r].m_hi; + lg2 = g_bc1_match6_half[avg_g].m_hi; + lb2 = g_bc1_match5_half[avg_b].m_hi; + + hr2 = g_bc1_match5_half[avg_r].m_lo; + hg2 = g_bc1_match6_half[avg_g].m_lo; + hb2 = g_bc1_match5_half[avg_b].m_lo; + } + else + { + precise_round_565(xl, xh, hr2, hg2, hb2, lr2, lg2, lb2); + } + + if ((lr == lr2) && (lg == lg2) && (lb == lb2) && (hr == hr2) && (hg == hg2) && (hb == hb2)) + break; + + uint8_t trial_sels2[16]; + uint32_t trial_err2 = bc1_find_sels3_fullerr(true, pSrc_pixels, lr2, lg2, lb2, hr2, hg2, hb2, trial_sels2, trial_err, pForce_selectors); + + if (trial_err2 < trial_err) + { + trial_err = trial_err2; + lr = lr2; lg = lg2; lb = lb2; + hr = hr2; hg = hg2; hb = hb2; + memcpy(trial_sels, trial_sels2, sizeof(trial_sels)); + } + else + break; + } + } + + if (trial_err < cur_err) + { + results.m_3color = true; + results.lr = lr; + results.lg = lg; + results.lb = lb; + results.hr = hr; + results.hg = hg; + results.hb = hb; + memcpy(results.sels, trial_sels, 16); + + cur_err = trial_err; + + return true; + } + + return false; + } + + static bool try_3color_block(const color32* pSrc_pixels, uint32_t flags, uint32_t& cur_err, + int avg_r, int avg_g, int avg_b, int lr, int lg, int lb, int hr, int hg, int hb, int total_r, int total_g, int total_b, uint32_t total_orderings_to_try, + bc1_encode_results& results, const uint8_t* pForce_selectors) + { + if (pForce_selectors) + { + for (uint32_t i = 0; i < 16; i++) + if (pForce_selectors[i] == 3) + return false; + } + + uint8_t trial_sels[16]; + uint32_t trial_err = bc1_find_sels3_fullerr(false, pSrc_pixels, lr, lg, lb, hr, hg, hb, trial_sels, UINT32_MAX, pForce_selectors); + + if (trial_err) + { + const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 
2 : 1; + for (uint32_t trials = 0; trials < total_ls_passes; trials++) + { + vec3F xl, xh; + int lr2, lg2, lb2, hr2, hg2, hb2; + if (!compute_least_squares_endpoints3_rgb(false, pSrc_pixels, trial_sels, &xl, &xh)) + { + lr2 = g_bc1_match5_half[avg_r].m_hi; + lg2 = g_bc1_match6_half[avg_g].m_hi; + lb2 = g_bc1_match5_half[avg_b].m_hi; + + hr2 = g_bc1_match5_half[avg_r].m_lo; + hg2 = g_bc1_match6_half[avg_g].m_lo; + hb2 = g_bc1_match5_half[avg_b].m_lo; + } + else + { + precise_round_565(xl, xh, hr2, hg2, hb2, lr2, lg2, lb2); + } + + if ((lr == lr2) && (lg == lg2) && (lb == lb2) && (hr == hr2) && (hg == hg2) && (hb == hb2)) + break; + + uint8_t trial_sels2[16]; + uint32_t trial_err2 = bc1_find_sels3_fullerr(false, pSrc_pixels, lr2, lg2, lb2, hr2, hg2, hb2, trial_sels2, trial_err, pForce_selectors); + + if (trial_err2 < trial_err) + { + trial_err = trial_err2; + lr = lr2; lg = lg2; lb = lb2; + hr = hr2; hg = hg2; hb = hb2; + memcpy(trial_sels, trial_sels2, sizeof(trial_sels)); + } + else + break; + } + } + + if ((trial_err) && (flags & cEncodeBC1UseLikelyTotalOrderings) && (total_orderings_to_try)) + { + hist3 h; + for (uint32_t i = 0; i < 16; i++) + { + assert(trial_sels[i] < 3); + h.m_hist[trial_sels[i]]++; + } + + const uint32_t orig_total_order_index = h.lookup_total_ordering_index(); + + int r0, g0, b0, r3, g3, b3; + r0 = (lr << 3) | (lr >> 2); g0 = (lg << 2) | (lg >> 4); b0 = (lb << 3) | (lb >> 2); + r3 = (hr << 3) | (hr >> 2); g3 = (hg << 2) | (hg >> 4); b3 = (hb << 3) | (hb >> 2); + + int ar = r3 - r0, ag = g3 - g0, ab = b3 - b0; + + int dots[16]; + for (uint32_t i = 0; i < 16; i++) + { + int r = pSrc_pixels[i].r; + int g = pSrc_pixels[i].g; + int b = pSrc_pixels[i].b; + int d = 0x1000000 + (r * ar + g * ag + b * ab); + assert(d >= 0); + dots[i] = (d << 4) + i; + } + + std::sort(dots, dots + 16); + + uint32_t r_sum[17], g_sum[17], b_sum[17]; + uint32_t r = 0, g = 0, b = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t p = dots[i] & 15; + + r_sum[i] = 
r; + g_sum[i] = g; + b_sum[i] = b; + + r += pSrc_pixels[p].r; + g += pSrc_pixels[p].g; + b += pSrc_pixels[p].b; + } + + r_sum[16] = total_r; + g_sum[16] = total_g; + b_sum[16] = total_b; + + const uint32_t q_total = (flags & cEncodeBC1Exhaustive) ? NUM_UNIQUE_TOTAL_ORDERINGS3 : std::min(total_orderings_to_try, MAX_TOTAL_ORDERINGS3); + for (uint32_t q = 0; q < q_total; q++) + { + const uint32_t s = (flags & cEncodeBC1Exhaustive) ? q : g_best_total_orderings3[orig_total_order_index][q]; + + int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; + + vec3F xl, xh; + + if ((s == TOTAL_ORDER_3_0_16) || (s == TOTAL_ORDER_3_1_16) || (s == TOTAL_ORDER_3_2_16)) + { + trial_lr = g_bc1_match5_half[avg_r].m_hi; + trial_lg = g_bc1_match6_half[avg_g].m_hi; + trial_lb = g_bc1_match5_half[avg_b].m_hi; + + trial_hr = g_bc1_match5_half[avg_r].m_lo; + trial_hg = g_bc1_match6_half[avg_g].m_lo; + trial_hb = g_bc1_match5_half[avg_b].m_lo; + } + else + { + compute_least_squares_endpoints3_rgb(&xl, &xh, total_r, total_g, total_b, + g_selector_factors3[s][0], g_selector_factors3[s][1], g_selector_factors3[s][2], s, r_sum, g_sum, b_sum); + + precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); + } + + uint8_t trial_sels2[16]; + uint32_t trial_err2 = bc1_find_sels3_fullerr(false, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels2, UINT32_MAX, pForce_selectors); + + if (trial_err2 < trial_err) + { + trial_err = trial_err2; + + lr = trial_lr; + lg = trial_lg; + lb = trial_lb; + + hr = trial_hr; + hg = trial_hg; + hb = trial_hb; + + memcpy(trial_sels, trial_sels2, sizeof(trial_sels)); + } + + } // s + } + + if (trial_err < cur_err) + { + results.m_3color = true; + results.lr = lr; + results.lg = lg; + results.lb = lb; + results.hr = hr; + results.hg = hg; + results.hb = hb; + memcpy(results.sels, trial_sels, 16); + + cur_err = trial_err; + + return true; + } + + return false; + } + + void encode_bc1(uint32_t level, 
void* pDst, const uint8_t* pPixels, bool allow_3color, bool allow_transparent_texels_for_black, const uint8_t* pForce_selectors) + { + uint32_t flags = 0, total_orderings4 = 1, total_orderings3 = 1; + + static_assert(MAX_TOTAL_ORDERINGS3 >= 32, "MAX_TOTAL_ORDERINGS3 >= 32"); + static_assert(MAX_TOTAL_ORDERINGS4 >= 32, "MAX_TOTAL_ORDERINGS4 >= 32"); + + switch (level) + { + case 0: + // Faster/higher quality than stb_dxt default. + flags = cEncodeBC1BoundingBoxInt; + break; + case 1: + // Faster/higher quality than stb_dxt default. A bit higher average quality vs. mode 0. + flags = cEncodeBC1Use2DLS; + break; + case 2: + // On average mode 2 is a little weaker than modes 0/1, but it's stronger on outliers (very tough textures). + // Slightly stronger than stb_dxt. + flags = 0; + break; + case 3: + // Slightly stronger than stb_dxt HIGHQUAL. + flags = cEncodeBC1TwoLeastSquaresPasses; + break; + case 4: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1Use6PowerIters; + break; + default: + case 5: + // stb_dxt HIGHQUAL + permit 3 color (if it's enabled). + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + break; + case 6: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + break; + case 7: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 4; + break; + case 8: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 8; + break; + case 9: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 11; + total_orderings3 = 3; + break; + case 10: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 20; + total_orderings3 = 8; + break; + case 11: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 28; + total_orderings3 = 16; + break; + case 12: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 32; + total_orderings3 = 32; + break; + case 13: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (20 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 32; + total_orderings3 = 32; + break; + case 14: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (32 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 32; + total_orderings3 = 32; + break; + case 15: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (32 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = ((((32 + MAX_TOTAL_ORDERINGS4) / 2) + 32) / 2); + total_orderings3 = 32; + break; + case 16: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = (32 + MAX_TOTAL_ORDERINGS4) / 2; + total_orderings3 = 32; + break; + case 17: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = MAX_TOTAL_ORDERINGS4; + total_orderings3 = 32; + break; + case 18: + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | cEncodeBC1Iterative | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = MAX_TOTAL_ORDERINGS4; + total_orderings3 = 32; + break; + case 19: + // This hidden mode is *extremely* slow and abuses the encoder. It's just for testing/training. + flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | cEncodeBC1Exhaustive | cEncodeBC1Iterative | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; + flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); + total_orderings4 = 32; + total_orderings3 = 32; + break; + } + + encode_bc1(pDst, pPixels, flags, total_orderings4, total_orderings3, pForce_selectors); + } + + static inline void encode_bc1_pick_initial(const color32* pSrc_pixels, uint32_t flags, bool grayscale_flag, + int min_r, int min_g, int min_b, int max_r, int max_g, int max_b, + int avg_r, int avg_g, int avg_b, int total_r, int total_g, int total_b, + int& lr, int& lg, int& lb, int& hr, int& hg, int& hb) + { + if (grayscale_flag) + { + const int fr = pSrc_pixels[0].r; + + // Grayscale blocks are a common enough case to specialize. 
+ if ((max_r - min_r) < 2) + { + lr = lb = hr = hb = to_5(fr); + lg = hg = to_6(fr); + } + else + { + lr = lb = to_5(min_r); + lg = to_6(min_r); + + hr = hb = to_5(max_r); + hg = to_6(max_r); + } + } + else if (flags & cEncodeBC1Use2DLS) + { + // 2D Least Squares approach from Humus's example, with added inset and optimal rounding. + int big_chan = 0, min_chan_val = min_r, max_chan_val = max_r; + if ((max_g - min_g) > (max_chan_val - min_chan_val)) + big_chan = 1, min_chan_val = min_g, max_chan_val = max_g; + + if ((max_b - min_b) > (max_chan_val - min_chan_val)) + big_chan = 2, min_chan_val = min_b, max_chan_val = max_b; + + int sum_xy_r = 0, sum_xy_g = 0, sum_xy_b = 0; + vec3F l, h; + if (big_chan == 0) + { + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + sum_xy_r += r * r, sum_xy_g += r * g, sum_xy_b += r * b; + } + + int sum_x = total_r; + int sum_x2 = sum_xy_r; + + float div = (float)(16 * sum_x2 - sum_x * sum_x); + float b_y = 0.0f, b_z = 0.0f; + if (fabs(div) > 1e-8f) + { + div = 1.0f / div; + b_y = (16 * sum_xy_g - sum_x * total_g) * div; + b_z = (16 * sum_xy_b - sum_x * total_b) * div; + } + + float a_y = (total_g - b_y * sum_x) / 16.0f; + float a_z = (total_b - b_z * sum_x) / 16.0f; + + l.c[1] = a_y + b_y * min_chan_val; + l.c[2] = a_z + b_z * min_chan_val; + + h.c[1] = a_y + b_y * max_chan_val; + h.c[2] = a_z + b_z * max_chan_val; + + float dg = (h.c[1] - l.c[1]); + float db = (h.c[2] - l.c[2]); + + h.c[1] = l.c[1] + dg * (15.0f / 16.0f); + h.c[2] = l.c[2] + db * (15.0f / 16.0f); + + l.c[1] = l.c[1] + dg * (1.0f / 16.0f); + l.c[2] = l.c[2] + db * (1.0f / 16.0f); + + float d = (float)(max_chan_val - min_chan_val); + float fmin_chan_val = min_chan_val + d * (1.0f / 16.0f); + float fmax_chan_val = min_chan_val + d * (15.0f / 16.0f); + + l.c[0] = fmin_chan_val; + h.c[0] = fmax_chan_val; + } + else if (big_chan == 1) + { + for (uint32_t i = 0; i < 16; i++) + { + const int r = 
pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + sum_xy_r += g * r, sum_xy_g += g * g, sum_xy_b += g * b; + } + + int sum_x = total_g; + int sum_x2 = sum_xy_g; + + float div = (float)(16 * sum_x2 - sum_x * sum_x); + float b_x = 0.0f, b_z = 0.0f; + if (fabs(div) > 1e-8f) + { + div = 1.0f / div; + b_x = (16 * sum_xy_r - sum_x * total_r) * div; + b_z = (16 * sum_xy_b - sum_x * total_b) * div; + } + + float a_x = (total_r - b_x * sum_x) / 16.0f; + float a_z = (total_b - b_z * sum_x) / 16.0f; + + l.c[0] = a_x + b_x * min_chan_val; + l.c[2] = a_z + b_z * min_chan_val; + + h.c[0] = a_x + b_x * max_chan_val; + h.c[2] = a_z + b_z * max_chan_val; + + float dr = (h.c[0] - l.c[0]); + float db = (h.c[2] - l.c[2]); + + h.c[0] = l.c[0] + dr * (15.0f / 16.0f); + h.c[2] = l.c[2] + db * (15.0f / 16.0f); + + l.c[0] = l.c[0] + dr * (1.0f / 16.0f); + l.c[2] = l.c[2] + db * (1.0f / 16.0f); + + float d = (float)(max_chan_val - min_chan_val); + float fmin_chan_val = min_chan_val + d * (1.0f / 16.0f); + float fmax_chan_val = min_chan_val + d * (15.0f / 16.0f); + + l.c[1] = fmin_chan_val; + h.c[1] = fmax_chan_val; + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + sum_xy_r += b * r, sum_xy_g += b * g, sum_xy_b += b * b; + } + + int sum_x = total_b; + int sum_x2 = sum_xy_b; + + float div = (float)(16 * sum_x2 - sum_x * sum_x); + float b_x = 0.0f, b_y = 0.0f; + if (fabs(div) > 1e-8f) + { + div = 1.0f / div; + b_x = (16 * sum_xy_r - sum_x * total_r) * div; + b_y = (16 * sum_xy_g - sum_x * total_g) * div; + } + + float a_x = (total_r - b_x * sum_x) / 16.0f; + float a_y = (total_g - b_y * sum_x) / 16.0f; + + l.c[0] = a_x + b_x * min_chan_val; + l.c[1] = a_y + b_y * min_chan_val; + + h.c[0] = a_x + b_x * max_chan_val; + h.c[1] = a_y + b_y * max_chan_val; + + float dr = (h.c[0] - l.c[0]); + float dg = (h.c[1] - l.c[1]); + + h.c[0] = l.c[0] + dr * (15.0f / 16.0f); + h.c[1] = l.c[1] + dg * (15.0f / 
16.0f); + + l.c[0] = l.c[0] + dr * (1.0f / 16.0f); + l.c[1] = l.c[1] + dg * (1.0f / 16.0f); + + float d = (float)(max_chan_val - min_chan_val); + float fmin_chan_val = min_chan_val + d * (1.0f / 16.0f); + float fmax_chan_val = min_chan_val + d * (15.0f / 16.0f); + + l.c[2] = fmin_chan_val; + h.c[2] = fmax_chan_val; + } + + precise_round_565_noscale(l, h, lr, lg, lb, hr, hg, hb); + } + else if (flags & cEncodeBC1BoundingBox) + { + // Algorithm from icbc.h compress_dxt1_fast() + vec3F l, h; + l.c[0] = min_r * (1.0f / 255.0f); + l.c[1] = min_g * (1.0f / 255.0f); + l.c[2] = min_b * (1.0f / 255.0f); + + h.c[0] = max_r * (1.0f / 255.0f); + h.c[1] = max_g * (1.0f / 255.0f); + h.c[2] = max_b * (1.0f / 255.0f); + + const float bias = 8.0f / 255.0f; + float inset_r = (h.c[0] - l.c[0] - bias) * (1.0f / 16.0f); + float inset_g = (h.c[1] - l.c[1] - bias) * (1.0f / 16.0f); + float inset_b = (h.c[2] - l.c[2] - bias) * (1.0f / 16.0f); + + l.c[0] = clampf(l.c[0] + inset_r, 0.0f, 1.0f); + l.c[1] = clampf(l.c[1] + inset_g, 0.0f, 1.0f); + l.c[2] = clampf(l.c[2] + inset_b, 0.0f, 1.0f); + + h.c[0] = clampf(h.c[0] - inset_r, 0.0f, 1.0f); + h.c[1] = clampf(h.c[1] - inset_g, 0.0f, 1.0f); + h.c[2] = clampf(h.c[2] - inset_b, 0.0f, 1.0f); + + int icov_xz = 0, icov_yz = 0; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r - avg_r; + int g = (int)pSrc_pixels[i].g - avg_g; + int b = (int)pSrc_pixels[i].b - avg_b; + icov_xz += r * b; + icov_yz += g * b; + } + + if (icov_xz < 0) + std::swap(l.c[0], h.c[0]); + + if (icov_yz < 0) + std::swap(l.c[1], h.c[1]); + + precise_round_565(l, h, lr, lg, lb, hr, hg, hb); + } + else if (flags & cEncodeBC1BoundingBoxInt) + { + // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer. 
+ int inset_r = (max_r - min_r - 8) >> 4; + int inset_g = (max_g - min_g - 8) >> 4; + int inset_b = (max_b - min_b - 8) >> 4; + + min_r += inset_r; + min_g += inset_g; + min_b += inset_b; + if ((uint32_t)(min_r | min_g | min_b) > 255U) + { + min_r = clampi(min_r, 0, 255); + min_g = clampi(min_g, 0, 255); + min_b = clampi(min_b, 0, 255); + } + + max_r -= inset_r; + max_g -= inset_g; + max_b -= inset_b; + if ((uint32_t)(max_r | max_g | max_b) > 255U) + { + max_r = clampi(max_r, 0, 255); + max_g = clampi(max_g, 0, 255); + max_b = clampi(max_b, 0, 255); + } + + int icov_xz = 0, icov_yz = 0; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r - avg_r; + int g = (int)pSrc_pixels[i].g - avg_g; + int b = (int)pSrc_pixels[i].b - avg_b; + icov_xz += r * b; + icov_yz += g * b; + } + + int x0 = min_r; + int y0 = min_g; + int x1 = max_r; + int y1 = max_g; + + if (icov_xz < 0) + std::swap(x0, x1); + + if (icov_yz < 0) + std::swap(y0, y1); + + lr = to_5(x0); + lg = to_6(y0); + lb = to_5(min_b); + + hr = to_5(x1); + hg = to_6(y1); + hb = to_5(max_b); + } + else + { + // Select 2 colors along the principle axis. (There must be a faster/simpler way.) + uint32_t low_c = 0, high_c = 0; + + int icov[6] = { 0, 0, 0, 0, 0, 0 }; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r - avg_r; + int g = (int)pSrc_pixels[i].g - avg_g; + int b = (int)pSrc_pixels[i].b - avg_b; + icov[0] += r * r; + icov[1] += r * g; + icov[2] += r * b; + icov[3] += g * g; + icov[4] += g * b; + icov[5] += b * b; + } + + int saxis_r = 306, saxis_g = 601, saxis_b = 117; + + float xr = (float)(max_r - min_r); + float xg = (float)(max_g - min_g); + float xb = (float)(max_b - min_b); + + if (icov[2] < 0) + xr = -xr; + + if (icov[4] < 0) + xg = -xg; + + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = (float)(icov[i]) * (1.0f / 255.0f); + + const uint32_t total_power_iters = (flags & cEncodeBC1Use6PowerIters) ? 
6 : 4; + for (uint32_t power_iter = 0; power_iter < total_power_iters; power_iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + xr = r; xg = g; xb = b; + } + + float k = maximum(fabsf(xr), fabsf(xg), fabsf(xb)); + if (k >= 2) + { + float m = 2048.0f / k; + saxis_r = (int)(xr * m); + saxis_g = (int)(xg * m); + saxis_b = (int)(xb * m); + } + + int low_dot = INT_MAX, high_dot = INT_MIN; + + saxis_r = (int)((uint32_t)saxis_r << 4U); + saxis_g = (int)((uint32_t)saxis_g << 4U); + saxis_b = (int)((uint32_t)saxis_b << 4U); + + for (uint32_t i = 0; i < 16; i += 4) + { + int dot0 = ((pSrc_pixels[i].r * saxis_r + pSrc_pixels[i].g * saxis_g + pSrc_pixels[i].b * saxis_b) & ~0xF) + i; + int dot1 = ((pSrc_pixels[i + 1].r * saxis_r + pSrc_pixels[i + 1].g * saxis_g + pSrc_pixels[i + 1].b * saxis_b) & ~0xF) + i + 1; + int dot2 = ((pSrc_pixels[i + 2].r * saxis_r + pSrc_pixels[i + 2].g * saxis_g + pSrc_pixels[i + 2].b * saxis_b) & ~0xF) + i + 2; + int dot3 = ((pSrc_pixels[i + 3].r * saxis_r + pSrc_pixels[i + 3].g * saxis_g + pSrc_pixels[i + 3].b * saxis_b) & ~0xF) + i + 3; + + int min_d01 = std::min(dot0, dot1); + int max_d01 = std::max(dot0, dot1); + + int min_d23 = std::min(dot2, dot3); + int max_d23 = std::max(dot2, dot3); + + int min_d = std::min(min_d01, min_d23); + int max_d = std::max(max_d01, max_d23); + + low_dot = std::min(low_dot, min_d); + high_dot = std::max(high_dot, max_d); + } + low_c = low_dot & 15; + high_c = high_dot & 15; + + lr = to_5(pSrc_pixels[low_c].r); + lg = to_6(pSrc_pixels[low_c].g); + lb = to_5(pSrc_pixels[low_c].b); + + hr = to_5(pSrc_pixels[high_c].r); + hg = to_6(pSrc_pixels[high_c].g); + hb = to_5(pSrc_pixels[high_c].b); + } + } + + static const int8_t s_adjacent_voxels[16][4] = + { + { 1,0,0, 3 }, // 0 + { 0,1,0, 4 }, // 1 + { 0,0,1, 5 }, // 2 + { -1,0,0, 0 }, // 3 + { 0,-1,0, 1 }, // 4 + { 0,0,-1, 2 }, // 5 + { 1,1,0, 9 }, // 6 + 
{ 1,0,1, 10 }, // 7 + { 0,1,1, 11 }, // 8 + { -1,-1,0, 6 }, // 9 + { -1,0,-1, 7 }, // 10 + { 0,-1,-1, 8 }, // 11 + { -1,1,0, 13 }, // 12 + { 1,-1,0, 12 }, // 13 + { 0,-1,1, 15 }, // 14 + { 0,1,-1, 14 }, // 15 + }; + + // From icbc's high quality mode. + static inline void encode_bc1_endpoint_search(const color32* pSrc_pixels, bool any_black_pixels, + uint32_t flags, bc1_encode_results& results, uint32_t cur_err, const uint8_t* pForce_selectors) + { + int& lr = results.lr, & lg = results.lg, & lb = results.lb, & hr = results.hr, & hg = results.hg, & hb = results.hb; + uint8_t* sels = results.sels; + + int prev_improvement_index = 0, forbidden_direction = -1; + + const int endpoint_search_rounds = (flags & cEncodeBC1EndpointSearchRoundsMask) >> cEncodeBC1EndpointSearchRoundsShift; + for (int i = 0; i < endpoint_search_rounds; i++) + { + assert(s_adjacent_voxels[s_adjacent_voxels[i & 15][3]][3] == (i & 15)); + + if (forbidden_direction == (i & 31)) + continue; + + const int8_t delta[3] = { s_adjacent_voxels[i & 15][0], s_adjacent_voxels[i & 15][1], s_adjacent_voxels[i & 15][2] }; + + int trial_lr = lr, trial_lg = lg, trial_lb = lb, trial_hr = hr, trial_hg = hg, trial_hb = hb; + + if ((i >> 4) & 1) + { + trial_lr = clampi(trial_lr + delta[0], 0, 31); + trial_lg = clampi(trial_lg + delta[1], 0, 63); + trial_lb = clampi(trial_lb + delta[2], 0, 31); + } + else + { + trial_hr = clampi(trial_hr + delta[0], 0, 31); + trial_hg = clampi(trial_hg + delta[1], 0, 63); + trial_hb = clampi(trial_hb + delta[2], 0, 31); + } + + uint8_t trial_sels[16]; + + uint32_t trial_err; + if (results.m_3color) + { + trial_err = bc1_find_sels3_fullerr( + ((any_black_pixels) && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)), + pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err, pForce_selectors); + } + else + { + trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err, 
pForce_selectors); + } + + if (trial_err < cur_err) + { + cur_err = trial_err; + + forbidden_direction = s_adjacent_voxels[i & 15][3] | (i & 16); + + lr = trial_lr, lg = trial_lg, lb = trial_lb, hr = trial_hr, hg = trial_hg, hb = trial_hb; + + memcpy(sels, trial_sels, 16); + + prev_improvement_index = i; + } + + if (i - prev_improvement_index > 32) + break; + } + } + + void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try, uint32_t total_orderings_to_try3, const uint8_t* pForce_selectors) + { + assert(g_initialized); + + const color32* pSrc_pixels = (const color32*)pPixels; + bc1_block* pDst_block = static_cast(pDst); + + int avg_r, avg_g, avg_b, min_r, min_g, min_b, max_r, max_g, max_b; + + const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; + + uint32_t j; + for (j = 15; j >= 1; --j) + if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) + break; + + if (j == 0) + { + encode_bc1_solid_block(pDst, fr, fg, fb, (flags & (cEncodeBC1Use3ColorBlocks | cEncodeBC1Use3ColorBlocksForBlackPixels)) != 0); + return; + } + + int total_r = fr, total_g = fg, total_b = fb; + + max_r = fr, max_g = fg, max_b = fb; + min_r = fr, min_g = fg, min_b = fb; + + uint32_t grayscale_flag = (fr == fg) && (fr == fb); + uint32_t any_black_pixels = (fr | fg | fb) < 4; + + for (uint32_t i = 1; i < 16; i++) + { + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + + grayscale_flag &= ((r == g) && (r == b)); + any_black_pixels |= ((r | g | b) < 4); + + max_r = std::max(max_r, r); max_g = std::max(max_g, g); max_b = std::max(max_b, b); + min_r = std::min(min_r, r); min_g = std::min(min_g, g); min_b = std::min(min_b, b); + total_r += r; total_g += g; total_b += b; + } + + avg_r = (total_r + 8) >> 4, avg_g = (total_g + 8) >> 4, avg_b = (total_b + 8) >> 4; + + bc1_encode_results results; + results.m_3color = false; + + uint8_t* sels = results.sels; + int& lr = 
results.lr, & lg = results.lg, & lb = results.lb, & hr = results.hr, & hg = results.hg, & hb = results.hb; + int orig_lr = 0, orig_lg = 0, orig_lb = 0, orig_hr = 0, orig_hg = 0, orig_hb = 0; + + lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; + + const bool needs_block_error = ((flags & (cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use3ColorBlocks | cEncodeBC1UseFullMSEEval | cEncodeBC1EndpointSearchRoundsMask)) != 0) || + (any_black_pixels && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)); + + uint32_t cur_err = UINT32_MAX; + + if (!needs_block_error) + { + assert((flags & cEncodeBC1TryAllInitialEndponts) == 0); + + encode_bc1_pick_initial(pSrc_pixels, flags, grayscale_flag != 0, + min_r, min_g, min_b, max_r, max_g, max_b, + avg_r, avg_g, avg_b, total_r, total_g, total_b, + lr, lg, lb, hr, hg, hb); + + orig_lr = lr, orig_lg = lg, orig_lb = lb, orig_hr = hr, orig_hg = hg, orig_hb = hb; + + bc1_find_sels4_noerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, pForce_selectors); + + const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1; + for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) + { + int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; + + vec3F xl, xh; + if (!compute_least_squares_endpoints4_rgb(pSrc_pixels, sels, &xl, &xh, total_r, total_g, total_b)) + { + // All selectors equal - treat it as a solid block which should always be equal or better. + trial_lr = g_bc1_match5_equals_1[avg_r].m_hi; + trial_lg = g_bc1_match6_equals_1[avg_g].m_hi; + trial_lb = g_bc1_match5_equals_1[avg_b].m_hi; + + trial_hr = g_bc1_match5_equals_1[avg_r].m_lo; + trial_hg = g_bc1_match6_equals_1[avg_g].m_lo; + trial_hb = g_bc1_match5_equals_1[avg_b].m_lo; + + // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. 
+ } + else + { + precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); + } + + if ((lr == trial_lr) && (lg == trial_lg) && (lb == trial_lb) && (hr == trial_hr) && (hg == trial_hg) && (hb == trial_hb)) + break; + + bc1_find_sels4_noerr(pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, sels, pForce_selectors); + + lr = trial_lr; + lg = trial_lg; + lb = trial_lb; + hr = trial_hr; + hg = trial_hg; + hb = trial_hb; + + } // ls_pass + } + else + { + const uint32_t total_rounds = (flags & cEncodeBC1TryAllInitialEndponts) ? 2 : 1; + for (uint32_t round = 0; round < total_rounds; round++) + { + uint32_t modified_flags = flags; + if (round == 1) + { + modified_flags &= ~(cEncodeBC1Use2DLS | cEncodeBC1BoundingBox); + modified_flags |= cEncodeBC1BoundingBox; + } + + int round_lr, round_lg, round_lb, round_hr, round_hg, round_hb; + uint8_t round_sels[16]; + + encode_bc1_pick_initial(pSrc_pixels, modified_flags, grayscale_flag != 0, + min_r, min_g, min_b, max_r, max_g, max_b, + avg_r, avg_g, avg_b, total_r, total_g, total_b, + round_lr, round_lg, round_lb, round_hr, round_hg, round_hb); + + int orig_round_lr = round_lr, orig_round_lg = round_lg, orig_round_lb = round_lb, orig_round_hr = round_hr, orig_round_hg = round_hg, orig_round_hb = round_hb; + + uint32_t round_err = bc1_find_sels4(flags, pSrc_pixels, round_lr, round_lg, round_lb, round_hr, round_hg, round_hb, round_sels, UINT32_MAX, pForce_selectors); + + const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1; + for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) + { + int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; + + vec3F xl, xh; + if (!compute_least_squares_endpoints4_rgb(pSrc_pixels, round_sels, &xl, &xh, total_r, total_g, total_b)) + { + // All selectors equal - treat it as a solid block which should always be equal or better. 
+ trial_lr = g_bc1_match5_equals_1[avg_r].m_hi; + trial_lg = g_bc1_match6_equals_1[avg_g].m_hi; + trial_lb = g_bc1_match5_equals_1[avg_b].m_hi; + + trial_hr = g_bc1_match5_equals_1[avg_r].m_lo; + trial_hg = g_bc1_match6_equals_1[avg_g].m_lo; + trial_hb = g_bc1_match5_equals_1[avg_b].m_lo; + + // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. + } + else + { + precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); + } + + if ((round_lr == trial_lr) && (round_lg == trial_lg) && (round_lb == trial_lb) && (round_hr == trial_hr) && (round_hg == trial_hg) && (round_hb == trial_hb)) + break; + + uint8_t trial_sels[16]; + uint32_t trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, round_err, pForce_selectors); + + if (trial_err < round_err) + { + round_lr = trial_lr; + round_lg = trial_lg; + round_lb = trial_lb; + + round_hr = trial_hr; + round_hg = trial_hg; + round_hb = trial_hb; + + round_err = trial_err; + memcpy(round_sels, trial_sels, 16); + } + else + break; + + } // ls_pass + + if (round_err <= cur_err) + { + cur_err = round_err; + + lr = round_lr; + lg = round_lg; + lb = round_lb; + hr = round_hr; + hg = round_hg; + hb = round_hb; + + orig_lr = orig_round_lr; + orig_lg = orig_round_lg; + orig_lb = orig_round_lb; + orig_hr = orig_round_hr; + orig_hg = orig_round_hg; + orig_hb = orig_round_hb; + + memcpy(sels, round_sels, 16); + } + + } // round + } + + if ((cur_err) && (flags & cEncodeBC1UseLikelyTotalOrderings)) + { + assert(needs_block_error); + + const uint32_t total_iters = (flags & cEncodeBC1Iterative) ? 
2 : 1; + for (uint32_t iter_index = 0; iter_index < total_iters; iter_index++) + { + const uint32_t orig_err = cur_err; + + hist4 h; + for (uint32_t i = 0; i < 16; i++) + { + assert(sels[i] < 4); + h.m_hist[sels[i]]++; + } + + const uint32_t orig_total_order_index = h.lookup_total_ordering_index(); + + int r0, g0, b0, r3, g3, b3; + r0 = (lr << 3) | (lr >> 2); g0 = (lg << 2) | (lg >> 4); b0 = (lb << 3) | (lb >> 2); + r3 = (hr << 3) | (hr >> 2); g3 = (hg << 2) | (hg >> 4); b3 = (hb << 3) | (hb >> 2); + + int ar = r3 - r0, ag = g3 - g0, ab = b3 - b0; + + int dots[16]; + for (uint32_t i = 0; i < 16; i++) + { + int r = pSrc_pixels[i].r; + int g = pSrc_pixels[i].g; + int b = pSrc_pixels[i].b; + int d = 0x1000000 + (r * ar + g * ag + b * ab); + assert(d >= 0); + dots[i] = (d << 4) + i; + } + + std::sort(dots, dots + 16); + + uint32_t r_sum[17], g_sum[17], b_sum[17]; + uint32_t r = 0, g = 0, b = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t p = dots[i] & 15; + + r_sum[i] = r; + g_sum[i] = g; + b_sum[i] = b; + + r += pSrc_pixels[p].r; + g += pSrc_pixels[p].g; + b += pSrc_pixels[p].b; + } + + r_sum[16] = total_r; + g_sum[16] = total_g; + b_sum[16] = total_b; + + const uint32_t q_total = (flags & cEncodeBC1Exhaustive) ? NUM_UNIQUE_TOTAL_ORDERINGS4 : clampi(total_orderings_to_try, MIN_TOTAL_ORDERINGS, MAX_TOTAL_ORDERINGS4); + for (uint32_t q = 0; q < q_total; q++) + { + const uint32_t s = (flags & cEncodeBC1Exhaustive) ? 
q : g_best_total_orderings4[orig_total_order_index][q]; + + int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; + + vec3F xl, xh; + + if ((s == TOTAL_ORDER_4_0_16) || (s == TOTAL_ORDER_4_1_16) || (s == TOTAL_ORDER_4_2_16) || (s == TOTAL_ORDER_4_3_16)) + { + trial_lr = g_bc1_match5_equals_1[avg_r].m_hi; + trial_lg = g_bc1_match6_equals_1[avg_g].m_hi; + trial_lb = g_bc1_match5_equals_1[avg_b].m_hi; + + trial_hr = g_bc1_match5_equals_1[avg_r].m_lo; + trial_hg = g_bc1_match6_equals_1[avg_g].m_lo; + trial_hb = g_bc1_match5_equals_1[avg_b].m_lo; + } + else + { + compute_least_squares_endpoints4_rgb(&xl, &xh, total_r, total_g, total_b, + g_selector_factors4[s][0], g_selector_factors4[s][1], g_selector_factors4[s][2], s, r_sum, g_sum, b_sum); + + precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); + } + + uint8_t trial_sels[16]; + + uint32_t trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err, pForce_selectors); + + if (trial_err < cur_err) + { + cur_err = trial_err; + + lr = trial_lr; + lg = trial_lg; + lb = trial_lb; + + hr = trial_hr; + hg = trial_hg; + hb = trial_hb; + + memcpy(sels, trial_sels, 16); + } + + } // s + + if ((!cur_err) || (cur_err == orig_err)) + break; + + } // iter_index + } + + if (((flags & (cEncodeBC1Use3ColorBlocks | cEncodeBC1Use3ColorBlocksForBlackPixels)) != 0) && (cur_err)) + { + if (flags & cEncodeBC1Use3ColorBlocks) + { + assert(needs_block_error); + try_3color_block(pSrc_pixels, flags, cur_err, avg_r, avg_g, avg_b, orig_lr, orig_lg, orig_lb, orig_hr, orig_hg, orig_hb, total_r, total_g, total_b, total_orderings_to_try3, results, pForce_selectors); + } + + if ((any_black_pixels) && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)) + { + assert(needs_block_error); + try_3color_block_useblack(pSrc_pixels, flags, cur_err, results, pForce_selectors); + } + } + + if ((flags & cEncodeBC1EndpointSearchRoundsMask) && 
(cur_err)) + { + assert(needs_block_error); + + encode_bc1_endpoint_search(pSrc_pixels, any_black_pixels != 0, flags, results, cur_err, pForce_selectors); + } + + if (results.m_3color) + bc1_encode3(pDst_block, results.lr, results.lg, results.lb, results.hr, results.hg, results.hb, results.sels); + else + bc1_encode4(pDst_block, results.lr, results.lg, results.lb, results.hr, results.hg, results.hb, results.sels); + } + + // BC3-5 + + void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride) + { + assert(g_initialized); + + uint32_t min0_v, max0_v, min1_v, max1_v, min2_v, max2_v, min3_v, max3_v; + + { + min0_v = max0_v = pPixels[0 * stride]; + min1_v = max1_v = pPixels[1 * stride]; + min2_v = max2_v = pPixels[2 * stride]; + min3_v = max3_v = pPixels[3 * stride]; + } + + { + uint32_t v0 = pPixels[4 * stride]; min0_v = std::min(min0_v, v0); max0_v = std::max(max0_v, v0); + uint32_t v1 = pPixels[5 * stride]; min1_v = std::min(min1_v, v1); max1_v = std::max(max1_v, v1); + uint32_t v2 = pPixels[6 * stride]; min2_v = std::min(min2_v, v2); max2_v = std::max(max2_v, v2); + uint32_t v3 = pPixels[7 * stride]; min3_v = std::min(min3_v, v3); max3_v = std::max(max3_v, v3); + } + + { + uint32_t v0 = pPixels[8 * stride]; min0_v = std::min(min0_v, v0); max0_v = std::max(max0_v, v0); + uint32_t v1 = pPixels[9 * stride]; min1_v = std::min(min1_v, v1); max1_v = std::max(max1_v, v1); + uint32_t v2 = pPixels[10 * stride]; min2_v = std::min(min2_v, v2); max2_v = std::max(max2_v, v2); + uint32_t v3 = pPixels[11 * stride]; min3_v = std::min(min3_v, v3); max3_v = std::max(max3_v, v3); + } + + { + uint32_t v0 = pPixels[12 * stride]; min0_v = std::min(min0_v, v0); max0_v = std::max(max0_v, v0); + uint32_t v1 = pPixels[13 * stride]; min1_v = std::min(min1_v, v1); max1_v = std::max(max1_v, v1); + uint32_t v2 = pPixels[14 * stride]; min2_v = std::min(min2_v, v2); max2_v = std::max(max2_v, v2); + uint32_t v3 = pPixels[15 * stride]; min3_v = std::min(min3_v, v3); max3_v = 
std::max(max3_v, v3); + } + + const uint32_t min_v = minimum(min0_v, min1_v, min2_v, min3_v); + const uint32_t max_v = maximum(max0_v, max1_v, max2_v, max3_v); + + uint8_t* pDst_bytes = static_cast(pDst); + pDst_bytes[0] = (uint8_t)max_v; + pDst_bytes[1] = (uint8_t)min_v; + + if (max_v == min_v) + { + memset(pDst_bytes + 2, 0, 6); + return; + } + + const uint32_t delta = max_v - min_v; + + // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors. + const int t0 = delta * 13; + const int t1 = delta * 11; + const int t2 = delta * 9; + const int t3 = delta * 7; + const int t4 = delta * 5; + const int t5 = delta * 3; + const int t6 = delta * 1; + + // BC4 floors in its divisions, which we compensate for with the 4 bias. + // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one). + const int bias = 4 - min_v * 14; + + static const uint32_t s_tran0[8] = { 1U , 7U , 6U , 5U , 4U , 3U , 2U , 0U }; + static const uint32_t s_tran1[8] = { 1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U }; + static const uint32_t s_tran2[8] = { 1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U }; + static const uint32_t s_tran3[8] = { 1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U }; + + uint64_t a0, a1, a2, a3; + { + const int v0 = pPixels[0 * stride] * 14 + bias; + const int v1 = pPixels[1 * stride] * 14 + bias; + const int v2 = pPixels[2 * stride] * 14 + bias; + const int v3 = pPixels[3 * stride] * 14 + bias; + a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]; + a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]; + a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]; + a3 = s_tran3[(v3 >= t0) + (v3 
>= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]; + } + + { + const int v0 = pPixels[4 * stride] * 14 + bias; + const int v1 = pPixels[5 * stride] * 14 + bias; + const int v2 = pPixels[6 * stride] * 14 + bias; + const int v3 = pPixels[7 * stride] * 14 + bias; + a0 |= (uint64_t)(s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U); + a1 |= (uint64_t)(s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U); + a2 |= (uint64_t)(s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U); + a3 |= (uint64_t)(s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U); + } + + { + const int v0 = pPixels[8 * stride] * 14 + bias; + const int v1 = pPixels[9 * stride] * 14 + bias; + const int v2 = pPixels[10 * stride] * 14 + bias; + const int v3 = pPixels[11 * stride] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U); + } + + { + const int v0 = pPixels[12 * stride] * 14 + bias; + const int v1 = pPixels[13 * stride] * 14 + bias; + const int v2 = pPixels[14 * stride] * 14 + bias; + const int v3 = pPixels[15 * stride] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U); + a2 |= 
(((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U); + } + + const uint64_t f = a0 | a1 | a2 | a3; + + pDst_bytes[2] = (uint8_t)f; + pDst_bytes[3] = (uint8_t)(f >> 8U); + pDst_bytes[4] = (uint8_t)(f >> 16U); + pDst_bytes[5] = (uint8_t)(f >> 24U); + pDst_bytes[6] = (uint8_t)(f >> 32U); + pDst_bytes[7] = (uint8_t)(f >> 40U); + } + + uint32_t encode_bc4_hq(void* pDst, const uint8_t* pPixels, uint32_t stride, uint32_t search_rad, uint32_t mode_flag, const uint8_t* pForce_selectors) + { + assert(mode_flag); + + uint8_t* pDst_bytes = static_cast(pDst); + + uint32_t min_val = 255, max_val = 0; + for (uint32_t i = 0; i < 16; i++) + { + uint32_t val = pPixels[i * stride]; + min_val = std::min(val, min_val); + max_val = std::max(val, max_val); + } + + if (min_val == max_val) + { + if (mode_flag & BC4_USE_MODE6_FLAG) + { + pDst_bytes[0] = (uint8_t)min_val; + pDst_bytes[1] = (uint8_t)min_val; + + memset(pDst_bytes + 2, 0, 6); + + assert(!(pDst_bytes[0] > pDst_bytes[1])); + } + else + { + // Use an 8 value encoding + if (min_val > 0) + { + pDst_bytes[0] = (uint8_t)min_val; + pDst_bytes[1] = (uint8_t)min_val - 1; + + memset(pDst_bytes + 2, 0, 6); + } + else + { + static const uint8_t s_const_1_vals[8] = { 1, 0, 0x49, 0x92, 0x24, 0x49, 0x92, 0x24 }; + memcpy(pDst_bytes, s_const_1_vals, 8); + } + + assert(pDst_bytes[0] > pDst_bytes[1]); + } + +#if defined(_DEBUG) || defined(DEBUG) + { + bc4_block* pBlock = (bc4_block*)pDst; + uint8_t pixels[16]; + unpack_bc4(pDst, pixels, 1); + for (uint32_t i = 0; i < 16; i++) + assert(pixels[i] == min_val); + if (mode_flag & BC4_USE_MODE6_FLAG) + { + assert(pBlock->is_alpha6_block()); + } + else + { + assert(!pBlock->is_alpha6_block()); + } + } +#endif + + return 0; + } + + uint32_t best_err = UINT32_MAX; + for (uint32_t mode = 0; mode < 2; mode++) + { + 
if ((mode_flag & (1 << mode)) == 0) + continue; + + for (int lo_delta = -(int)search_rad; lo_delta <= (int)search_rad; lo_delta++) + { + for (int hi_delta = -(int)search_rad; hi_delta <= (int)search_rad; hi_delta++) + { + bc4_block trial_block; + trial_block.m_endpoints[0] = (uint8_t)clamp(max_val + hi_delta, 0, 255); + trial_block.m_endpoints[1] = (uint8_t)clamp(min_val + lo_delta, 0, 255); + + if (trial_block.m_endpoints[0] == trial_block.m_endpoints[1]) + continue; + + if (mode == 0) + { + if (trial_block.is_alpha6_block()) + std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]); + } + else if (!trial_block.is_alpha6_block()) + std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]); + + uint8_t block_vals[8]; + trial_block.get_block_values(block_vals, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); + + uint32_t trial_err = 0; + uint8_t trial_sels[16]; + + if (pForce_selectors) + { + memcpy(trial_sels, pForce_selectors, 16); + + for (uint32_t i = 0; i < 16; i++) + trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]); + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + uint32_t best_index_err = UINT32_MAX; + uint32_t best_index = 0; + for (uint32_t j = 0; j < 8; j++) + { + uint32_t err = squarei(block_vals[j] - pPixels[i * stride]); + if (err < best_index_err) + { + best_index_err = err; + best_index = j; + if (!err) + break; + } + } + + trial_err += best_index_err; + if (trial_err >= best_err) + break; + + trial_sels[i] = (uint8_t)best_index; + } // i + } + + if (trial_err < best_err) + { + best_err = trial_err; + + uint64_t sel_vals = 0; + for (uint32_t i = 0; i < 16; i++) + sel_vals |= ((uint64_t)trial_sels[i] << (i * 3)); + + trial_block.m_selectors[0] = (uint8_t)sel_vals; + trial_block.m_selectors[1] = (uint8_t)(sel_vals >> 8); + trial_block.m_selectors[2] = (uint8_t)(sel_vals >> 16); + trial_block.m_selectors[3] = (uint8_t)(sel_vals >> 24); + trial_block.m_selectors[4] = (uint8_t)(sel_vals >> 32); + 
trial_block.m_selectors[5] = (uint8_t)(sel_vals >> 40); + + memcpy(pDst_bytes, &trial_block, sizeof(bc4_block)); + } // if (trial_err < best_err) + + } // hi_delta + + } // lo_delta + + } // mode + + return best_err; + } + + void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try) + { + assert(g_initialized); + + // 3-color blocks are not allowed with BC3 (on most GPU's). + flags &= ~(cEncodeBC1Use3ColorBlocksForBlackPixels | cEncodeBC1Use3ColorBlocks); + + encode_bc4(pDst, pPixels + 3, 4); + encode_bc1(static_cast(pDst) + 8, pPixels, flags, total_orderings_to_try); + } + + void encode_bc3(uint32_t level, void* pDst, const uint8_t* pPixels) + { + assert(g_initialized); + + encode_bc4(pDst, pPixels + 3, 4); + encode_bc1(level, static_cast(pDst) + 8, pPixels, false, false); + } + + void encode_bc3_hq(uint32_t level, void* pDst, const uint8_t* pPixels, uint32_t alpha_search_rad, uint32_t alpha_modes) + { + assert(g_initialized); + + encode_bc4_hq(pDst, pPixels + 3, 4, alpha_search_rad, alpha_modes); + encode_bc1(level, static_cast(pDst) + 8, pPixels, false, false); + } + + void encode_bc5(void* pDst, const uint8_t* pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride) + { + assert(g_initialized); + + encode_bc4(pDst, pPixels + chan0, stride); + encode_bc4(static_cast(pDst) + 8, pPixels + chan1, stride); + } + + void encode_bc5_hq(void* pDst, const uint8_t* pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride, uint32_t alpha_search_rad, uint32_t alpha_modes) + { + assert(g_initialized); + + encode_bc4_hq(pDst, pPixels + chan0, stride, alpha_search_rad, alpha_modes); + encode_bc4_hq(static_cast(pDst) + 8, pPixels + chan1, stride, alpha_search_rad, alpha_modes); + } + + bool unpack_bc1_block_colors(const void* pBlock_bits, color32* c, bc1_approx_mode mode) + { + const bc1_block* pBlock = static_cast(pBlock_bits); + + const uint32_t l = pBlock->get_low_color(); + const uint32_t h = pBlock->get_high_color(); + + const int 
cr0 = (l >> 11) & 31; + const int cg0 = (l >> 5) & 63; + const int cb0 = l & 31; + const int r0 = (cr0 << 3) | (cr0 >> 2); + const int g0 = (cg0 << 2) | (cg0 >> 4); + const int b0 = (cb0 << 3) | (cb0 >> 2); + + const int cr1 = (h >> 11) & 31; + const int cg1 = (h >> 5) & 63; + const int cb1 = h & 31; + const int r1 = (cr1 << 3) | (cr1 >> 2); + const int g1 = (cg1 << 2) | (cg1 >> 4); + const int b1 = (cb1 << 3) | (cb1 >> 2); + + bool used_punchthrough = false; + + if (l > h) + { + c[0].set_noclamp_rgba(r0, g0, b0, 255); + c[1].set_noclamp_rgba(r1, g1, b1, 255); + switch (mode) + { + case bc1_approx_mode::cBC1Ideal: + c[2].set_noclamp_rgba((r0 * 2 + r1) / 3, (g0 * 2 + g1) / 3, (b0 * 2 + b1) / 3, 255); + c[3].set_noclamp_rgba((r1 * 2 + r0) / 3, (g1 * 2 + g0) / 3, (b1 * 2 + b0) / 3, 255); + break; + case bc1_approx_mode::cBC1IdealRound4: + c[2].set_noclamp_rgba((r0 * 2 + r1 + 1) / 3, (g0 * 2 + g1 + 1) / 3, (b0 * 2 + b1 + 1) / 3, 255); + c[3].set_noclamp_rgba((r1 * 2 + r0 + 1) / 3, (g1 * 2 + g0 + 1) / 3, (b1 * 2 + b0 + 1) / 3, 255); + break; + case bc1_approx_mode::cBC1NVidia: + c[2].set_noclamp_rgba(interp_5_nv(cr0, cr1), interp_6_nv(g0, g1), interp_5_nv(cb0, cb1), 255); + c[3].set_noclamp_rgba(interp_5_nv(cr1, cr0), interp_6_nv(g1, g0), interp_5_nv(cb1, cb0), 255); + break; + case bc1_approx_mode::cBC1AMD: + c[2].set_noclamp_rgba(interp_5_6_amd(r0, r1), interp_5_6_amd(g0, g1), interp_5_6_amd(b0, b1), 255); + c[3].set_noclamp_rgba(interp_5_6_amd(r1, r0), interp_5_6_amd(g1, g0), interp_5_6_amd(b1, b0), 255); + break; + } + } + else + { + c[0].set_noclamp_rgba(r0, g0, b0, 255); + c[1].set_noclamp_rgba(r1, g1, b1, 255); + switch (mode) + { + case bc1_approx_mode::cBC1Ideal: + case bc1_approx_mode::cBC1IdealRound4: + c[2].set_noclamp_rgba((r0 + r1) / 2, (g0 + g1) / 2, (b0 + b1) / 2, 255); + break; + case bc1_approx_mode::cBC1NVidia: + c[2].set_noclamp_rgba(interp_half_5_nv(cr0, cr1), interp_half_6_nv(g0, g1), interp_half_5_nv(cb0, cb1), 255); + break; + case 
bc1_approx_mode::cBC1AMD: + c[2].set_noclamp_rgba(interp_half_5_6_amd(r0, r1), interp_half_5_6_amd(g0, g1), interp_half_5_6_amd(b0, b1), 255); + break; + } + + c[3].set_noclamp_rgba(0, 0, 0, 0); + used_punchthrough = true; + } + + return used_punchthrough; + } + + // Returns true if the block uses 3 color punchthrough alpha mode. + bool unpack_bc1(const void* pBlock_bits, void* pPixels, bool set_alpha, bc1_approx_mode mode) + { + color32* pDst_pixels = static_cast(pPixels); + + static_assert(sizeof(bc1_block) == 8, "sizeof(bc1_block) == 8"); + static_assert(sizeof(bc4_block) == 8, "sizeof(bc4_block) == 8"); + + const bc1_block* pBlock = static_cast(pBlock_bits); + + color32 c[4]; + const bool used_punchthrough = unpack_bc1_block_colors(pBlock_bits, c, mode); + + if (set_alpha) + { + for (uint32_t y = 0; y < 4; y++, pDst_pixels += 4) + { + pDst_pixels[0] = c[pBlock->get_selector(0, y)]; + pDst_pixels[1] = c[pBlock->get_selector(1, y)]; + pDst_pixels[2] = c[pBlock->get_selector(2, y)]; + pDst_pixels[3] = c[pBlock->get_selector(3, y)]; + } + } + else + { + for (uint32_t y = 0; y < 4; y++, pDst_pixels += 4) + { + pDst_pixels[0].set_rgb(c[pBlock->get_selector(0, y)]); + pDst_pixels[1].set_rgb(c[pBlock->get_selector(1, y)]); + pDst_pixels[2].set_rgb(c[pBlock->get_selector(2, y)]); + pDst_pixels[3].set_rgb(c[pBlock->get_selector(3, y)]); + } + } + + return used_punchthrough; + } + + void unpack_bc4(const void* pBlock_bits, uint8_t* pPixels, uint32_t stride) + { + static_assert(sizeof(bc4_block) == 8, "sizeof(bc4_block) == 8"); + + const bc4_block* pBlock = static_cast(pBlock_bits); + + uint8_t sel_values[8]; + bc4_block::get_block_values(sel_values, pBlock->get_low_alpha(), pBlock->get_high_alpha()); + + const uint64_t selector_bits = pBlock->get_selector_bits(); + + for (uint32_t y = 0; y < 4; y++, pPixels += (stride * 4U)) + { + pPixels[0] = sel_values[pBlock->get_selector(0, y, selector_bits)]; + pPixels[stride * 1] = sel_values[pBlock->get_selector(1, y, 
selector_bits)]; + pPixels[stride * 2] = sel_values[pBlock->get_selector(2, y, selector_bits)]; + pPixels[stride * 3] = sel_values[pBlock->get_selector(3, y, selector_bits)]; + } + } + + // Returns false if the block uses 3-color punchthrough alpha mode, which isn't supported on some GPU's for BC3. + bool unpack_bc3(const void* pBlock_bits, void* pPixels, bc1_approx_mode mode) + { + color32* pDst_pixels = static_cast(pPixels); + + bool success = true; + + if (unpack_bc1((const uint8_t*)pBlock_bits + sizeof(bc4_block), pDst_pixels, true, mode)) + success = false; + + unpack_bc4(pBlock_bits, &pDst_pixels[0].a, sizeof(color32)); + + return success; + } + + // writes RG + void unpack_bc5(const void* pBlock_bits, void* pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride) + { + unpack_bc4(pBlock_bits, (uint8_t*)pPixels + chan0, stride); + unpack_bc4((const uint8_t*)pBlock_bits + sizeof(bc4_block), (uint8_t*)pPixels + chan1, stride); + } + +} // namespace rgbcx + + + diff --git a/libkram/bc7enc/rgbcx.h b/libkram/bc7enc/rgbcx.h index 748d39e2..cf793921 100644 --- a/libkram/bc7enc/rgbcx.h +++ b/libkram/bc7enc/rgbcx.h @@ -1,7 +1,9 @@ -// rgbcx.h v1.12 -// High-performance scalar BC1-5 encoders. Public Domain or MIT license (you choose - see below), written by Richard Geldreich 2020 . +// rgbcx.h v1.13 +// High-performance scalar encoders and RDO (Rate Distortion Optimization) post processors for BC1-5. +// Public Domain or MIT license (you choose - see below), written by Richard Geldreich 2020 . 
// // Influential references: +// https://tinyurl.com/y3vxz457 (Ortego and Ramchandran, "Rate-distortion Methods for Image and Video Compression", 1998) // http://sjbrown.co.uk/2006/01/19/dxt-compression-techniques/ // https://github.com/nothings/stb/blob/master/stb_dxt.h // https://gist.github.com/castano/c92c7626f288f9e99e158520b14a61cf @@ -56,9 +58,13 @@ #ifndef RGBCX_INCLUDE_H #define RGBCX_INCLUDE_H +#ifdef _MSC_VER +#pragma warning (disable:4201) //nameless struct/union +#endif + #include #include -//#include +#include #include #include @@ -87,6 +93,57 @@ namespace rgbcx cBC1IdealRound4 = 3 }; + enum class eNoClamp { cNoClamp }; + static inline uint8_t clamp255(int32_t i) { return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); } + + template inline S maximum(S a, S b) { return (a > b) ? a : b; } + template inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); } + template inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); } + + template inline S minimum(S a, S b) { return (a < b) ? 
a : b; } + template inline S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); } + template inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); } + + struct color32 + { + union + { + struct + { + uint8_t r; + uint8_t g; + uint8_t b; + uint8_t a; + }; + + uint8_t c[4]; + + uint32_t m; + }; + + color32() { } + + color32(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); } + color32(eNoClamp unused, uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { (void)unused; set_noclamp_rgba(vr, vg, vb, va); } + + void set(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { c[0] = static_cast(vr); c[1] = static_cast(vg); c[2] = static_cast(vb); c[3] = static_cast(va); } + + void set_noclamp_rgb(uint32_t vr, uint32_t vg, uint32_t vb) { c[0] = static_cast(vr); c[1] = static_cast(vg); c[2] = static_cast(vb); } + void set_noclamp_rgba(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); } + + void set_clamped(int vr, int vg, int vb, int va) { c[0] = clamp255(vr); c[1] = clamp255(vg); c[2] = clamp255(vb); c[3] = clamp255(va); } + + uint8_t operator[] (uint32_t idx) const { assert(idx < 4); return c[idx]; } + uint8_t& operator[] (uint32_t idx) { assert(idx < 4); return c[idx]; } + + bool operator== (const color32& rhs) const { return m == rhs.m; } + + void set_rgb(const color32& other) { c[0] = static_cast(other.c[0]); c[1] = static_cast(other.c[1]); c[2] = static_cast(other.c[2]); } + + static color32 comp_min(const color32& a, const color32& b) { return color32(eNoClamp::cNoClamp, std::min(a[0], b[0]), std::min(a[1], b[1]), std::min(a[2], b[2]), std::min(a[3], b[3])); } + static color32 comp_max(const color32& a, const color32& b) { return color32(eNoClamp::cNoClamp, std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); } + }; + // init() MUST be called once before using the BC1 encoder. 
// This function may be called multiple times to change the BC1 approximation mode. // This function initializes global state, so don't call it while other threads inside the encoder. @@ -177,30 +234,41 @@ namespace rgbcx // Note that the 3 color modes won't be used at all until level 5 or higher. // No transparency supported, however if you set use_transparent_texels_for_black to true the encocer will use transparent selectors on very dark/black texels to reduce MSE. const uint32_t MIN_LEVEL = 0, MAX_LEVEL = 18; - void encode_bc1(uint32_t level, void* pDst, const uint8_t* pPixels, bool allow_3color, bool use_transparent_texels_for_black); + void encode_bc1(uint32_t level, void* pDst, const uint8_t* pPixels, bool allow_3color, bool use_transparent_texels_for_black, const uint8_t* pForce_selectors = nullptr); // Low-level interface for BC1 encoding. // Always returns a 4 color block, unless cEncodeBC1Use3ColorBlocksForBlackPixels or cEncodeBC1Use3ColorBlock flags are specified. // total_orderings_to_try controls the perf. vs. quality tradeoff on 4-color blocks when the cEncodeBC1UseLikelyTotalOrderings flag is used. It must range between [MIN_TOTAL_ORDERINGS, MAX_TOTAL_ORDERINGS4]. // total_orderings_to_try3 controls the perf. vs. quality tradeoff on 3-color bocks when the cEncodeBC1UseLikelyTotalOrderings and the cEncodeBC1Use3ColorBlocks flags are used. Valid range is [0,MAX_TOTAL_ORDERINGS3] (0=disabled). 
- void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags = 0, uint32_t total_orderings_to_try = DEFAULT_TOTAL_ORDERINGS_TO_TRY, uint32_t total_orderings_to_try3 = DEFAULT_TOTAL_ORDERINGS_TO_TRY3); - + void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags = 0, uint32_t total_orderings_to_try = DEFAULT_TOTAL_ORDERINGS_TO_TRY, uint32_t total_orderings_to_try3 = DEFAULT_TOTAL_ORDERINGS_TO_TRY3, const uint8_t *pForce_selectors = nullptr); + + // Constants used for high quality BC4/BC5 encoding (and alpha of BC3) + const uint32_t BC4_DEFAULT_SEARCH_RAD = 3; + const uint32_t BC4_USE_MODE8_FLAG = 1; + const uint32_t BC4_USE_MODE6_FLAG = 2; + const uint32_t BC4_USE_ALL_MODES = 3; + // Encodes a 4x4 block of RGBA pixels to BC3 format. // There are two encode_bc3() functions. // The first is the recommended function, which accepts a level parameter. // The second is a low-level version that allows fine control over BC1 encoding. void encode_bc3(uint32_t level, void* pDst, const uint8_t* pPixels); void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags = 0, uint32_t total_orderings_to_try = DEFAULT_TOTAL_ORDERINGS_TO_TRY); - + void encode_bc3_hq(uint32_t level, void* pDst, const uint8_t* pPixels, uint32_t alpha_search_rad = BC4_DEFAULT_SEARCH_RAD, uint32_t alpha_modes = BC4_USE_ALL_MODES); + // Encodes a single channel to BC4. // stride is the source pixel stride in bytes. void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride = 4); + uint32_t encode_bc4_hq(void* pDst, const uint8_t* pPixels, uint32_t stride = 4, uint32_t search_rad = BC4_DEFAULT_SEARCH_RAD, uint32_t mode_flag = BC4_USE_ALL_MODES, const uint8_t* pForce_selectors = nullptr); // Encodes two channels to BC5. // chan0/chan1 control which channels, stride is the source pixel stride in bytes. 
void encode_bc5(void* pDst, const uint8_t* pPixels, uint32_t chan0 = 0, uint32_t chan1 = 1, uint32_t stride = 4); + void encode_bc5_hq(void* pDst, const uint8_t* pPixels, uint32_t chan0 = 0, uint32_t chan1 = 1, uint32_t stride = 4, uint32_t alpha_search_rad = BC4_DEFAULT_SEARCH_RAD, uint32_t alpha_modes = BC4_USE_ALL_MODES); // Decompression functions. + + bool unpack_bc1_block_colors(const void* pBlock_bits, color32* c, bc1_approx_mode mode = bc1_approx_mode::cBC1Ideal); // Returns true if the block uses 3 color punchthrough alpha mode. bool unpack_bc1(const void* pBlock_bits, void* pPixels, bool set_alpha = true, bc1_approx_mode mode = bc1_approx_mode::cBC1Ideal); @@ -211,1273 +279,8 @@ namespace rgbcx bool unpack_bc3(const void* pBlock_bits, void* pPixels, bc1_approx_mode mode = bc1_approx_mode::cBC1Ideal); void unpack_bc5(const void* pBlock_bits, void* pPixels, uint32_t chan0 = 0, uint32_t chan1 = 1, uint32_t stride = 4); -} -#endif // #ifndef RGBCX_INCLUDE_H - -#ifdef RGBCX_IMPLEMENTATION -namespace rgbcx -{ - const uint32_t NUM_UNIQUE_TOTAL_ORDERINGS4 = 969; - - // All total orderings for 16 pixels 2-bit selectors. - // BC1 selector order 0, 2, 3, 1 (i.e. the selectors are reordered into linear order). 
- static uint8_t g_unique_total_orders4[NUM_UNIQUE_TOTAL_ORDERINGS4][4] = - { - {0,8,2,6},{4,3,9,0},{4,8,1,3},{12,0,3,1},{11,3,2,0},{6,4,6,0},{7,5,0,4},{6,0,8,2},{1,0,0,15},{3,0,8,5},{1,1,13,1},{13,1,2,0},{0,14,1,1},{0,15,1,0},{0,13,0,3},{16,0,0,0},{4,3,4,5},{8,6,0,2},{0,10,0,6},{10,0,4,2},{7,2,1,6},{4,7,5,0},{1,4,7,4},{0,14,2,0},{2,7,2,5},{9,0,5,2},{9,2,2,3},{10,0,5,1},{2,3,7,4},{4,9,0,3},{1,5,0,10},{1,1,6,8}, - {6,6,4,0},{11,5,0,0},{11,2,0,3},{4,0,10,2},{2,3,10,1},{1,13,1,1},{0,14,0,2},{2,3,3,8},{12,3,1,0},{14,0,0,2},{9,1,3,3},{6,4,0,6},{1,1,5,9},{5,9,0,2},{2,10,1,3},{12,0,0,4},{4,6,6,0},{0,6,4,6},{3,7,4,2},{0,13,3,0},{3,10,0,3},{10,2,1,3},{1,12,1,2},{2,0,13,1},{11,0,5,0},{12,1,3,0},{6,4,5,1},{10,4,2,0},{3,6,1,6},{7,3,6,0},{10,4,0,2},{10,0,2,4}, - {0,5,9,2},{0,9,3,4},{6,4,2,4},{3,4,7,2},{3,3,5,5},{4,2,9,1},{6,2,8,0},{3,5,3,5},{4,10,1,1},{10,1,3,2},{5,7,0,4},{5,3,7,1},{6,8,1,1},{8,8,0,0},{11,1,0,4},{14,1,0,1},{9,3,2,2},{8,2,1,5},{0,0,2,14},{3,3,9,1},{10,1,5,0},{8,3,1,4},{1,5,8,2},{6,1,9,0},{3,2,1,10},{3,11,1,1},{7,6,3,0},{9,0,3,4},{5,2,5,4},{0,2,3,11},{15,0,0,1},{0,6,6,4}, - {3,4,9,0},{4,7,0,5},{0,4,4,8},{0,13,2,1},{2,4,1,9},{3,2,5,6},{10,6,0,0},{3,5,6,2},{8,0,4,4},{1,3,6,6},{7,7,0,2},{6,1,4,5},{0,11,1,4},{2,2,8,4},{0,1,2,13},{15,0,1,0},{7,2,6,1},{8,1,7,0},{1,8,4,3},{2,13,1,0},{1,0,7,8},{14,2,0,0},{1,8,1,6},{9,3,3,1},{0,0,7,9},{4,4,1,7},{9,0,6,1},{10,2,4,0},{1,7,3,5},{0,3,8,5},{5,2,4,5},{1,2,5,8}, - {0,8,7,1},{10,3,2,1},{12,0,4,0},{2,1,4,9},{5,2,2,7},{1,9,3,3},{15,1,0,0},{6,3,4,3},{9,5,0,2},{1,6,9,0},{6,6,0,4},{13,2,1,0},{5,1,8,2},{0,5,11,0},{7,1,0,8},{1,2,12,1},{0,3,3,10},{7,4,2,3},{5,1,4,6},{7,0,3,6},{3,12,0,1},{3,4,5,4},{1,10,0,5},{7,4,3,2},{10,5,0,1},{13,3,0,0},{2,5,4,5},{3,10,1,2},{5,1,2,8},{14,0,1,1},{1,5,4,6},{1,4,5,6}, - 
{2,3,11,0},{11,0,4,1},{11,2,2,1},{5,3,8,0},{1,3,10,2},{0,1,13,2},{3,1,4,8},{4,2,4,6},{1,5,6,4},{2,1,11,2},{1,2,9,4},{4,7,3,2},{6,2,5,3},{7,2,2,5},{8,1,4,3},{3,2,8,3},{12,1,0,3},{7,8,1,0},{7,0,2,7},{5,10,0,1},{0,2,14,0},{2,9,3,2},{7,0,0,9},{11,1,4,0},{10,4,1,1},{2,2,9,3},{5,7,2,2},{1,3,1,11},{13,2,0,1},{4,2,8,2},{2,3,1,10},{4,2,5,5}, - {7,0,7,2},{10,0,0,6},{0,8,5,3},{4,4,0,8},{12,4,0,0},{0,1,14,1},{8,0,1,7},{5,1,5,5},{11,0,3,2},{0,4,1,11},{0,8,8,0},{0,2,5,9},{7,3,2,4},{7,8,0,1},{1,0,3,12},{7,4,5,0},{1,6,7,2},{7,6,1,2},{9,6,1,0},{12,2,0,2},{4,1,6,5},{4,0,1,11},{8,4,4,0},{13,0,1,2},{8,6,2,0},{4,12,0,0},{2,7,5,2},{2,0,5,9},{5,4,5,2},{3,8,5,0},{7,3,3,3},{4,4,8,0}, - {2,1,3,10},{5,0,1,10},{6,4,3,3},{4,9,1,2},{1,4,0,11},{11,3,1,1},{4,0,12,0},{13,0,0,3},{6,1,6,3},{9,0,4,3},{8,0,0,8},{8,4,0,4},{0,12,1,3},{0,4,10,2},{3,4,8,1},{1,3,8,4},{9,2,5,0},{5,7,4,0},{1,0,11,4},{4,10,0,2},{1,3,12,0},{6,9,0,1},{5,0,9,2},{5,9,2,0},{13,1,0,2},{9,3,4,0},{9,4,0,3},{3,1,12,0},{2,4,3,7},{1,2,13,0},{2,2,4,8},{6,8,0,2}, - {9,2,1,4},{9,5,1,1},{2,0,4,10},{5,4,0,7},{0,0,6,10},{1,2,0,13},{4,7,2,3},{6,5,5,0},{3,3,1,9},{1,6,1,8},{12,2,1,1},{4,4,5,3},{1,0,6,9},{0,6,10,0},{4,8,3,1},{4,3,2,7},{2,1,7,6},{1,9,1,5},{3,1,3,9},{8,7,1,0},{1,2,3,10},{14,1,1,0},{5,4,4,3},{3,7,0,6},{7,4,1,4},{3,7,5,1},{1,1,0,14},{0,10,3,3},{0,4,3,9},{1,7,7,1},{2,0,10,4},{5,8,0,3}, - {6,7,3,0},{0,8,4,4},{5,7,3,1},{7,9,0,0},{7,6,2,1},{0,4,5,7},{6,3,5,2},{1,2,1,12},{5,2,0,9},{8,5,0,3},{4,6,1,5},{1,1,7,7},{10,5,1,0},{1,2,8,5},{1,8,2,5},{5,1,0,10},{6,9,1,0},{13,0,2,1},{8,3,5,0},{6,3,6,1},{2,11,3,0},{3,7,3,3},{1,5,2,8},{7,5,2,2},{0,6,7,3},{13,1,1,1},{5,3,4,4},{7,2,7,0},{5,8,3,0},{3,13,0,0},{0,7,9,0},{8,0,3,5}, - 
{1,3,7,5},{4,0,2,10},{12,0,1,3},{1,7,6,2},{3,9,0,4},{7,2,0,7},{0,1,7,8},{2,1,8,5},{0,13,1,2},{0,8,1,7},{5,0,11,0},{5,6,2,3},{0,3,0,13},{2,3,4,7},{5,6,3,2},{4,2,10,0},{3,3,7,3},{7,2,5,2},{1,1,11,3},{12,3,0,1},{5,1,1,9},{1,15,0,0},{9,7,0,0},{9,1,2,4},{0,7,3,6},{3,0,13,0},{3,0,11,2},{0,6,5,5},{8,2,2,4},{6,10,0,0},{4,8,4,0},{0,0,3,13}, - {0,4,12,0},{7,1,6,2},{3,5,0,8},{8,0,6,2},{6,2,3,5},{2,10,0,4},{4,11,0,1},{6,1,5,4},{5,1,3,7},{0,11,3,2},{4,6,0,6},{2,6,0,8},{3,1,7,5},{2,14,0,0},{2,9,2,3},{0,3,4,9},{11,0,1,4},{13,0,3,0},{8,3,0,5},{0,5,3,8},{5,11,0,0},{0,1,4,11},{2,1,9,4},{3,4,4,5},{7,1,2,6},{12,2,2,0},{9,4,1,2},{6,0,2,8},{4,6,2,4},{11,2,3,0},{3,2,2,9},{10,3,1,2}, - {1,1,2,12},{0,5,2,9},{0,1,11,4},{6,2,4,4},{2,8,2,4},{0,9,4,3},{11,0,2,3},{0,2,11,3},{6,0,7,3},{0,3,6,7},{4,5,5,2},{1,2,6,7},{7,5,1,3},{9,0,2,5},{2,6,4,4},{4,1,9,2},{4,8,2,2},{1,12,3,0},{0,9,6,1},{0,10,6,0},{3,1,5,7},{2,13,0,1},{2,2,1,11},{3,6,0,7},{5,6,5,0},{5,5,4,2},{4,0,3,9},{3,4,1,8},{0,11,2,3},{2,12,1,1},{7,1,3,5},{7,0,9,0}, - {8,0,8,0},{1,0,2,13},{3,3,10,0},{2,4,4,6},{2,3,8,3},{1,10,5,0},{7,3,0,6},{2,9,0,5},{1,4,6,5},{6,6,3,1},{5,6,0,5},{6,3,0,7},{3,10,2,1},{2,5,5,4},{3,8,4,1},{1,14,0,1},{10,3,3,0},{3,5,7,1},{1,1,3,11},{2,4,0,10},{9,3,1,3},{5,10,1,0},{3,0,6,7},{3,1,9,3},{11,2,1,2},{5,3,3,5},{0,5,1,10},{4,1,11,0},{10,2,0,4},{7,6,0,3},{2,7,0,7},{4,2,2,8}, - {6,1,7,2},{4,9,2,1},{0,0,8,8},{3,7,2,4},{9,6,0,1},{0,12,4,0},{6,7,1,2},{0,7,2,7},{1,0,10,5},{0,0,14,2},{2,7,3,4},{5,0,0,11},{7,7,1,1},{6,2,7,1},{4,5,3,4},{3,5,1,7},{5,9,1,1},{6,2,1,7},{3,2,0,11},{0,11,0,5},{3,11,2,0},{10,1,4,1},{7,0,4,5},{11,4,0,1},{10,3,0,3},{0,2,4,10},{0,15,0,1},{0,11,5,0},{6,7,2,1},{1,12,2,1},{4,1,3,8},{1,0,13,2}, - 
{1,8,5,2},{7,0,1,8},{3,12,1,0},{9,2,4,1},{1,7,4,4},{11,4,1,0},{4,3,8,1},{2,8,4,2},{1,11,3,1},{1,1,4,10},{4,10,2,0},{8,2,5,1},{1,0,9,6},{5,3,2,6},{0,9,7,0},{10,2,2,2},{5,8,1,2},{8,7,0,1},{0,3,12,1},{1,0,1,14},{4,8,0,4},{3,8,0,5},{4,6,5,1},{0,9,5,2},{10,2,3,1},{2,3,9,2},{1,0,12,3},{11,3,0,2},{4,5,2,5},{0,2,12,2},{9,1,0,6},{9,2,0,5}, - {1,2,7,6},{4,7,4,1},{0,12,2,2},{0,0,0,16},{2,8,3,3},{3,6,2,5},{0,6,3,7},{7,5,4,0},{3,3,3,7},{3,3,0,10},{5,0,6,5},{0,0,10,6},{8,5,3,0},{8,1,5,2},{6,0,9,1},{11,1,2,2},{2,11,2,1},{9,5,2,0},{3,0,4,9},{2,2,12,0},{2,6,6,2},{2,1,13,0},{6,0,5,5},{2,0,14,0},{2,11,1,2},{4,4,7,1},{2,0,11,3},{3,1,1,11},{2,9,4,1},{3,7,6,0},{14,0,2,0},{1,10,4,1}, - {8,0,7,1},{3,6,5,2},{0,3,11,2},{2,5,6,3},{11,1,3,1},{6,5,3,2},{3,8,1,4},{0,2,7,7},{2,10,2,2},{1,6,2,7},{11,0,0,5},{12,1,1,2},{12,1,2,1},{0,7,1,8},{0,3,9,4},{0,2,1,13},{7,1,4,4},{10,1,0,5},{4,0,8,4},{5,2,7,2},{0,2,0,14},{4,3,7,2},{2,7,1,6},{1,2,2,11},{6,3,3,4},{1,14,1,0},{2,4,6,4},{5,3,6,2},{5,3,5,3},{8,4,1,3},{1,3,0,12},{3,5,2,6}, - {1,8,7,0},{0,7,4,5},{2,1,6,7},{4,11,1,0},{7,2,4,3},{6,1,3,6},{4,5,4,3},{2,11,0,3},{1,5,7,3},{12,0,2,2},{5,0,4,7},{1,13,0,2},{7,7,2,0},{4,1,7,4},{4,5,0,7},{5,0,5,6},{6,5,4,1},{2,4,2,8},{1,10,1,4},{6,3,1,6},{3,3,8,2},{0,7,7,2},{4,4,2,6},{1,1,8,6},{1,12,0,3},{2,1,12,1},{1,9,2,4},{1,11,0,4},{2,5,2,7},{10,0,3,3},{4,6,3,3},{3,7,1,5}, - {1,9,0,6},{7,1,7,1},{1,6,5,4},{9,2,3,2},{6,2,2,6},{2,2,2,10},{8,3,3,2},{0,1,8,7},{2,0,8,6},{0,3,1,12},{9,4,2,1},{9,4,3,0},{6,2,6,2},{1,8,0,7},{5,1,10,0},{0,5,5,6},{8,2,4,2},{2,3,2,9},{6,0,3,7},{2,2,6,6},{2,6,2,6},{1,13,2,0},{9,3,0,4},{7,3,5,1},{6,5,2,3},{5,2,6,3},{2,0,12,2},{5,7,1,3},{8,1,3,4},{3,1,10,2},{1,0,15,0},{0,8,0,8}, - {5,0,7,4},{4,4,6,2},{0,1,0,15},{10,0,1,5},{7,3,4,2},{4,9,3,0},{2,5,7,2},{3,4,2,7},{8,3,2,3},{5,1,6,4},{0,10,2,4},{6,6,1,3},{6,0,0,10},{4,4,3,5},{1,3,9,3},{7,5,3,1},{3,0,7,6},{1,8,6,1},{4,3,0,9},{3,11,0,2},{6,0,6,4},{0,1,3,12},{0,4,2,10},{5,5,6,0},{4,1,4,7},{8,1,6,1},{5,6,4,1},{8,4,2,2},{4,3,1,8},{3,0,2,11},{1,11,4,0},{0,8,3,5}, 
- {5,1,7,3},{7,0,8,1},{4,3,5,4},{4,6,4,2},{3,2,4,7},{1,6,3,6},{0,7,8,1},{3,0,1,12},{9,1,4,2},{7,4,0,5},{1,7,0,8},{5,4,1,6},{9,1,5,1},{1,1,9,5},{4,1,1,10},{5,3,0,8},{2,2,5,7},{4,0,0,12},{9,0,7,0},{3,4,0,9},{0,2,6,8},{8,2,0,6},{3,2,6,5},{4,2,6,4},{3,6,4,3},{2,8,6,0},{5,0,3,8},{0,4,0,12},{0,16,0,0},{0,9,2,5},{4,0,11,1},{1,6,4,5}, - {0,1,6,9},{3,4,6,3},{3,0,10,3},{7,0,6,3},{1,4,9,2},{1,5,3,7},{8,5,2,1},{0,12,0,4},{7,2,3,4},{0,5,6,5},{11,1,1,3},{6,5,0,5},{2,1,5,8},{1,4,11,0},{9,1,1,5},{0,0,13,3},{5,8,2,1},{2,12,0,2},{3,3,6,4},{4,1,10,1},{4,0,5,7},{8,1,0,7},{5,1,9,1},{4,3,3,6},{0,2,2,12},{6,3,2,5},{0,0,12,4},{1,5,1,9},{2,6,5,3},{3,6,3,4},{2,12,2,0},{1,6,8,1}, - {10,1,1,4},{1,3,4,8},{7,4,4,1},{1,11,1,3},{1,2,10,3},{3,9,3,1},{8,5,1,2},{2,10,4,0},{4,2,0,10},{2,7,6,1},{8,2,3,3},{1,5,5,5},{3,1,0,12},{3,10,3,0},{8,0,5,3},{0,6,8,2},{0,3,13,0},{0,0,16,0},{1,9,4,2},{4,1,8,3},{1,6,6,3},{0,10,5,1},{0,1,12,3},{4,0,6,6},{3,8,3,2},{0,5,4,7},{1,0,14,1},{0,4,6,6},{3,9,1,3},{3,5,8,0},{3,6,6,1},{5,4,7,0}, - {3,0,12,1},{8,6,1,1},{2,9,5,0},{6,1,1,8},{4,1,2,9},{3,9,4,0},{5,2,9,0},{0,12,3,1},{1,4,10,1},{4,0,7,5},{3,1,2,10},{5,4,2,5},{5,5,5,1},{4,2,3,7},{1,7,5,3},{2,8,0,6},{8,1,2,5},{3,8,2,3},{6,1,2,7},{3,9,2,2},{9,0,0,7},{0,8,6,2},{8,4,3,1},{0,2,8,6},{6,5,1,4},{2,3,5,6},{2,10,3,1},{0,7,0,9},{4,2,7,3},{2,4,8,2},{7,1,1,7},{2,4,7,3}, - {2,4,10,0},{0,1,10,5},{4,7,1,4},{0,10,4,2},{9,0,1,6},{1,9,6,0},{3,3,4,6},{4,5,7,0},{5,5,2,4},{2,8,1,5},{2,3,6,5},{0,1,1,14},{3,2,3,8},{10,1,2,3},{9,1,6,0},{3,4,3,6},{2,2,0,12},{0,0,9,7},{4,0,9,3},{7,0,5,4},{4,5,6,1},{2,5,1,8},{2,5,9,0},{3,5,4,4},{1,3,11,1},{7,1,5,3},{3,2,7,4},{1,4,2,9},{1,11,2,2},{2,2,3,9},{5,0,10,1},{3,2,11,0}, - {1,10,3,2},{8,3,4,1},{3,6,7,0},{0,7,5,4},{1,3,3,9},{2,2,10,2},{1,9,5,1},{0,5,0,11},{3,0,3,10},{0,4,8,4},{2,7,7,0},{2,0,2,12},{1,2,11,2},{6,3,7,0},{0,6,2,8},{0,10,1,5},{0,9,0,7},{6,4,4,2},{6,0,1,9},{1,5,10,0},{5,4,6,1},{5,5,3,3},{0,0,4,12},{0,3,2,11},{1,4,1,10},{3,0,9,4},{5,5,0,6},{1,7,8,0},{2,0,3,11},{6,4,1,5},{10,0,6,0},{0,6,0,10}, - 
{0,4,11,1},{3,1,6,6},{2,5,8,1},{0,2,10,4},{3,1,11,1},{6,6,2,2},{1,1,10,4},{2,1,2,11},{6,1,8,1},{0,2,13,1},{0,7,6,3},{6,8,2,0},{3,0,0,13},{4,4,4,4},{6,2,0,8},{7,3,1,5},{0,11,4,1},{6,7,0,3},{2,6,3,5},{5,2,1,8},{7,1,8,0},{5,5,1,5},{1,8,3,4},{8,2,6,0},{6,0,10,0},{5,6,1,4},{1,4,4,7},{2,7,4,3},{1,4,8,3},{5,4,3,4},{1,10,2,3},{2,9,1,4}, - {2,2,11,1},{2,5,0,9},{0,0,1,15},{0,0,11,5},{0,4,7,5},{0,1,15,0},{2,1,0,13},{0,3,10,3},{8,0,2,6},{3,3,2,8},{3,5,5,3},{1,7,1,7},{1,3,2,10},{4,0,4,8},{2,0,9,5},{1,1,1,13},{2,2,7,5},{2,1,10,3},{4,2,1,9},{4,3,6,3},{1,3,5,7},{2,5,3,6},{1,0,8,7},{5,0,2,9},{2,8,5,1},{1,6,0,9},{0,0,5,11},{0,4,9,3},{2,0,7,7},{1,7,2,6},{2,1,1,12},{2,4,9,1}, - {0,5,7,4},{6,0,4,6},{3,2,10,1},{0,6,1,9},{2,6,1,7},{0,5,8,3},{4,1,0,11},{1,2,4,9},{4,1,5,6},{6,1,0,9},{1,4,3,8},{4,5,1,6},{1,0,5,10},{5,3,1,7},{0,9,1,6},{2,0,1,13},{2,0,6,8},{8,1,1,6},{1,5,9,1},{0,6,9,1},{0,3,5,8},{0,2,9,5},{5,2,8,1},{1,1,14,0},{3,2,9,2},{5,0,8,3},{0,5,10,1},{5,2,3,6},{2,6,7,1},{2,3,0,11},{0,1,9,6},{1,0,4,11}, - {3,0,5,8},{0,0,15,1},{2,4,5,5},{0,3,7,6},{2,0,0,14},{1,1,12,2},{2,6,8,0},{3,1,8,4},{0,1,5,10} - }; - // All total orderings for 16 pixels [0,2] 2-bit selectors. - // BC1 selector order: 0, 1, 2 - // Note this is different from g_unique_total_orders4[], which reorders the selectors into linear order. 
- const uint32_t NUM_UNIQUE_TOTAL_ORDERINGS3 = 153; - static uint8_t g_unique_total_orders3[NUM_UNIQUE_TOTAL_ORDERINGS3][3] = - { - {6,0,10},{3,6,7},{3,0,13},{13,3,0},{12,4,0},{9,1,6},{2,13,1},{4,7,5},{7,5,4},{9,6,1},{7,4,5},{8,6,2},{16,0,0},{10,6,0},{2,7,7}, - {0,0,16},{0,3,13},{1,15,0},{0,2,14},{1,4,11},{15,1,0},{1,12,3},{9,2,5},{14,1,1},{8,2,6},{3,3,10},{4,2,10},{14,0,2},{0,14,2},{1,7,8},{6,6,4}, - {11,5,0},{6,4,6},{11,3,2},{4,3,9},{7,1,8},{10,4,2},{12,1,3},{11,0,5},{9,3,4},{1,0,15},{9,0,7},{2,6,8},{12,2,2},{6,2,8},{6,8,2},{15,0,1}, - {4,8,4},{0,4,12},{8,5,3},{5,9,2},{11,2,3},{12,3,1},{6,3,7},{1,1,14},{2,9,5},{1,8,7},{4,10,2},{7,7,2},{13,1,2},{0,15,1},{3,2,11},{7,0,9}, - {4,4,8},{3,8,5},{0,5,11},{13,2,1},{1,10,5},{4,11,1},{3,10,3},{5,10,1},{10,2,4},{0,6,10},{14,2,0},{11,4,1},{3,12,1},{1,13,2},{1,5,10},{5,11,0}, - {12,0,4},{8,1,7},{6,10,0},{3,13,0},{7,2,7},{0,7,9},{5,8,3},{0,12,4},{11,1,4},{13,0,3},{0,16,0},{5,7,4},{10,3,3},{10,0,6},{0,13,3},{4,6,6}, - {2,8,6},{2,5,9},{7,8,1},{2,1,13},{2,0,14},{7,3,6},{5,1,10},{3,11,2},{5,4,7},{8,3,5},{10,5,1},{6,9,1},{1,3,12},{4,5,7},{2,2,12},{4,1,11}, - {0,8,8},{4,12,0},{6,5,5},{8,7,1},{5,5,6},{3,7,6},{7,9,0},{4,9,3},{0,10,6},{8,0,8},{5,3,8},{10,1,5},{6,1,9},{7,6,3},{9,5,2},{0,1,15}, - {9,7,0},{2,14,0},{3,4,9},{8,4,4},{9,4,3},{0,9,7},{1,9,6},{3,9,4},{5,2,9},{2,3,11},{5,6,5},{1,14,1},{6,7,3},{2,4,10},{2,12,2},{8,8,0}, - {2,10,4},{4,0,12},{0,11,5},{2,11,3},{1,11,4},{3,5,8},{5,0,11},{3,1,12},{1,2,13},{1,6,9} - }; - - // For each total ordering, this table indicates which other total orderings are likely to improve quality using a least squares pass. Each array is sorted by usefulness. 
- static uint16_t g_best_total_orderings4[NUM_UNIQUE_TOTAL_ORDERINGS4][MAX_TOTAL_ORDERINGS4] = - { -#if RGBCX_USE_SMALLER_TABLES - { 202,120,13,318,15,23,403,450,5,51,260,128,77,21,33,494,515,523,4,141,269,1,2,700,137,49,48,102,7,64,753,82 }, - { 13,141,23,217,115,51,77,2,64,21,0,4,5,317,137,269,202,33,318,7,291,352,9,10,3,180,32,6,365,102,341,349 }, - { 29,58,262,1,52,74,6,171,5,287,151,334,27,500,75,26,331,223,53,635,220,19,50,45,46,17,14,396,163,409,324,70 }, - { 40,51,33,453,14,23,62,56,12,196,730,475,153,99,403,775,117,130,585,34,4,17,162,11,139,57,102,38,108,47,123,440 }, - { 33,23,51,13,102,64,202,128,12,40,15,196,153,10,1,2,77,99,141,0,515,5,117,3,120,403,700,165,22,14,269,453 }, - { 13,23,51,4,77,141,202,33,115,64,32,128,0,11,177,40,15,102,2,217,7,137,269,21,90,59,515,1,180,403,22,6 }, - { 26,235,19,47,648,624,78,145,27,112,122,64,444,6,630,453,25,42,65,130,711,85,390,113,416,108,665,29,730,138,644,95 }, - { 64,141,352,751,217,247,237,437,177,269,86,954,947,875,32,318,95,77,304,92,597,180,232,291,128,864,349,588,372,202,312,1 }, - { 642,898,180,638,901,341,82,197,10,951,15,515,165,762,700,253,811,753,752,365,143,479,244,569,8,110,351,873,55,31,499,116 }, - { 221,23,51,125,438,254,13,21,39,49,308,656,0,115,530,159,158,401,30,166,912,386,165,688,518,9,105,627,424,22,421,33 }, - { 143,31,1,44,197,8,180,125,116,55,13,498,23,341,638,242,93,15,2,141,0,901,752,115,36,206,165,479,338,365,515,762 }, - { 12,23,51,13,14,15,37,99,515,38,700,117,2,196,134,153,753,64,54,33,128,120,21,0,328,5,139,82,453,719,457,1 }, - { 13,15,23,515,961,700,457,753,51,115,4,165,197,2,38,569,1,474,0,37,99,719,5,12,629,14,11,3,33,77,64,10 }, - { 15,515,700,753,1,0,2,4,3,23,134,12,961,5,10,197,11,33,82,120,457,51,165,7,6,341,217,21,77,9,40,180 }, - { 13,51,23,457,719,961,730,401,165,453,0,117,386,15,134,1,758,153,12,54,515,99,11,2,700,5,753,4,308,33,6,899 }, - { 134,898,82,117,13,33,77,102,23,260,341,351,120,901,197,153,961,111,196,110,180,457,854,10,450,8,165,40,4,115,0,365 }, - { 
60,18,126,167,35,16,191,71,24,92,121,271,68,107,212,146,118,150,199,7,21,1,9,575,727,5,566,48,0,132,108,273 }, - { 62,136,129,123,128,41,162,17,249,211,214,789,618,710,38,678,248,507,57,64,152,269,119,3,177,183,597,106,4,179,216,90 }, - { 403,523,51,475,494,453,817,899,202,23,450,13,421,120,102,730,33,128,4,1,805,5,7,153,757,260,318,196,77,457,326,65 }, - { 4,59,3,62,12,33,56,193,27,21,102,17,40,77,76,84,32,0,6,123,119,177,128,11,18,611,605,25,13,51,73,210 }, - { 43,20,319,422,414,945,0,7,819,61,5,376,325,173,804,904,470,693,97,707,14,49,22,104,147,107,95,32,426,1,330,577 }, - { 13,23,51,2,0,115,4,141,217,33,10,77,1,15,64,180,3,515,7,6,22,102,11,5,40,9,165,700,202,197,317,341 }, - { 28,49,0,105,1,24,65,159,35,55,95,239,16,2,109,7,9,14,170,320,347,168,424,158,10,301,124,5,67,21,64,36 }, - { 15,515,700,753,0,1,13,2,117,4,12,10,5,165,457,3,9,134,11,7,6,51,77,64,961,82,33,197,14,341,120,141 }, - { 7,71,14,149,97,18,60,16,150,92,398,189,140,124,24,273,35,2,69,302,154,68,0,336,517,43,66,28,118,251,230,1 }, - { 4,102,33,77,40,59,11,624,210,12,128,342,5,503,91,139,64,32,25,494,202,678,416,0,403,275,21,450,196,318,523,177 }, - { 25,19,42,6,122,813,256,235,85,26,436,53,297,573,680,390,445,63,27,416,80,233,65,73,389,283,45,605,194,17,250,343 }, - { 402,102,202,128,33,300,403,23,12,77,40,21,342,117,483,99,25,494,6,4,63,32,84,569,139,757,475,318,19,26,196,134 }, - { 158,9,0,109,39,49,65,22,35,168,55,24,68,124,159,16,185,344,333,154,254,272,175,289,1,577,95,28,105,810,30,169 }, - { 197,180,115,237,498,165,2,5,287,546,400,3,61,34,509,13,297,80,341,52,45,186,58,881,23,873,468,176,64,17,311,250 }, - { 120,968,373,260,704,110,450,202,137,318,77,95,269,326,217,717,661,652,851,349,93,1,518,98,827,291,21,177,82,33,848,719 }, - { 44,116,144,268,434,489,367,384,98,127,918,93,948,31,206,940,855,0,203,137,9,22,617,141,332,105,393,492,959,282,299,131 }, - { 13,77,23,33,51,0,64,141,102,4,2,115,1,6,202,15,10,128,269,7,177,180,3,40,22,11,515,217,117,318,700,137 }, - { 
15,515,700,753,4,11,141,40,165,23,64,180,13,202,32,3,51,125,5,197,21,128,0,93,77,1,120,82,269,117,110,59 }, - { 176,231,585,62,34,14,412,161,56,236,527,57,17,3,51,202,4,23,369,283,128,13,472,440,84,361,136,457,381,130,719,53 }, - { 9,0,180,217,237,101,141,352,88,100,230,64,175,317,115,498,68,39,30,1,702,83,213,36,365,208,752,13,252,321,952,546 }, - { 28,9,22,1,49,0,109,39,83,95,86,30,13,105,128,55,141,168,158,67,31,159,208,12,96,5,185,2,160,64,137,23 }, - { 72,4,38,12,51,89,477,11,57,76,401,308,23,474,99,148,413,179,59,13,431,152,54,569,17,3,205,629,197,421,405,15 }, - { 457,13,23,961,15,51,515,700,165,12,753,629,11,1,719,117,0,3,2,37,569,197,40,328,33,5,153,134,99,64,38,196 }, - { 254,100,310,9,30,1,39,625,166,265,190,0,272,557,131,731,31,98,578,688,404,93,101,88,49,21,127,264,44,36,252,478 }, - { 51,23,12,13,15,128,99,120,10,202,515,153,64,82,700,33,165,2,5,117,403,1,141,0,3,196,37,453,753,197,260,93 }, - { 38,99,542,139,453,117,196,23,457,13,328,111,37,134,961,11,12,51,40,775,587,401,474,54,153,477,41,629,33,475,14,277 }, - { 6,85,25,233,343,91,26,63,138,29,19,65,283,4,81,235,42,122,605,64,648,256,174,370,74,389,718,59,45,194,445,416 }, - { 49,5,97,20,197,21,18,193,0,64,408,729,173,350,43,422,165,7,14,104,61,32,509,713,523,102,120,95,125,397,35,232 }, - { 144,116,268,434,384,489,367,206,93,855,940,44,98,332,617,127,959,911,137,282,203,31,22,219,141,9,131,276,417,0,1,120 }, - { 17,106,64,62,32,255,136,292,476,162,129,241,123,141,41,237,720,214,209,352,519,211,186,148,752,247,507,90,21,77,197,119 }, - { 2,29,52,50,5,58,14,6,27,1,366,357,45,53,17,19,171,151,26,181,133,38,218,764,287,583,61,113,3,487,600,281 }, - { 130,59,196,412,381,730,711,236,77,210,202,402,453,99,401,108,361,803,291,283,153,4,57,51,128,183,14,719,503,117,23,11 }, - { 13,23,51,141,77,4,33,64,115,0,217,10,180,202,2,102,11,9,15,165,40,21,128,352,22,7,197,3,317,515,269,1 }, - { 23,13,202,51,120,15,21,5,141,1,128,269,137,515,64,102,125,48,98,33,260,523,318,93,700,165,450,77,2,12,403,82 }, - { 
1,2,14,46,29,67,38,52,5,171,58,24,103,69,96,70,83,181,54,75,163,223,16,45,112,309,155,0,186,35,18,108 }, - { 15,515,700,753,13,0,1,2,153,5,23,10,117,3,9,7,134,165,12,6,341,33,4,14,77,457,115,21,719,180,217,82 }, - { 197,165,509,13,391,180,308,115,23,546,5,498,2,29,3,401,901,61,34,80,14,457,250,569,237,873,38,297,45,15,468,386 }, - { 19,73,27,250,200,714,444,472,26,53,34,17,813,322,283,390,128,297,78,123,432,14,436,136,106,690,57,122,389,80,503,3 }, - { 3,17,21,45,62,32,38,12,155,14,2,328,5,99,401,536,828,13,227,488,106,51,719,119,540,76,165,221,115,629,209,41 }, - { 115,341,873,197,365,13,901,180,569,752,317,1,10,498,143,634,261,0,509,15,943,237,44,31,116,601,165,127,282,23,141,64 }, - { 453,51,23,403,33,421,475,102,15,153,196,515,13,700,117,523,12,40,753,21,4,134,0,494,670,899,22,801,730,10,11,401 }, - { 23,13,51,33,12,117,153,134,453,196,15,99,515,40,14,700,128,102,11,753,77,64,403,202,0,401,475,37,65,2,3,38 }, - { 2,7,5,14,70,1,29,61,52,45,6,112,66,16,21,32,592,46,38,135,87,58,186,315,290,128,113,0,64,48,227,23 }, - { 33,23,102,51,128,13,64,202,141,1,77,10,153,40,196,117,2,3,0,5,15,269,403,12,137,134,318,165,120,6,453,99 }, - { 16,92,7,20,43,35,126,71,60,14,107,18,68,97,0,121,279,149,24,246,191,48,118,575,55,140,362,783,230,150,375,566 }, - { 13,23,4,33,77,64,51,102,141,128,32,10,0,202,40,115,59,22,90,11,177,21,291,6,7,318,180,117,137,2,95,165 }, - { 507,162,129,41,4,211,62,38,123,59,57,248,183,130,99,11,3,361,202,17,402,556,266,305,803,210,128,184,152,136,313,117 }, - { 643,123,193,650,802,18,25,389,718,256,65,289,84,91,619,511,415,90,235,63,57,510,324,216,862,102,6,183,108,397,217,736 }, - { 13,23,15,1,515,51,0,2,700,5,753,165,141,115,12,3,4,180,21,197,457,7,6,10,120,9,33,202,77,32,8,11 }, - { 23,51,13,453,64,403,12,21,5,202,128,475,165,141,523,95,125,115,3,1,4,730,120,32,2,494,180,719,457,197,450,401 }, - { 204,74,135,66,6,174,192,7,138,172,85,353,348,580,280,97,95,500,29,64,426,32,87,889,65,81,25,2,52,43,568,673 }, - { 
35,0,68,69,24,9,1,16,65,103,149,133,18,114,28,50,83,2,189,7,46,14,101,336,175,124,251,55,71,218,38,238 }, - { 16,101,0,118,9,18,24,68,35,154,71,124,60,212,191,520,55,806,694,167,28,39,364,375,1,346,252,65,604,302,22,21 }, - { 0,9,16,35,1,24,68,18,65,21,103,67,13,149,28,189,71,23,101,238,114,7,335,133,486,141,22,212,48,50,30,118 }, - { 13,202,23,77,33,51,128,5,21,141,115,32,102,64,4,0,318,269,10,15,291,2,494,177,11,217,3,515,22,137,6,700 }, - { 16,92,60,35,7,18,24,68,150,149,14,71,0,375,97,126,118,107,230,191,246,273,140,55,175,653,9,575,2,28,566,517 }, - { 76,90,21,179,316,148,205,32,464,288,184,257,245,1,89,2,460,57,152,45,38,358,645,5,12,449,350,48,37,17,4,14 }, - { 19,27,26,813,80,297,17,495,436,53,73,200,4,378,250,59,106,25,45,128,361,42,113,469,122,390,77,40,736,6,11,136 }, - { 6,26,235,138,19,145,112,70,331,262,25,42,52,624,27,453,122,47,500,78,648,85,29,2,630,632,409,113,50,226,108,75 }, - { 7,16,14,24,92,35,18,2,46,9,60,140,0,87,50,5,54,13,12,38,171,23,126,21,58,64,1,70,128,71,220,163 }, - { 90,205,257,184,32,179,460,5,245,45,2,288,769,524,57,21,152,229,17,1,497,4,292,59,619,452,432,76,476,11,266,14 }, - { 15,515,700,753,4,5,11,141,13,1,33,3,0,128,202,23,180,21,2,64,269,32,117,134,120,40,102,318,153,17,137,352 }, - { 47,130,711,108,453,412,730,196,390,283,78,27,51,183,381,236,128,200,719,14,153,472,503,34,59,250,3,4,57,803,123,432 }, - { 12,277,51,474,111,153,23,99,13,37,961,94,629,542,569,431,79,139,38,134,117,453,33,188,196,40,115,15,11,157,401,515 }, - { 17,495,469,106,26,378,80,27,161,483,19,742,527,436,383,862,73,136,53,814,297,6,119,84,62,56,25,3,209,611,4,128 }, - { 81,681,636,91,0,750,370,104,718,138,18,693,173,784,29,397,348,74,192,673,174,65,6,207,64,280,306,52,671,32,355,319 }, - { 15,515,700,753,33,77,4,102,115,117,40,13,1,153,134,11,5,217,23,196,2,21,3,317,32,365,0,341,291,59,12,51 }, - { 0,9,28,35,68,1,65,67,101,39,69,175,16,238,13,22,96,124,18,24,251,30,55,12,23,2,50,141,114,5,154,103 }, - { 
23,33,77,13,117,40,11,102,64,4,51,403,153,453,10,0,196,134,128,65,12,291,86,99,95,59,15,141,202,180,137,719 }, - { 214,90,289,6,874,64,25,65,235,42,751,249,256,312,194,85,746,875,174,32,525,288,519,835,247,348,233,544,217,524,437,352 }, - { 1,22,2,0,36,67,28,5,49,95,12,50,168,83,105,55,7,9,14,194,103,23,114,21,584,46,10,13,38,69,208,159 }, - { 269,141,13,202,33,180,318,77,291,137,102,352,128,23,349,51,31,217,372,317,125,197,44,21,11,5,901,1,18,0,4,494 }, - { 435,144,274,88,203,418,30,1,190,410,96,778,100,530,521,326,466,795,686,166,960,321,382,264,367,822,131,31,692,9,213,93 }, - { 76,72,90,21,37,179,12,205,32,428,148,38,308,405,4,413,57,184,749,245,316,221,54,645,288,1,152,155,464,257,2,14 }, - { 77,33,64,102,13,141,23,2,40,1,51,10,0,115,6,180,202,128,4,3,177,269,15,7,22,165,291,14,217,318,137,11 }, - { 397,81,4,32,65,788,693,804,681,11,249,21,91,64,690,494,3,0,422,56,348,725,194,123,23,59,523,319,61,510,95,90 }, - { 60,126,16,7,92,121,314,246,35,107,150,132,14,146,24,18,199,298,232,71,359,140,672,97,392,649,5,423,95,21,22,388 }, - { 15,515,141,217,115,700,13,23,120,317,753,180,33,260,110,137,341,51,1,365,4,77,64,202,0,40,36,352,197,269,10,21 }, - { 111,134,117,474,23,13,961,12,569,431,37,15,51,115,515,700,277,99,753,38,197,405,457,4,72,94,629,45,11,89,54,148 }, - { 23,13,51,5,1,15,2,21,12,202,141,0,515,165,120,32,4,64,700,3,115,197,269,125,753,7,9,128,6,180,453,403 }, - { 13,141,4,23,5,2,115,217,202,51,180,137,269,352,77,1,317,3,21,318,0,15,9,64,10,197,11,341,33,515,752,7 }, - { 165,125,197,13,391,21,23,558,48,380,97,120,298,33,14,426,66,115,32,386,900,180,6,98,357,237,326,509,51,278,221,457 }, - { 120,82,15,260,515,1,351,77,450,700,13,21,141,23,753,202,217,93,110,33,51,854,5,128,326,102,137,180,817,48,269,352 }, - { 23,13,15,51,515,700,961,753,0,457,1,2,4,115,10,453,569,5,33,165,11,719,14,40,64,197,3,21,474,629,38,401 }, - { 264,166,39,30,9,100,435,254,93,921,190,363,1,625,411,382,897,656,203,478,404,812,438,110,473,88,18,691,156,141,274,272 }, - { 
9,0,252,100,166,39,101,265,364,68,88,329,520,18,419,676,118,167,404,604,16,1,21,30,212,158,553,49,382,274,48,13 }, - { 15,515,700,753,4,11,141,5,3,13,202,1,180,21,2,165,269,23,40,64,0,318,12,32,128,51,77,117,523,197,120,457 }, - { 24,1,2,69,35,16,67,18,14,50,0,46,68,9,38,7,133,71,83,149,28,108,189,218,65,114,238,29,75,54,5,96 }, - { 90,289,214,64,874,13,77,712,66,751,4,23,51,192,32,0,202,194,312,177,33,65,234,104,875,288,59,5,835,416,102,95 }, - { 0,9,49,127,98,31,301,28,371,159,1,395,512,737,158,761,916,623,16,44,242,39,170,18,293,105,24,272,101,22,23,385 }, - { 17,62,136,214,123,129,32,292,119,209,710,106,141,162,128,64,45,4,77,249,11,618,211,3,207,130,519,183,38,177,21,269 }, - { 5,107,581,356,279,32,441,362,493,660,13,298,0,534,49,147,21,22,132,121,97,423,7,590,259,683,14,786,126,508,60,246 }, - { 51,13,15,730,453,23,515,719,386,457,12,700,403,475,899,1,6,523,753,421,99,401,165,33,2,19,361,5,0,670,120,27 }, - { 49,28,9,159,272,22,254,131,158,327,95,105,0,39,35,168,347,286,374,55,65,627,424,912,68,578,1,24,239,175,688,169 }, - { 15,515,700,33,753,4,77,141,341,317,1,10,13,180,102,22,40,117,115,365,5,901,23,197,134,11,217,351,64,82,21,137 }, - { 134,15,13,515,23,700,12,753,51,474,37,961,197,10,457,569,4,0,99,2,115,38,165,153,94,3,139,11,1,82,33,5 }, - { 7,2,20,58,5,14,128,66,6,29,32,43,21,52,16,38,631,61,74,97,46,135,113,25,202,192,13,0,884,45,112,87 }, - { 77,13,33,202,23,128,102,4,141,342,117,0,269,318,134,22,11,21,32,153,403,291,49,64,137,51,40,15,494,5,196,98 }, - { 2,1,14,6,46,38,29,65,5,36,67,0,103,7,22,86,133,50,108,208,52,83,24,323,283,69,28,18,10,25,23,75 }, - { 15,515,700,753,1,5,4,2,3,13,0,11,180,341,12,33,10,197,134,365,77,23,21,901,6,117,165,7,37,32,17,102 }, - { 203,268,206,93,417,940,31,8,120,137,44,499,959,473,202,692,728,559,0,260,10,326,141,564,817,127,341,1,450,22,110,23 }, - { 15,82,515,120,700,0,10,753,33,8,64,165,110,31,260,93,13,197,23,22,40,4,351,44,77,9,11,153,102,51,1,196 }, - { 
60,0,16,7,14,43,20,71,28,10,2,22,154,18,13,24,92,1,51,576,35,615,805,925,68,126,124,149,97,64,23,55 }, - { 19,6,26,80,5,84,27,17,25,2,504,129,45,240,56,123,4,119,618,1,76,106,64,51,14,3,128,65,32,710,0,42 }, - { 15,515,700,753,13,4,77,23,33,51,0,5,8,10,11,31,44,1,82,22,202,64,110,102,93,21,291,40,141,180,9,49 }, - { 195,98,271,223,132,167,146,407,1,360,121,834,393,591,212,199,293,259,522,107,354,147,156,191,807,590,48,18,125,16,765,541 }, - { 128,202,77,210,402,318,33,102,6,40,403,29,342,269,196,757,99,139,2,111,42,4,494,117,275,300,13,12,678,0,177,122 }, - { 13,33,23,40,51,102,4,117,77,64,134,0,128,153,202,196,453,11,15,12,1,22,403,141,59,14,10,475,515,65,700,95 }, - { 7,16,14,24,18,2,28,0,92,71,1,22,6,35,60,20,168,10,154,118,5,302,124,69,97,109,703,158,420,12,149,66 }, - { 15,1,515,23,0,13,700,2,51,753,180,5,120,165,197,21,115,4,33,9,141,7,12,6,3,457,386,202,260,523,8,31 }, - { 60,107,121,132,146,126,199,279,150,92,16,649,441,35,955,7,21,0,423,5,18,195,598,298,493,356,32,653,22,362,953,10 }, - { 31,44,98,276,284,299,116,935,9,201,0,131,39,127,144,662,1,137,371,492,567,489,93,254,49,268,22,28,30,293,434,737 }, - { 13,15,23,515,700,0,1,51,753,4,2,10,77,202,5,115,3,165,197,457,9,12,11,961,33,120,22,141,180,7,6,40 }, - { 123,162,184,257,17,183,229,130,129,3,84,136,99,152,556,383,57,497,12,205,4,62,56,452,80,266,128,14,40,119,27,106 }, - { 196,33,117,40,153,23,134,13,51,102,453,0,15,475,12,14,515,2,22,700,4,21,753,64,401,670,730,1,9,11,10,99 }, - { 224,219,187,131,258,385,442,871,836,31,98,908,44,574,127,944,137,839,116,36,613,1,254,39,926,160,829,96,93,371,860,827 }, - { 121,195,156,132,146,360,590,407,786,522,883,591,259,929,626,941,150,687,5,55,296,379,467,178,586,465,279,21,1,13,60,354 }, - { 2,1,14,29,6,5,46,52,38,19,114,75,26,65,108,96,25,50,36,70,103,309,17,236,218,74,12,86,0,3,10,112 }, - { 15,515,82,700,120,753,10,0,8,197,260,165,351,64,13,110,117,93,31,1,9,33,22,23,457,44,450,77,102,898,40,49 }, - { 
7,66,97,2,172,74,226,52,29,135,192,232,43,324,92,5,38,20,222,14,6,568,87,107,353,620,580,16,138,174,448,32 }, - { 62,129,123,162,136,249,618,183,507,57,4,152,17,59,11,184,117,77,3,128,211,41,130,205,12,40,33,106,64,229,38,313 }, - { 1,13,15,2,4,515,23,0,3,115,700,5,51,77,341,141,753,180,33,217,197,202,901,6,21,165,11,365,318,317,10,102 }, - { 6,26,235,19,145,47,112,78,64,27,453,95,29,444,25,624,85,108,648,70,32,130,74,42,711,630,632,138,65,122,113,730 }, - { 23,51,12,15,13,99,515,153,117,10,700,37,120,82,165,2,753,64,128,0,403,3,5,1,134,197,453,31,202,457,110,21 }, - { 16,24,18,71,64,35,92,7,246,146,9,108,60,118,199,5,140,2,267,0,230,830,32,133,1,68,50,330,247,563,36,12 }, - { 15,515,700,753,0,1,13,2,23,3,4,217,51,5,115,8,9,180,341,10,7,6,317,77,33,372,901,197,365,11,120,165 }, - { 234,639,178,202,77,142,5,455,450,49,416,0,147,427,198,21,315,329,13,318,325,557,120,344,113,259,22,128,61,105,23,494 }, - { 1,31,36,44,141,180,55,2,64,22,98,116,13,352,0,115,10,127,5,164,253,498,237,165,341,197,4,86,15,170,125,23 }, - { 15,120,13,141,23,260,217,515,1,77,51,110,180,700,317,82,269,137,115,202,21,753,64,5,351,291,0,450,352,93,36,326 }, - { 26,6,112,396,19,145,25,122,648,287,42,74,624,222,416,45,138,66,644,151,113,651,29,573,64,280,445,27,525,85,70,58 }, - { 156,360,5,146,121,21,271,522,354,132,49,13,18,195,16,340,60,591,446,586,727,0,107,407,167,48,1,463,199,566,32,23 }, - { 5,61,49,147,178,612,660,120,21,182,23,427,259,683,33,4,77,70,13,3,376,98,64,0,481,344,48,595,291,263,141,51 }, - { 89,79,468,179,358,205,94,405,115,498,72,180,365,431,37,111,341,734,188,317,482,217,11,4,245,152,413,216,12,474,490,752 }, - { 24,16,35,68,18,71,7,92,0,108,9,14,118,101,336,175,375,302,28,124,154,55,149,60,398,1,65,2,140,273,345,230 }, - { 51,730,421,801,453,386,23,523,13,475,719,401,670,365,899,403,115,457,758,165,33,494,450,6,423,805,629,56,569,514,958,388 }, - { 113,45,6,311,29,2,151,614,145,491,112,80,5,27,61,74,315,66,209,631,19,25,58,17,73,26,1,243,70,64,611,287 }, - { 
4,339,188,471,11,59,79,12,377,94,99,33,77,102,51,111,37,152,13,961,474,542,40,342,3,23,128,403,202,177,184,57 }, - { 15,4,515,11,700,33,82,40,0,120,753,10,8,110,13,93,23,165,77,260,64,31,22,51,44,102,351,1,125,9,197,21 }, - { 16,24,18,0,35,68,28,71,124,118,60,7,9,55,14,92,109,101,419,175,22,252,154,375,149,302,158,346,2,49,1,126 }, - { 17,45,227,21,106,3,2,243,209,5,48,32,221,62,207,50,29,186,290,270,263,52,14,496,400,119,46,255,54,430,38,721 }, - { 340,354,586,658,156,195,698,668,1,296,9,18,883,363,447,379,303,98,411,13,31,163,51,5,371,48,919,846,121,21,360,70 }, - { 277,153,111,12,23,51,474,99,38,37,139,117,41,457,79,453,542,13,11,33,134,157,629,188,961,14,196,401,102,569,15,94 }, - { 0,18,16,159,49,24,9,105,35,68,7,28,22,1,60,344,55,101,109,2,14,158,13,23,71,118,455,286,272,424,5,327 }, - { 0,105,9,49,16,18,158,28,518,24,101,320,1,68,170,301,272,127,7,286,35,890,109,39,159,98,21,344,31,55,371,23 }, - { 141,1,180,15,13,2,365,217,515,352,317,115,341,0,4,5,269,700,23,21,3,752,197,77,753,51,31,901,10,202,8,64 }, - { 4,23,51,33,19,17,102,153,485,880,40,403,196,26,300,453,27,117,78,0,12,200,47,5,11,14,342,99,53,77,475,2 }, - { 62,184,56,440,130,229,183,3,556,152,99,162,12,266,17,548,136,57,305,161,123,14,452,4,383,403,257,34,40,84,33,139 }, - { 13,23,77,141,64,202,33,51,269,115,0,102,21,4,217,128,5,32,318,137,291,9,15,2,180,10,3,317,177,515,7,6 }, - { 1,22,36,105,170,0,86,2,31,28,239,64,55,5,10,98,9,44,127,95,654,67,301,143,13,12,49,23,320,141,83,21 }, - { 15,515,700,753,0,1,13,2,23,901,5,8,51,82,9,180,457,4,7,12,3,6,10,120,341,141,22,898,197,351,115,260 }, - { 1,39,274,98,100,265,190,30,438,310,166,223,88,96,909,31,264,625,530,9,382,812,21,252,593,0,254,539,44,131,23,778 }, - { 18,212,167,118,363,1,447,411,146,60,271,16,781,121,647,9,621,562,21,478,664,68,815,5,354,98,48,101,24,446,777,463 }, - { 24,28,22,0,7,1,2,16,14,65,35,49,158,95,109,159,55,105,10,18,124,9,67,5,239,149,12,289,108,68,21,424 }, - { 
105,22,131,272,286,98,55,239,1,31,320,9,127,327,36,185,28,374,86,219,0,64,187,44,578,164,224,913,535,115,601,13 }, - { 22,31,28,301,127,98,44,0,105,1,512,395,9,293,109,299,95,338,239,125,242,116,36,320,55,841,900,685,599,23,13,763 }, - { 2,1,58,29,5,14,52,46,186,334,45,155,151,50,400,75,38,69,502,61,48,227,223,7,163,17,262,67,549,21,70,113 }, - { 7,107,135,232,97,14,2,92,66,16,172,192,278,387,298,356,38,35,448,52,46,43,60,29,20,126,324,526,357,359,64,5 }, - { 20,43,104,426,173,7,560,414,707,784,319,81,0,861,422,819,38,74,715,52,376,97,879,32,330,22,49,64,66,95,192,526 }, - { 104,74,636,66,204,0,355,81,222,25,29,319,145,784,20,65,90,4,174,194,7,64,6,746,138,173,750,715,91,43,192,32 }, - { 0,9,101,35,68,39,65,28,252,124,67,154,364,336,100,166,30,1,289,55,149,346,16,114,158,88,439,24,429,22,570,194 }, - { 57,14,4,231,236,585,176,59,369,23,361,13,719,51,300,342,12,457,56,3,62,38,202,401,34,46,2,322,11,215,210,507 }, - { 1,2,15,3,141,0,515,5,33,700,13,64,77,180,6,128,753,10,4,269,102,202,11,7,134,197,352,120,117,318,12,291 }, - { 5,1,21,202,13,32,48,23,0,61,259,22,494,120,70,49,51,18,137,128,465,12,178,115,2,453,403,141,58,3,90,450 }, - { 141,205,4,72,59,79,245,11,352,94,152,76,247,216,21,188,452,217,497,12,89,37,111,339,588,77,64,875,864,115,358,464 }, - { 15,515,700,753,0,1,2,13,5,4,23,3,8,341,365,51,115,10,120,457,6,141,77,197,31,7,165,9,202,450,961,260 }, - { 5,2,50,14,58,38,171,46,29,1,45,186,17,52,155,218,48,281,61,487,54,36,67,21,328,334,151,227,760,114,400,133 }, - { 457,120,70,125,318,64,23,48,795,291,202,761,751,415,77,846,269,758,21,237,96,260,391,165,87,1,128,5,221,13,137,763 }, - { 13,23,51,33,4,40,117,102,453,64,153,196,0,77,15,11,12,475,1,65,134,10,515,22,21,14,700,59,403,141,2,753 }, - { 229,152,57,266,452,381,432,12,313,184,99,471,17,4,62,339,157,3,129,59,128,11,369,37,77,38,40,123,5,497,188,257 }, - { 49,28,109,22,159,9,272,95,105,131,55,35,254,168,39,327,169,0,1,286,175,374,347,158,420,67,36,194,312,424,627,346 }, - { 
5,2,61,29,45,58,80,311,1,17,209,227,52,243,106,869,454,151,592,496,48,334,14,155,6,186,46,171,75,21,255,667 }, - { 244,44,110,141,260,30,269,352,839,131,574,228,373,276,1,406,219,717,217,137,253,224,120,93,36,31,567,116,661,187,341,88 }, - { 12,99,79,139,11,453,196,51,277,474,111,23,542,37,94,188,33,13,401,775,40,961,313,102,4,339,153,485,629,134,300,431 }, - { 16,35,9,0,68,24,149,69,67,18,1,114,65,230,71,7,103,133,50,167,212,118,101,191,140,64,399,28,124,283,55,565 }, - { 88,30,274,435,131,613,190,100,93,829,166,1,187,795,530,127,382,957,960,160,31,137,466,264,39,800,406,254,28,473,521,219 }, - { 167,16,18,118,212,24,60,71,101,68,191,9,375,411,363,35,0,1,589,199,302,21,447,55,146,126,92,271,647,121,562,48 }, - { 64,141,86,177,77,128,147,597,304,95,269,102,275,4,352,49,120,5,372,194,465,13,588,237,947,216,202,180,612,751,107,534 }, - { 18,65,90,403,523,289,240,214,194,102,701,475,202,217,283,862,389,51,33,0,494,421,453,817,84,64,847,899,352,13,23,437 }, - { 13,51,23,202,5,12,21,128,15,115,0,1,141,120,64,32,4,2,515,403,165,457,3,10,700,99,453,318,719,450,308,401 }, - { 98,223,393,31,1,271,834,791,167,44,202,64,93,697,5,116,77,125,450,446,212,18,541,293,51,120,195,132,284,13,807,765 }, - { 15,515,700,753,4,11,23,13,40,51,82,165,0,110,93,33,141,64,120,5,10,77,3,102,180,32,202,125,8,197,31,21 }, - { 15,515,700,753,0,1,13,2,901,23,5,341,3,51,82,8,4,180,961,9,115,10,12,6,898,7,351,141,134,22,31,120 }, - { 234,416,77,5,315,639,325,202,147,198,113,49,450,61,455,142,0,21,22,342,329,494,178,58,102,427,318,230,13,120,43,470 }, - { 60,146,16,18,156,126,121,271,199,360,132,24,167,0,640,10,71,522,21,92,5,340,107,354,118,150,22,195,446,35,28,212 }, - { 4,361,11,14,56,368,377,161,27,12,300,77,59,200,17,554,202,33,40,494,495,21,210,80,757,25,128,23,19,38,444,53 }, - { 141,82,217,351,15,352,120,1,180,260,515,64,854,36,700,317,752,372,13,269,77,753,922,21,349,23,202,110,93,137,51,373 }, - { 
15,515,700,753,77,13,0,1,23,33,102,2,51,4,3,5,291,217,10,9,450,120,341,7,317,6,11,117,115,8,260,180 }, - { 15,515,120,13,700,23,77,141,1,260,0,753,180,51,137,202,115,365,110,291,217,5,128,9,21,341,197,269,2,450,317,165 }, - { 174,6,348,85,138,74,280,204,66,233,192,355,289,65,81,580,636,353,25,91,104,343,673,214,64,95,42,712,792,32,194,90 }, - { 152,497,452,59,4,216,11,79,94,77,128,188,269,339,588,33,76,529,318,32,141,471,12,202,111,21,5,51,37,90,72,177 }, - { 417,499,10,141,253,244,110,559,8,564,180,260,728,120,352,638,642,341,951,206,143,752,901,93,137,661,922,373,44,31,811,197 }, - { 13,77,23,33,4,51,0,102,128,59,141,40,64,115,177,10,137,22,202,2,7,11,90,1,117,180,269,14,49,6,134,3 }, - { 1,2,22,0,36,5,67,50,14,28,12,86,38,46,83,168,194,65,103,114,49,7,10,95,21,69,23,24,128,51,55,13 }, - { 17,106,119,207,255,306,742,378,84,62,136,45,3,5,240,80,61,56,209,383,311,790,655,32,2,440,76,151,58,29,179,263 }, - { 3,128,1,141,2,202,33,5,64,15,0,515,102,13,269,10,700,180,134,51,120,6,77,318,23,137,17,117,753,197,82,153 }, - { 514,38,377,328,11,57,41,248,880,266,556,4,152,361,471,757,485,403,305,102,3,211,313,99,457,130,12,14,157,40,23,54 }, - { 68,0,167,101,9,118,264,520,16,18,21,478,562,1,124,212,100,936,664,777,191,88,806,154,48,24,759,604,35,252,265,65 }, - { 230,689,699,213,466,352,217,831,30,443,418,144,854,201,840,855,1,251,203,317,530,957,96,93,822,539,36,752,351,137,83,800 }, - { 33,77,102,117,15,82,13,134,23,64,0,515,120,153,51,4,40,128,700,260,202,141,196,22,753,11,351,10,1,326,95,269 }, - { 11,40,33,51,117,13,542,328,14,134,38,153,23,12,485,231,102,54,775,37,3,377,111,139,211,4,457,403,369,475,99,719 }, - { 33,64,77,128,141,2,1,202,102,13,23,117,0,15,3,153,51,134,10,40,6,5,515,269,137,180,318,165,700,7,196,753 }, - { 15,515,700,753,4,1,5,11,13,21,33,180,93,141,64,2,23,77,82,3,0,102,32,40,352,341,10,197,98,110,117,901 }, - { 1,2,14,67,50,46,38,24,103,83,0,5,36,28,29,133,114,96,65,52,18,75,54,108,22,7,238,58,160,9,361,69 }, - { 
258,201,276,137,160,860,116,261,295,843,567,144,131,44,187,268,943,219,284,31,202,935,141,98,662,203,127,96,36,93,224,1 }, - { 7,2,14,16,46,87,75,52,92,278,29,38,140,70,1,5,35,294,24,262,135,69,171,172,58,409,112,60,50,66,97,12 }, - { 13,23,0,2,51,1,33,4,115,10,15,141,77,3,5,180,217,515,9,7,64,11,700,6,102,40,197,22,317,753,165,202 }, - { 74,145,6,66,25,204,42,29,222,337,138,26,7,525,192,174,746,287,544,135,415,2,609,632,112,64,87,0,85,45,712,396 }, - { 77,33,102,15,217,13,23,141,202,515,51,700,291,4,269,753,317,180,21,64,318,115,128,0,275,2,352,196,3,5,137,11 }, - { 187,219,258,871,44,442,160,574,137,224,908,116,839,131,36,926,276,201,93,228,202,860,31,613,144,531,406,1,902,30,190,318 }, - { 1,372,141,5,21,77,225,744,96,30,23,349,13,291,269,284,69,442,459,144,303,839,217,622,160,330,260,48,120,410,189,352 }, - { 66,222,2,74,29,87,135,6,7,145,52,25,294,337,226,172,138,331,42,70,97,112,26,1,632,192,43,5,415,609,461,353 }, - { 45,17,106,209,5,2,21,29,48,207,3,186,243,155,255,263,454,119,400,496,270,14,290,62,425,1,171,32,659,52,38,56 }, - { 93,88,141,120,30,213,260,373,100,717,459,82,110,1,166,450,180,321,217,372,36,269,131,225,22,352,326,466,473,187,244,410 }, - { 266,57,152,381,313,471,12,229,99,369,339,62,157,3,4,37,77,38,188,17,11,162,40,184,129,59,475,775,128,452,403,453 }, - { 217,352,317,141,752,15,180,515,372,365,700,341,753,349,77,21,291,1,115,244,64,120,13,98,269,82,5,498,864,351,23,144 }, - { 14,514,369,102,403,377,51,719,880,153,23,13,457,11,485,4,401,12,328,453,33,40,117,57,629,38,730,236,134,670,361,961 }, - { 107,7,172,14,92,135,2,359,60,314,46,16,126,278,232,150,279,32,38,392,298,5,35,97,24,192,259,288,330,52,356,312 }, - { 0,4,25,13,59,90,65,23,26,19,18,12,5,216,91,51,389,33,77,11,22,85,27,81,21,177,746,45,42,194,37,123 }, - { 5,49,315,202,416,77,455,639,450,21,197,137,350,13,408,0,329,318,494,344,61,402,64,509,347,120,113,48,95,713,308,401 }, - { 
130,47,381,390,59,90,200,214,289,6,65,472,29,64,874,648,50,751,624,26,52,32,4,194,875,714,85,249,247,33,881,19 }, - { 51,23,453,13,719,12,457,165,37,730,99,4,386,197,401,17,11,2,3,15,5,961,475,6,515,64,54,700,32,115,0,403 }, - { 15,515,1,13,700,2,23,0,753,5,3,180,51,4,165,12,141,21,197,457,7,115,6,9,352,10,120,202,8,341,11,77 }, - { 0,9,1,67,35,28,68,16,24,65,18,69,50,114,103,12,22,13,5,101,2,96,23,83,149,21,39,55,7,175,433,124 }, - { 28,105,22,0,1,320,170,9,49,301,109,95,127,31,98,55,65,35,2,24,168,159,36,713,16,740,13,338,21,44,512,23 }, - { 13,77,4,51,23,33,102,202,128,59,40,0,64,141,117,403,115,11,15,318,153,269,22,515,475,134,10,494,177,1,90,210 }, - { 13,23,0,51,77,33,2,141,4,10,1,64,115,102,3,6,22,15,217,11,180,7,40,515,165,202,177,9,269,128,700,5 }, - { 456,116,492,8,949,268,867,391,203,51,499,13,719,386,31,791,457,918,125,10,23,93,479,685,417,0,22,338,506,551,870,730 }, - { 17,237,45,180,106,62,32,64,115,41,136,498,255,21,197,129,241,13,3,227,23,352,165,752,350,365,449,155,4,546,476,38 }, - { 1,15,180,515,0,2,341,700,901,352,4,141,13,3,752,5,753,217,317,115,365,23,197,21,51,165,31,6,269,202,77,7 }, - { 205,141,216,269,497,4,588,76,59,152,128,452,79,77,875,11,72,94,188,217,352,12,247,37,90,64,32,1,474,23,947,372 }, - { 64,247,217,237,317,180,752,115,349,141,498,13,437,304,23,372,352,164,579,291,33,864,177,197,0,490,72,10,482,77,269,51 }, - { 2,1,0,13,15,141,3,77,5,515,64,33,23,180,6,700,4,117,217,7,10,11,102,165,753,197,115,134,40,352,12,269 }, - { 11,40,38,328,33,542,12,313,41,339,23,157,377,117,369,51,471,99,775,485,13,305,457,57,14,475,37,248,4,54,188,719 }, - { 33,77,102,40,13,23,0,51,4,128,64,202,117,141,22,196,153,10,134,15,59,269,1,137,65,11,403,318,453,86,515,177 }, - { 472,80,34,250,495,161,17,14,469,176,128,4,389,106,283,436,216,527,3,297,483,177,53,56,231,194,119,84,719,57,255,59 }, - { 317,352,180,141,217,752,115,341,365,244,1,269,202,901,253,15,21,498,372,4,137,515,13,2,700,318,5,197,23,143,753,349 }, - { 
9,39,101,18,265,100,333,520,252,16,0,329,593,1,553,364,68,167,310,30,121,254,118,158,363,166,60,604,272,24,286,404 }, - { 15,515,1,180,700,901,0,2,753,341,752,4,3,13,115,365,317,5,23,197,141,217,165,352,6,22,36,9,137,51,7,10 }, - { 131,39,9,829,166,613,578,827,1,30,716,254,100,98,31,224,0,406,228,310,616,219,44,846,127,190,938,96,265,371,856,438 }, - { 17,64,62,106,141,751,136,292,32,129,352,41,38,476,86,128,214,237,5,177,123,209,217,45,269,954,162,710,180,3,90,4 }, - { 25,42,235,65,650,736,605,6,630,85,123,343,233,256,26,122,63,389,141,249,416,444,368,194,19,108,138,174,90,0,544,511 }, - { 184,229,152,57,266,432,497,452,17,381,619,257,313,12,4,205,59,3,99,471,157,128,5,129,339,369,77,11,32,45,202,2 }, - { 137,202,160,860,141,30,93,567,36,276,295,261,131,39,9,964,201,843,1,98,800,318,116,22,943,187,10,219,206,44,269,535 }, - { 0,493,125,64,49,9,279,10,35,18,93,55,293,31,14,13,194,165,325,48,22,132,21,107,98,389,44,581,342,259,174,137 }, - { 15,515,700,753,4,33,13,77,23,5,51,32,102,40,93,11,349,141,21,8,82,202,64,31,110,10,117,0,1,44,3,318 }, - { 110,253,854,811,352,141,244,951,180,642,661,384,498,143,752,317,911,10,269,206,559,351,261,120,902,533,922,959,365,160,332,217 }, - { 2,29,70,1,75,52,6,220,26,112,145,331,74,163,19,69,38,324,46,58,14,5,25,21,278,223,50,307,66,7,67,409 }, - { 13,23,77,33,51,4,64,141,115,102,0,2,128,177,40,11,202,10,6,180,7,15,269,1,32,217,59,22,291,3,137,515 }, - { 340,897,691,478,658,264,914,382,100,812,363,1,724,156,166,698,88,521,39,404,682,447,296,96,303,411,30,909,9,274,656,772 }, - { 9,18,310,101,265,159,326,120,105,158,33,363,77,195,51,55,13,39,354,132,23,7,28,639,16,137,98,1,252,272,709,49 }, - { 57,313,471,12,99,369,157,339,266,152,38,37,475,453,328,775,11,40,59,188,77,514,401,403,342,4,139,33,377,51,229,14 }, - { 16,7,24,14,35,140,60,92,18,69,71,2,189,1,46,230,108,388,150,38,21,172,278,67,246,267,50,309,236,135,451,0 }, - { 
206,417,93,940,959,473,499,203,8,137,559,728,31,202,44,120,450,141,10,260,116,564,22,326,269,318,268,244,0,1,253,638 }, - { 15,515,700,753,1,0,13,2,23,4,3,51,5,217,7,77,341,115,8,9,10,33,6,180,317,349,291,120,11,165,457,901 }, - { 1,2,5,14,48,21,290,32,50,45,38,46,263,207,155,72,76,29,17,408,425,171,89,52,7,0,292,449,3,227,513,428 }, - { 121,132,354,167,271,223,146,98,18,463,1,668,446,195,407,60,212,447,781,48,360,363,411,522,156,393,807,9,21,16,293,13 }, - { 131,578,105,371,219,224,716,616,187,49,9,254,737,159,385,98,258,127,272,761,0,916,623,910,28,286,39,31,22,518,924,242 }, - { 302,467,97,6,273,1,24,484,124,51,36,18,2,398,453,421,523,69,7,23,13,403,386,150,66,0,298,65,426,165,22,158 }, - { 30,190,530,88,1,100,778,539,625,274,382,410,96,731,960,39,795,321,9,131,264,144,840,748,44,166,669,957,36,31,435,228 }, - { 141,1,2,128,64,33,15,202,3,0,180,5,13,77,515,134,269,102,197,700,10,137,318,6,120,165,753,352,4,82,23,117 }, - { 44,201,567,116,131,224,295,662,489,268,219,31,434,144,187,276,110,384,93,261,699,137,36,442,120,1,613,30,228,64,141,244 }, - { 12,15,51,23,515,37,99,13,700,0,10,117,753,38,165,82,134,120,11,453,197,64,115,569,1,629,401,22,457,474,110,153 }, - { 7,135,2,92,172,14,66,140,38,52,97,46,29,74,16,324,278,226,6,87,1,571,262,5,357,232,35,380,69,314,24,330 }, - { 125,386,23,963,949,60,51,391,165,221,13,197,118,21,719,193,541,421,517,150,393,7,401,453,308,5,791,551,326,558,48,173 }, - { 6,85,42,25,138,222,174,235,280,256,525,289,26,214,64,746,90,32,544,65,204,19,66,337,355,95,348,415,74,29,5,312 }, - { 1,14,5,50,2,67,24,0,46,69,48,21,58,103,16,12,18,38,54,96,83,7,502,45,36,181,35,9,430,28,10,155 }, - { 811,351,642,180,951,752,110,638,253,10,82,352,197,341,365,564,499,854,873,55,9,417,282,901,244,22,559,143,206,141,28,898 }, - { 23,13,51,15,12,453,403,165,4,515,115,719,475,457,700,523,2,21,0,99,202,197,14,5,386,753,128,401,37,308,33,117 }, - { 120,13,23,77,141,1,15,93,217,82,260,51,137,202,110,515,21,180,165,5,128,102,64,351,291,700,269,352,326,203,177,0 
}, - { 1,5,0,22,12,2,36,21,10,23,86,13,28,51,9,128,48,14,32,50,7,3,96,137,54,4,202,49,37,65,208,323 }, - { 219,98,23,127,301,51,258,308,170,910,13,165,22,105,293,616,125,242,276,401,201,395,964,115,55,284,31,374,327,206,512,900 }, - { 64,180,80,165,5,237,2,250,34,58,297,61,197,17,22,29,186,498,231,445,247,3,752,311,95,32,483,153,27,45,115,469 }, - { 13,77,23,33,0,2,1,64,141,51,102,10,15,3,115,40,180,6,515,128,7,22,269,202,4,217,700,5,177,117,14,165 }, - { 15,120,51,515,13,450,23,700,202,153,196,753,260,64,128,141,730,4,326,386,21,523,33,318,5,457,95,32,403,1,77,269 }, - { 2,1,5,29,32,45,207,263,14,425,58,72,76,21,7,408,48,46,52,186,17,292,38,6,61,89,476,50,155,720,119,3 }, - { 15,515,700,753,4,13,11,5,1,23,33,21,3,141,32,2,40,180,117,64,269,202,102,197,0,165,120,51,341,352,153,12 }, - { 76,5,214,129,2,123,45,710,17,249,618,460,179,32,1,257,205,519,90,207,245,184,162,61,769,209,292,106,6,29,14,128 }, - { 1,15,23,13,120,141,51,515,202,21,700,165,0,180,137,2,5,77,128,93,753,260,269,197,326,33,110,352,82,102,318,48 }, - { 7,2,135,14,29,87,66,52,97,172,70,112,5,58,46,337,92,16,20,43,1,38,232,155,74,294,6,461,409,151,262,32 }, - { 574,187,384,926,860,110,258,434,269,531,141,244,160,261,253,116,699,959,940,717,533,36,219,31,902,661,871,295,201,352,10,260 }, - { 156,354,296,1,182,586,64,379,340,937,850,698,31,48,98,44,120,18,163,23,30,658,195,125,77,284,223,291,774,481,96,39 }, - { 250,80,34,472,17,495,176,469,33,194,64,483,4,297,141,14,161,27,53,667,56,833,73,527,585,231,106,51,84,814,2,59 }, - { 97,7,81,140,66,92,172,192,24,298,43,6,74,69,314,426,462,14,501,16,21,508,60,189,267,232,230,104,48,20,135,330 }, - { 31,44,116,144,268,393,492,434,367,489,127,98,918,0,384,9,22,206,948,105,93,203,1,456,332,940,299,28,137,49,293,125 }, - { 15,128,33,3,13,51,141,1,202,64,23,2,515,120,102,0,5,82,10,700,165,197,269,153,403,110,753,137,196,318,117,12 }, - { 31,98,127,9,0,105,22,28,44,512,293,395,299,1,242,49,685,763,320,599,125,116,109,276,284,95,870,159,23,456,36,900 }, - { 
7,24,124,1,6,97,2,69,14,18,23,92,21,67,66,16,5,484,43,20,118,65,36,22,28,0,51,140,13,71,29,150 }, - { 1,64,442,303,284,349,202,141,622,67,154,447,260,44,652,429,9,335,237,919,197,98,167,33,682,269,547,77,863,411,340,201 }, - { 1,15,2,141,515,0,700,13,3,180,10,753,5,64,77,33,4,6,7,197,102,269,165,23,134,11,352,341,291,349,22,120 }, - { 99,139,12,453,196,277,775,40,475,33,23,401,215,51,11,14,77,111,313,130,38,211,37,266,129,15,339,153,719,3,369,515 }, - { 33,77,102,4,23,128,13,141,202,64,51,0,40,59,269,115,117,137,153,1,318,11,10,177,15,134,22,90,196,2,403,32 }, - { 7,2,14,58,70,112,16,5,87,38,46,52,6,128,135,1,32,21,155,29,66,64,0,97,92,186,172,294,13,23,20,37 }, - { 15,13,515,1,700,2,23,0,753,5,3,4,51,10,341,115,365,180,11,33,317,77,6,7,217,12,197,165,117,9,64,102 }, - { 2,1,14,29,75,69,67,6,52,46,38,24,103,220,83,25,70,87,262,74,96,267,50,366,26,16,226,394,357,66,108,19 }, - { 9,105,18,39,1,0,16,557,101,272,252,890,326,49,265,21,137,100,23,938,13,310,159,5,31,24,254,51,30,128,202,132 }, - { 80,209,45,61,667,17,6,106,5,2,151,29,483,255,454,833,27,311,112,19,738,378,1,58,113,26,25,469,119,887,32,64 }, - { 13,23,51,15,5,1,515,0,21,2,12,141,700,165,202,115,753,32,180,4,3,197,10,120,457,9,269,128,64,341,7,33 }, - { 99,12,453,277,139,157,369,474,339,51,38,23,37,196,188,401,775,111,11,313,328,475,153,266,4,471,79,40,33,629,102,14 }, - { 7,92,16,232,97,140,126,14,60,107,66,35,298,387,314,104,246,462,441,150,0,38,24,2,172,357,230,330,5,633,22,289 }, - { 13,77,23,202,318,141,33,4,51,269,102,177,115,403,137,2,40,494,90,11,342,128,31,117,21,32,7,12,64,134,14,10 }, - { 13,2,0,23,141,1,77,3,180,33,6,64,15,10,115,51,4,5,217,197,7,165,515,102,22,11,700,269,40,352,177,14 }, - { 15,515,700,753,4,11,1,93,13,5,180,110,82,21,120,23,2,33,10,141,3,165,197,102,901,0,32,341,117,40,153,12 }, - { 15,515,700,753,1,13,0,2,23,4,77,51,3,5,341,291,7,33,6,115,10,9,8,217,11,177,120,180,102,165,197,365 }, - { 
20,43,198,325,173,904,104,234,66,147,77,319,416,422,97,426,5,0,7,450,861,202,712,725,2,32,639,376,38,324,945,315 }, - { 105,0,9,28,49,301,170,1,127,159,22,16,31,98,512,623,24,109,158,395,35,68,371,65,713,55,2,242,293,21,44,18 }, - { 213,88,689,466,230,30,321,435,699,352,217,201,795,831,144,854,1,443,96,539,530,840,418,251,855,190,93,100,669,31,957,662 }, - { 130,453,47,196,4,57,14,59,236,711,51,153,730,77,412,381,23,202,108,128,361,13,283,117,11,719,200,46,34,78,210,2 }, - { 1,2,5,14,0,50,36,22,38,46,65,67,12,86,114,28,103,29,208,7,10,128,21,83,218,23,96,54,194,6,133,51 }, - { 6,26,74,19,165,453,14,730,1,125,197,50,29,51,138,357,13,2,108,391,70,719,46,457,47,500,386,262,112,23,235,52 }, - { 9,10,376,20,43,0,49,18,30,120,2,33,325,104,501,470,77,788,725,102,523,39,858,5,904,414,174,55,137,37,342,13 }, - { 15,515,700,753,0,1,13,23,51,77,120,202,341,82,5,4,9,260,2,137,141,128,115,351,901,8,180,10,197,21,450,33 }, - { 105,131,272,578,9,49,371,219,159,616,286,320,224,187,716,98,28,22,0,623,127,258,910,737,385,31,239,347,254,109,424,95 }, - { 457,51,13,23,961,12,719,99,453,15,4,515,165,401,629,3,700,11,17,14,2,37,753,41,57,569,38,45,0,33,5,32 }, - { 202,120,5,33,318,77,450,102,1,260,403,128,494,21,165,13,269,12,326,23,342,523,402,2,817,64,15,141,125,82,457,475 }, - { 141,269,352,217,180,64,349,137,202,160,317,15,372,515,700,752,318,753,244,13,437,291,165,864,22,237,5,82,954,21,77,418 }, - { 70,29,2,145,74,112,26,6,75,52,19,66,632,1,87,220,5,135,163,287,307,25,226,7,58,396,294,278,113,409,69,151 }, - { 82,351,317,15,752,180,898,352,141,901,515,341,10,700,365,1,753,498,0,217,253,115,55,854,33,5,143,32,21,160,36,197 }, - { 39,9,310,254,0,30,101,49,252,272,100,265,105,455,159,557,190,333,286,688,18,166,1,158,709,16,625,627,31,131,327,329 }, - { 2,58,29,5,1,151,186,52,70,45,7,549,14,75,112,400,113,155,61,46,227,163,311,315,66,6,307,27,17,220,287,74 }, - { 141,217,13,21,352,23,269,77,180,115,317,64,202,15,349,137,5,51,165,291,318,752,372,4,0,102,33,365,197,32,341,125 }, - { 
68,35,0,9,65,101,149,124,24,154,175,16,28,7,67,1,18,189,114,398,55,14,345,39,118,133,69,2,230,429,71,283 }, - { 66,7,29,2,112,52,20,43,97,151,74,192,135,5,173,525,337,45,145,58,415,25,14,32,644,70,544,226,222,21,6,580 }, - { 31,125,44,22,116,299,242,55,1,170,64,36,479,870,456,685,10,599,558,0,268,506,28,740,23,903,492,164,393,206,2,86 }, - { 188,11,79,12,99,377,94,33,542,339,40,474,111,37,4,51,102,453,139,775,13,475,23,961,277,471,134,57,431,266,115,117 }, - { 658,698,340,98,296,303,1,31,850,363,156,919,44,774,586,385,120,77,82,10,223,30,354,291,23,914,478,87,260,163,48,13 }, - { 15,515,700,753,82,4,1,13,901,33,197,11,5,10,23,165,2,0,180,3,21,77,51,120,365,115,217,40,117,102,32,401 }, - { 15,515,700,753,4,11,5,13,1,141,3,180,23,202,21,2,269,64,165,33,40,32,0,318,120,128,12,197,117,352,51,17 }, - { 91,6,233,85,370,718,81,65,25,256,63,343,42,74,235,123,138,511,397,249,26,194,650,355,64,87,544,18,90,643,66,214 }, - { 23,13,202,51,21,120,1,5,141,128,450,64,318,403,15,137,260,33,12,48,32,31,125,494,269,102,165,515,77,2,197,14 }, - { 180,317,365,341,752,217,115,352,901,482,372,498,1,141,15,253,515,244,2,700,0,21,13,82,23,4,579,351,753,291,269,77 }, - { 13,115,197,341,9,352,468,237,64,498,23,165,22,509,901,546,482,180,28,569,317,51,365,873,391,95,86,217,49,837,752,706 }, - { 13,23,51,1,141,5,165,202,21,120,64,125,180,15,2,33,197,115,128,32,260,269,12,82,4,515,137,7,318,93,0,700 }, - { 214,289,90,174,874,6,138,280,65,81,64,85,355,751,194,233,312,348,835,91,0,32,343,636,249,29,875,288,519,104,247,74 }, - { 15,515,700,753,4,5,11,13,1,33,23,21,2,3,102,32,141,77,180,117,31,64,0,40,134,196,120,352,12,44,197,6 }, - { 33,15,13,515,117,23,700,217,134,753,0,51,153,77,141,2,4,64,196,1,3,180,10,115,5,102,6,11,22,202,165,7 }, - { 15,515,700,753,33,4,77,102,1,40,13,117,11,115,134,5,21,153,23,217,3,32,2,317,120,196,180,141,51,12,59,260 }, - { 15,515,700,753,13,0,1,23,2,217,51,3,4,5,8,317,115,9,341,10,202,180,6,365,7,82,457,22,120,901,33,291 }, - { 
7,2,135,20,97,14,66,52,337,673,192,29,43,355,353,5,16,294,107,376,147,226,331,560,64,470,222,104,415,32,4,324 }, - { 195,132,142,167,146,77,363,271,121,354,202,120,647,178,786,212,687,0,101,878,16,522,60,5,450,411,35,55,98,639,259,318 }, - { 202,77,20,0,318,66,104,128,102,269,177,43,33,7,216,291,494,5,2,342,74,173,97,112,450,22,337,10,234,52,64,678 }, - { 107,362,612,356,359,97,414,43,259,20,392,7,298,147,819,683,465,173,729,660,319,14,5,779,581,595,246,35,501,92,0,230 }, - { 6,165,14,453,13,51,19,23,386,457,74,391,308,2,26,401,47,758,603,108,719,366,1,29,309,730,324,197,133,70,115,867 }, - { 179,72,205,180,247,245,4,490,352,59,317,152,79,498,94,217,148,76,752,864,11,216,141,405,89,452,197,111,497,188,37,21 }, - { 107,7,298,314,14,359,32,392,232,279,172,97,60,581,387,126,121,0,534,493,356,92,441,95,13,21,35,147,22,5,16,362 }, - { 156,271,354,586,360,132,591,195,121,18,340,1,5,13,21,48,668,446,23,463,296,658,60,55,407,698,146,70,626,51,163,24 }, - { 13,23,51,4,0,12,457,15,11,453,2,515,5,1,99,10,115,165,700,475,401,403,3,961,40,14,37,753,719,32,64,569 }, - { 48,125,21,165,13,221,23,763,423,508,197,5,98,92,193,16,441,386,64,314,293,457,391,140,49,60,102,693,683,51,35,867 }, - { 202,77,120,450,5,318,1,494,0,195,18,132,523,403,326,604,354,260,121,576,203,167,234,817,682,49,35,615,21,20,13,102 }, - { 39,9,166,30,0,101,158,68,404,190,333,274,252,310,88,100,49,28,344,35,21,22,419,131,438,1,16,65,530,694,124,10 }, - { 15,515,700,753,110,4,1,11,165,180,93,13,82,5,2,197,33,120,0,3,10,23,21,115,901,217,341,77,317,51,32,117 }, - { 2,29,1,14,6,52,5,46,50,26,70,19,103,58,38,67,96,262,516,309,218,133,108,27,75,17,112,114,24,487,331,83 }, - { 120,77,15,13,1,141,260,23,515,217,110,51,137,700,317,202,165,291,180,21,753,128,0,177,326,93,450,82,64,269,197,5 }, - { 255,59,554,297,183,56,33,444,108,358,123,196,269,122,77,153,57,177,117,730,19,467,605,130,128,50,275,4,291,475,134,133 }, - { 
13,23,51,12,153,14,117,120,165,134,99,401,38,453,15,128,197,719,64,515,475,403,37,33,196,700,40,125,5,0,54,2 }, - { 64,33,174,348,95,108,467,554,56,0,25,306,233,6,63,511,343,120,13,85,29,561,543,707,319,180,899,355,77,49,256,18 }, - { 120,260,51,23,77,15,202,1,93,82,141,450,13,326,515,137,21,5,64,33,110,700,128,165,318,203,269,102,351,753,197,125 }, - { 15,515,700,753,4,13,11,1,5,21,23,2,33,64,3,180,32,141,22,102,77,0,10,93,82,352,117,40,341,31,165,6 }, - { 15,515,700,753,341,13,23,141,33,1,0,217,4,77,180,10,82,351,51,137,5,64,9,317,21,11,102,40,260,202,854,115 }, - { 105,272,131,22,327,286,28,239,320,9,109,578,219,49,98,224,95,159,538,371,616,127,187,64,713,55,0,170,168,258,716,623 }, - { 16,18,68,35,24,60,71,118,92,126,0,9,101,191,7,55,154,175,212,14,167,150,302,28,375,1,107,124,346,273,21,108 }, - { 20,147,43,470,376,142,904,178,427,798,0,595,198,325,858,319,61,202,173,97,5,422,14,22,107,259,32,49,887,77,414,392 }, - { 13,23,51,12,33,15,99,64,128,515,453,202,117,153,37,102,700,40,134,196,120,0,2,753,141,14,38,3,82,403,77,21 }, - { 383,17,62,136,84,119,56,440,3,504,240,80,378,129,123,548,106,128,4,11,14,555,162,32,184,361,59,64,205,5,469,57 }, - { 70,1,48,652,5,638,846,888,21,349,269,260,340,562,767,761,163,883,774,141,125,518,591,0,23,9,87,13,371,303,622,31 }, - { 66,135,6,97,74,278,69,7,14,324,267,172,2,140,462,1,357,38,808,550,92,841,189,29,16,25,298,87,75,204,24,335 }, - { 51,23,33,13,102,40,12,128,64,77,10,202,0,196,117,4,14,99,134,453,65,153,11,475,139,403,22,141,86,2,21,15 }, - { 88,100,264,166,274,435,772,1,382,921,96,478,30,438,639,909,897,521,190,466,960,410,9,144,530,418,31,329,265,691,778,93 }, - { 62,440,136,56,84,3,504,548,555,383,4,17,129,128,507,361,123,59,119,162,14,57,152,328,161,11,202,495,184,27,80,215 }, - { 911,617,332,959,206,141,253,244,282,384,110,120,10,260,352,143,951,811,269,373,160,417,93,531,728,203,434,940,137,55,36,717 }, - { 
120,15,260,141,77,1,515,82,700,351,33,23,450,13,110,326,64,217,269,753,203,137,102,5,165,21,51,291,93,177,373,128 }, - { 15,515,700,753,0,1,2,23,13,51,5,9,82,901,180,8,3,4,120,6,7,141,93,12,197,341,10,33,115,730,64,125 }, - { 7,104,97,107,356,232,66,560,298,289,14,707,38,568,359,64,20,0,65,324,22,214,92,32,192,5,387,43,712,90,172,95 }, - { 6,1,2,66,67,14,74,24,108,29,69,83,458,7,25,38,135,103,36,150,451,114,52,594,75,65,380,18,267,602,19,278 }, - { 13,23,51,12,115,21,202,5,457,15,4,1,64,719,0,403,2,3,453,165,99,141,401,128,32,515,10,37,523,197,120,700 }, - { 57,59,4,11,412,381,77,53,421,291,250,368,99,14,27,369,803,283,23,108,403,19,339,210,0,401,12,444,236,40,361,736 }, - { 15,515,700,1,0,753,2,13,23,5,51,180,3,115,6,7,457,4,9,8,12,82,197,165,141,901,120,719,33,64,21,22 }, - { 64,95,180,247,929,146,90,126,197,32,237,60,288,165,316,92,5,13,77,7,217,955,522,22,16,314,132,4,317,10,312,86 }, - { 15,1,120,13,23,515,0,51,700,180,141,2,5,202,21,260,753,165,137,33,77,110,197,128,326,7,450,4,102,9,269,12 }, - { 14,2,16,46,1,7,24,69,75,35,38,50,29,220,52,140,267,67,18,54,70,309,5,60,92,189,171,87,71,163,58,0 }, - { 31,98,127,44,9,299,0,276,293,284,116,49,935,599,105,22,456,201,28,1,39,125,242,137,371,144,131,492,159,272,51,395 }, - { 6,27,151,53,573,445,297,113,26,73,436,19,491,250,396,315,45,112,145,58,614,881,25,34,611,200,17,80,70,5,138,631 }, - { 32,693,81,788,90,804,403,56,494,21,84,397,202,65,18,77,64,681,214,725,523,784,526,33,102,825,240,0,115,241,817,91 }, - { 24,7,14,2,18,16,65,0,108,149,28,69,1,71,154,36,124,35,67,140,189,429,92,68,66,22,55,118,302,150,9,6 }, - { 0,68,9,35,65,101,189,212,114,67,124,69,1,154,149,39,230,64,252,16,88,702,103,100,18,336,28,329,520,83,30,755 }, - { 5,2,186,29,61,45,17,1,52,48,58,171,155,227,80,209,311,21,14,46,50,106,243,513,334,502,496,38,3,6,32,592 }, - { 15,515,700,753,13,1,2,0,3,4,5,23,341,11,10,33,6,51,165,117,153,7,180,12,365,901,77,569,197,115,64,9 }, - { 
13,15,23,515,0,51,1,700,4,2,753,10,3,5,12,77,33,961,165,457,197,11,115,9,22,102,40,403,202,21,14,59 }, - { 15,515,700,753,13,0,1,23,2,33,102,5,4,10,9,3,51,115,77,7,6,341,12,11,217,40,457,196,180,165,8,523 }, - { 166,39,30,274,190,100,333,438,530,310,88,252,0,9,539,265,1,656,404,101,625,131,778,254,31,455,676,329,724,158,21,23 }, - { 734,148,94,308,431,115,37,89,111,413,79,468,197,629,341,474,569,12,13,873,179,401,11,4,180,23,205,72,59,365,134,51 }, - { 539,228,224,219,816,190,30,258,871,840,669,93,406,530,957,187,160,531,748,137,131,88,863,36,728,839,44,213,352,116,202,466 }, - { 393,791,125,801,730,551,386,23,31,175,93,98,51,13,144,788,126,203,21,345,116,22,949,110,575,165,326,44,0,4,60,221 }, - { 13,23,77,141,0,4,51,2,33,115,64,1,10,3,6,15,11,102,7,217,180,40,515,22,128,177,202,9,700,269,165,5 }, - { 2,29,7,70,52,14,1,58,112,46,75,5,171,163,87,220,307,151,186,334,38,66,155,16,69,135,278,45,262,97,6,21 }, - { 88,321,213,100,230,435,689,466,1,382,30,352,217,699,410,96,795,36,921,752,190,141,144,180,44,831,317,83,443,31,840,251 }, - { 363,411,101,520,354,9,195,668,132,156,447,1,905,364,18,23,765,664,146,5,360,13,121,96,98,31,252,39,100,759,264,551 }, - { 13,23,51,730,12,719,453,457,401,475,5,21,403,2,0,1,15,4,3,899,99,32,165,11,515,308,197,115,6,961,700,523 }, - { 72,76,89,12,37,4,308,179,38,528,90,431,54,205,148,184,401,57,152,474,23,59,51,245,428,11,32,99,405,316,257,21 }, - { 376,20,43,147,470,173,97,595,107,319,414,142,819,5,729,178,858,7,427,32,426,104,14,0,392,362,259,61,230,77,560,246 }, - { 202,141,269,494,318,137,51,128,403,4,217,96,77,5,64,177,291,180,15,352,102,10,33,349,2,317,0,341,120,515,21,453 }, - { 77,202,33,128,102,318,494,269,13,0,117,23,342,291,403,15,134,51,153,141,177,515,82,137,196,700,203,64,22,351,753,4 }, - { 253,110,951,352,499,811,10,854,180,638,244,559,642,752,564,8,141,143,417,341,901,260,206,197,922,661,93,15,498,373,165,911 }, - { 141,13,23,180,4,217,5,1,269,317,21,0,2,202,115,51,352,77,3,197,64,341,318,15,291,9,137,93,32,165,515,33 
}, - { 9,0,18,252,16,101,68,39,24,118,35,109,158,329,28,167,60,364,333,265,49,100,22,419,553,55,1,677,71,7,212,159 }, - { 28,109,9,39,0,158,49,22,168,35,55,175,1,65,67,185,194,159,289,95,272,114,30,105,86,584,36,169,254,2,83,24 }, - { 15,515,13,700,1,753,2,23,0,3,4,5,33,341,11,51,6,10,197,115,901,180,77,40,102,12,365,165,141,217,7,317 }, - { 173,693,104,422,5,18,61,32,102,0,20,13,784,560,33,66,397,526,49,207,29,25,510,707,65,6,11,344,21,263,81,77 }, - { 23,13,386,51,308,801,719,221,401,949,21,730,165,421,102,115,125,33,341,670,468,117,770,1,120,6,197,14,403,97,67,958 }, - { 0,49,105,16,28,24,159,9,158,320,1,68,35,239,170,18,109,7,55,65,2,95,301,124,347,14,21,154,22,127,286,31 }, - { 2,5,1,207,45,29,32,58,76,61,6,263,292,655,72,14,17,476,7,119,52,306,70,64,21,90,186,214,106,38,3,790 }, - { 21,6,125,49,13,64,715,66,115,95,197,33,22,32,204,165,56,278,0,408,241,120,4,808,681,350,263,85,81,571,135,509 }, - { 612,427,325,107,202,5,376,49,64,392,403,470,21,147,31,788,494,14,362,465,858,98,20,804,518,43,845,318,125,97,725,534 }, - { 32,21,76,72,2,1,14,5,241,449,89,38,350,221,155,48,50,292,37,46,45,90,270,54,17,179,214,12,148,430,476,413 }, - { 24,0,28,16,7,124,35,154,14,149,65,18,9,68,55,108,175,71,2,1,22,109,92,67,484,336,118,69,302,398,570,420 }, - { 1,5,14,2,48,50,38,67,46,21,0,54,45,270,281,12,24,32,155,96,513,103,290,83,61,58,36,17,37,72,69,181 }, - { 13,961,569,197,37,15,23,474,515,94,148,111,12,165,629,341,700,79,901,401,51,405,753,10,134,4,115,734,873,11,89,117 }, - { 33,23,102,51,13,40,77,128,64,202,141,15,4,12,0,1,2,117,22,11,10,403,153,515,99,318,137,269,139,196,700,134 }, - { 0,1,24,67,9,16,18,35,28,69,103,50,5,2,65,12,83,68,7,96,14,22,21,149,75,114,13,133,23,71,218,54 }, - { 384,617,940,332,855,911,206,959,434,282,141,10,93,253,244,110,144,268,120,36,352,137,417,203,116,31,44,269,160,201,143,951 }, - { 30,93,473,137,31,704,450,652,190,203,800,254,166,274,326,144,269,160,127,303,120,625,88,848,110,435,77,521,349,131,340,744 }, - { 
53,27,73,26,19,250,297,200,25,630,17,6,611,122,34,42,714,235,472,65,436,14,80,684,690,106,45,113,680,108,64,4 }, - { 15,515,1,2,700,0,753,3,5,141,180,4,13,77,33,10,217,6,7,134,11,352,197,64,165,341,317,23,12,115,102,40 }, - { 254,530,39,613,688,221,30,31,438,190,228,960,1,44,141,21,180,406,23,166,9,202,13,96,137,48,131,829,317,269,393,51 }, - { 9,39,28,35,30,166,158,36,0,175,101,346,364,67,49,68,168,420,88,1,194,131,100,352,55,83,190,64,137,570,86,65 }, - { 62,56,3,548,555,507,440,161,34,4,215,136,162,514,361,527,17,14,211,130,328,11,383,123,84,183,38,57,184,152,205,494 }, - { 92,126,107,7,356,493,97,279,359,298,16,246,35,60,14,441,362,121,43,423,5,132,392,20,508,230,199,146,232,173,150,414 }, - { 15,82,141,515,291,922,349,700,217,260,372,120,351,93,77,753,318,352,373,854,1,326,269,21,13,102,144,202,64,23,203,137 }, - { 141,217,352,115,180,13,269,317,752,77,23,21,341,197,5,372,244,291,9,64,51,102,4,1,365,2,165,33,3,48,237,351 }, - { 78,47,390,19,130,453,108,27,711,813,730,444,412,283,196,690,123,14,128,26,250,389,650,236,200,65,51,4,34,183,297,73 }, - { 34,250,297,80,472,64,495,17,311,3,148,45,667,61,176,53,243,27,90,161,469,141,483,151,62,128,29,4,58,56,5,231 }, - { 51,23,33,13,551,77,102,326,421,21,523,120,5,899,453,692,202,153,308,615,115,958,450,401,791,68,221,93,475,18,403,4 }, - { 98,223,393,363,411,1,478,834,664,156,284,691,447,791,914,293,354,724,697,9,807,541,759,51,18,421,48,264,948,586,195,848 }, - { 7,14,107,232,16,92,2,60,46,5,359,121,24,526,220,620,135,1,172,21,126,314,132,77,18,75,32,278,12,23,52,38 }, - { 32,76,2,1,21,72,241,14,5,48,292,89,476,45,720,270,179,90,17,214,148,38,50,29,129,155,350,46,290,227,123,464 }, - { 15,515,700,753,13,23,33,77,51,4,102,0,32,202,1,11,128,82,117,141,40,5,110,8,3,90,137,21,10,318,403,165 }, - { 66,6,69,2,1,74,14,135,278,267,380,24,29,97,67,38,103,75,7,388,324,25,52,150,87,83,189,357,335,108,204,172 }, - { 152,4,339,59,79,471,188,11,77,94,128,33,529,377,12,111,102,202,452,402,216,99,13,542,51,40,474,37,64,291,23,961 }, 
- { 15,515,700,753,1,0,196,13,33,2,77,5,23,102,3,10,9,7,217,4,6,153,117,177,14,457,115,12,40,730,11,134 }, - { 17,209,45,106,207,5,255,119,62,2,61,3,263,742,306,655,425,378,32,56,29,136,84,80,311,58,186,240,243,383,14,21 }, - { 120,260,450,15,1,23,817,13,515,523,326,5,700,51,82,31,202,64,21,753,318,93,32,269,98,33,351,77,102,125,457,165 }, - { 116,492,268,93,23,206,203,0,551,918,13,51,8,22,417,940,120,10,499,31,949,791,125,523,165,473,341,730,421,959,401,391 }, - { 15,515,700,753,165,13,0,1,197,23,4,82,120,2,180,12,260,719,8,3,386,117,5,523,901,11,341,51,10,9,141,351 }, - { 14,24,69,7,2,66,108,1,67,6,36,398,18,267,150,97,29,38,83,149,65,74,28,0,189,71,388,16,273,124,46,22 }, - { 330,96,523,335,367,662,141,839,1,922,372,615,244,717,269,443,418,352,403,692,217,854,752,180,36,64,498,576,349,201,98,284 }, - { 184,90,257,205,245,229,57,152,769,17,524,5,32,497,45,432,619,2,452,266,4,106,1,21,179,59,76,3,460,292,381,128 }, - { 7,14,16,2,46,5,70,107,87,13,58,307,92,32,38,23,202,0,172,24,18,21,60,128,77,35,20,10,9,4,171,112 }, - { 7,66,140,16,14,92,97,69,267,172,189,24,380,2,35,60,298,451,230,135,314,74,150,71,38,357,6,330,67,423,21,443 }, - { 121,167,354,132,18,446,147,101,212,146,407,16,55,35,647,191,20,271,199,68,60,259,463,107,9,126,363,7,195,43,14,411 }, - { 76,90,179,32,205,21,184,460,257,288,45,245,316,5,57,152,241,2,358,1,229,72,524,148,48,769,17,4,12,38,14,720 }, - { 147,259,178,878,427,465,581,198,786,798,142,534,325,929,20,362,35,132,107,376,43,5,279,77,49,146,70,202,590,771,33,14 }, - { 473,93,450,778,141,30,855,466,144,203,330,530,88,523,459,372,201,617,839,704,254,321,934,326,39,36,82,717,332,213,559,403 }, - { 523,475,51,899,730,453,23,719,403,33,457,13,421,386,4,120,117,196,102,153,15,801,450,817,515,260,202,11,700,99,165,125 }, - { 15,1,13,515,0,2,700,5,23,753,4,3,341,317,10,115,180,11,33,64,217,77,117,165,197,7,6,365,9,141,102,134 }, - { 19,4,119,40,33,202,27,84,102,56,77,73,504,485,26,494,757,63,862,59,23,300,25,12,128,11,5,13,342,880,469,6 }, - { 
32,20,2,13,5,21,23,6,12,38,43,29,64,7,95,51,61,207,48,147,90,178,17,182,49,0,115,202,52,362,37,22 }, - { 339,188,11,79,4,94,377,12,99,111,542,102,37,33,474,51,471,40,453,152,77,13,59,403,342,23,117,57,475,134,128,38 }, - { 34,128,283,176,495,231,318,432,503,275,529,527,161,53,3,202,56,291,585,469,73,17,14,412,57,27,80,245,250,381,402,51 }, - { 15,515,13,700,1,217,141,120,23,180,753,115,365,51,317,341,77,260,0,291,110,137,202,5,21,269,64,36,349,2,4,10 }, - { 13,15,961,515,700,753,4,12,2,457,3,11,197,51,37,569,115,23,5,0,99,10,1,134,6,111,165,33,72,40,38,79 }, - { 15,515,700,753,13,1,0,2,23,33,5,3,10,4,9,115,7,102,6,51,12,217,77,11,40,457,569,341,117,317,14,719 }, - { 5,76,2,32,292,214,45,1,129,519,123,179,90,710,17,29,460,72,14,207,21,249,58,205,464,263,618,48,6,245,3,257 }, - { 72,76,32,4,21,12,38,23,99,54,89,3,14,17,51,57,11,90,13,488,179,2,59,148,45,37,5,115,401,1,10,421 }, - { 98,223,393,1,834,264,284,791,724,293,478,772,697,909,363,682,905,447,541,821,411,51,421,9,807,48,765,31,730,96,386,410 }, - { 341,13,509,8,23,638,165,901,762,10,569,242,391,197,873,642,506,499,629,961,15,180,116,456,206,546,417,1,338,457,515,867 }, - { 1,2,5,50,14,38,46,114,0,36,29,22,218,65,86,96,137,21,133,285,12,10,323,181,17,58,51,23,67,7,28,6 }, - { 481,878,202,13,5,23,182,32,269,21,1,318,77,142,557,494,141,33,640,137,70,291,2,51,260,415,929,403,120,58,4,259 }, - { 15,515,700,753,1,4,13,0,2,5,341,3,11,180,134,12,10,317,197,365,33,21,23,165,117,6,77,7,217,37,32,498 }, - { 25,119,19,6,26,42,27,17,4,790,45,814,2,469,483,84,122,1,0,33,32,128,76,80,611,113,73,56,5,240,202,77 }, - { 14,2,7,1,24,0,65,6,16,69,67,22,124,28,108,5,18,36,86,10,38,46,66,398,289,168,12,83,21,23,610,13 }, - { 51,23,128,13,15,202,12,120,33,64,141,82,10,515,0,403,700,3,1,99,117,269,153,165,753,5,318,197,102,260,2,137 }, - { 16,35,24,0,9,18,7,1,68,69,50,71,103,65,67,189,133,23,28,13,60,537,149,335,75,21,64,5,114,2,12,14 }, - { 
754,803,133,576,880,543,2,1,657,50,14,38,46,5,29,67,218,36,58,171,52,96,24,103,775,0,114,83,181,54,65,45 }, - { 21,32,5,3,2,17,14,72,76,1,12,23,38,51,4,54,10,0,89,13,99,137,45,36,421,115,543,11,22,128,221,48 }, - { 434,384,268,144,855,940,617,206,332,116,93,911,959,282,203,137,141,489,44,120,10,110,244,36,98,31,269,253,367,417,160,9 }, - { 15,2,1,0,13,515,5,700,3,23,180,217,141,10,753,4,117,6,77,33,64,7,11,197,352,317,341,134,165,115,12,9 }, - { 2,113,6,25,1,0,29,4,7,833,5,45,32,61,128,19,77,151,74,145,64,42,14,210,655,106,59,177,27,17,21,738 }, - { 116,268,918,203,551,31,8,692,206,791,403,499,417,93,940,421,0,23,22,120,13,523,44,51,299,473,959,1,10,475,202,125 }, - { 107,126,132,612,362,279,20,146,259,493,199,121,590,43,660,147,35,376,939,60,941,534,683,5,0,953,16,7,49,649,595,470 }, - { 15,515,700,753,13,1,0,23,2,33,77,4,3,51,5,102,115,10,9,341,6,7,11,342,217,12,120,180,40,317,141,8 }, - { 53,27,17,161,469,378,73,527,19,136,383,250,495,56,862,26,62,84,80,106,200,4,34,14,440,297,3,128,585,5,129,123 }, - { 17,45,209,106,5,207,243,454,119,255,2,263,186,290,29,3,21,62,425,61,84,32,58,56,48,408,655,136,306,14,742,227 }, - { 4,152,59,452,128,79,216,11,339,471,529,188,94,77,202,12,291,33,318,377,99,51,23,5,402,349,32,474,102,13,205,111 }, - { 15,515,700,753,1,0,2,13,3,5,23,4,180,51,115,9,6,12,7,8,197,33,10,961,901,77,141,752,110,22,120,341 }, - { 951,752,638,811,351,642,180,253,10,341,197,901,110,873,8,244,15,352,165,898,143,515,564,762,499,55,365,700,82,753,141,854 }, - { 6,262,197,350,74,26,115,509,841,583,165,38,21,13,47,50,235,19,33,324,453,4,308,196,138,99,64,903,675,1,223,130 }, - { 125,165,391,23,386,221,21,13,558,457,51,867,197,115,401,758,77,97,308,791,7,180,48,120,963,451,743,89,603,134,403,450 }, - { 1,14,2,5,16,46,7,38,58,24,50,0,69,48,35,67,54,18,12,75,21,45,513,155,430,37,270,9,61,163,223,32 }, - { 23,13,51,0,12,15,4,1,115,2,515,453,10,457,5,3,202,21,165,700,403,11,37,64,77,401,9,197,753,59,475,99 }, - { 
129,84,17,56,27,495,19,548,80,123,162,378,3,504,161,469,618,73,40,53,4,26,205,184,106,183,62,6,257,128,862,12 }, - { 28,9,22,49,109,1,67,0,39,55,168,158,83,36,35,86,420,194,185,159,95,105,69,208,272,103,50,114,2,254,169,30 }, - { 242,391,8,456,116,13,23,492,341,165,867,51,499,457,479,638,338,509,719,10,1,642,417,762,401,93,206,268,901,569,22,197 }, - { 211,162,248,130,57,4,41,556,507,266,183,152,305,361,11,129,62,229,38,471,514,313,157,300,377,3,440,128,123,328,339,59 }, - { 7,92,97,16,298,140,60,126,14,35,279,314,232,246,43,230,508,173,71,107,423,24,150,779,20,189,66,18,607,21,0,653 }, - { 15,515,700,753,1,0,2,13,23,5,3,180,51,901,6,4,7,12,9,115,8,457,165,82,120,197,10,64,141,341,22,117 }, - { 0,18,403,25,523,74,6,24,42,91,22,102,13,51,49,193,475,681,95,85,730,64,899,397,273,750,247,673,32,805,757,288 }, - { 56,0,18,65,33,554,84,343,64,6,90,561,22,19,899,108,27,63,289,475,240,467,370,32,233,214,24,123,95,287,28,194 }, - { 31,98,127,9,0,44,293,105,395,299,49,242,28,22,599,116,1,284,276,125,456,685,763,159,272,623,23,935,393,144,201,137 }, - { 1,5,2,14,38,46,50,48,21,7,58,45,270,61,155,171,0,290,69,32,29,54,67,16,24,666,663,17,37,75,502,52 }, - { 23,51,13,453,457,12,719,4,15,99,401,2,961,3,11,730,475,515,0,1,165,115,629,700,14,17,403,40,5,33,37,64 }, - { 968,967,966,965,964,963,962,961,960,959,958,957,956,955,954,953,952,951,950,949,948,947,946,945,944,943,942,941,940,939,938,937 }, - { 2,1,14,29,67,103,6,46,52,75,24,133,38,218,83,309,36,108,70,114,96,5,238,74,25,26,220,236,65,50,69,87 }, - { 7,71,16,92,24,60,14,97,150,140,35,189,149,298,18,230,43,508,2,423,69,0,38,314,66,279,399,517,251,20,232,273 }, - { 23,1,120,51,13,202,77,141,260,21,15,5,128,82,2,450,269,165,102,318,48,32,137,515,125,64,12,115,351,180,33,7 }, - { 77,13,33,23,64,51,4,102,141,128,40,1,2,202,0,6,177,115,137,15,59,10,11,7,269,22,515,180,318,3,700,95 }, - { 101,9,18,363,264,520,411,604,676,682,905,271,16,821,167,0,621,364,39,100,121,118,166,781,647,252,1,848,447,265,404,60 }, - { 
144,203,326,382,166,418,93,88,96,822,1,141,859,77,744,438,110,269,921,367,521,274,100,39,494,120,403,473,217,576,13,291 }, - { 13,21,180,125,5,23,191,32,18,16,146,199,115,24,165,118,0,225,22,1,60,197,64,901,375,241,48,12,408,71,522,818 }, - { 15,515,700,753,13,0,23,8,1,51,82,102,2,33,4,9,180,165,5,77,10,110,12,197,120,260,18,326,351,403,22,457 }, - { 33,77,102,64,13,23,128,51,141,202,1,40,0,2,117,10,15,4,6,318,269,134,22,515,180,115,177,153,137,196,3,700 }, - { 174,544,104,525,74,0,151,25,6,624,29,66,2,636,81,45,204,177,64,416,7,644,5,138,222,319,355,77,22,122,789,216 }, - { 141,304,372,352,291,947,177,269,128,954,77,349,217,202,64,318,498,437,102,864,86,13,115,180,137,5,210,197,32,950,678,7 }, - { 161,200,53,17,714,27,34,73,472,62,585,56,440,383,136,78,527,19,4,3,106,361,14,250,80,514,377,84,322,390,862,548 }, - { 32,76,72,21,38,14,89,54,12,37,2,241,5,428,17,1,181,221,350,45,3,4,449,90,148,179,99,292,794,770,477,46 }, - { 33,23,128,64,141,13,77,51,102,202,2,15,1,3,40,10,5,153,269,515,165,0,117,196,180,318,6,700,137,134,120,22 }, - { 96,137,30,0,9,39,840,202,669,406,141,530,613,1,180,88,22,160,679,576,28,403,31,219,49,228,829,100,36,15,10,856 }, - { 180,141,352,1,15,752,115,0,217,365,2,515,13,901,341,317,23,4,197,700,269,5,3,31,753,244,21,165,253,202,51,44 }, - { 1,2,67,0,28,50,83,65,14,46,103,114,24,38,36,9,69,5,18,7,22,133,55,218,16,124,29,54,96,160,12,480 }, - { 180,115,352,317,365,217,752,901,141,15,341,1,515,253,700,0,753,873,2,197,31,137,165,244,4,120,160,44,98,5,202,3 }, - { 5,32,347,49,13,21,95,713,23,1,77,33,60,64,107,4,126,928,296,850,0,241,197,102,652,195,180,534,165,153,379,10 }, - { 341,180,365,901,317,115,15,752,515,700,217,873,753,82,0,110,197,141,951,165,1,564,13,351,253,12,10,3,2,4,308,244 }, - { 17,45,21,3,106,5,155,38,227,32,2,209,62,54,12,243,14,181,552,587,46,540,207,794,37,48,430,119,255,221,770,29 }, - { 16,24,35,18,7,0,50,1,9,14,75,69,2,5,12,21,60,13,67,71,23,48,10,108,223,181,189,103,46,64,92,51 }, - { 
127,13,98,165,308,23,286,293,258,51,219,395,197,115,301,401,31,391,22,105,457,170,239,276,55,338,629,116,180,479,509,569 }, - { 539,213,748,840,957,669,30,466,88,217,144,251,863,190,137,93,230,228,679,352,317,203,617,321,258,530,160,219,96,831,816,689 }, - { 5,48,1,21,2,14,0,36,12,38,32,54,430,181,50,270,72,99,281,45,17,10,46,22,37,218,67,3,290,76,23,51 }, - { 13,23,0,4,33,51,2,115,141,1,77,217,180,10,9,317,3,102,11,5,15,197,7,202,22,165,40,64,515,6,341,31 }, - { 13,15,117,515,23,12,37,134,165,700,38,54,457,753,51,64,153,197,14,10,33,82,961,0,99,89,115,719,141,3,4,1 }, - { 5,21,2,3,1,32,14,12,48,17,0,10,51,23,38,22,4,72,13,54,36,45,137,76,99,114,86,37,11,64,540,430 }, - { 202,128,77,318,291,33,269,102,275,141,494,342,40,678,0,177,20,210,402,7,4,5,137,6,13,450,403,32,49,120,23,22 }, - { 1,2,24,14,67,46,69,50,38,103,16,18,75,35,83,29,52,96,5,108,0,7,54,71,149,394,236,309,70,133,220,58 }, - { 15,515,1,700,0,2,753,13,23,5,180,3,51,4,165,457,12,197,115,6,7,21,9,141,8,901,33,82,120,77,10,110 }, - { 0,28,65,14,67,2,124,24,1,9,7,69,55,154,36,16,46,114,175,35,83,22,429,18,109,149,68,189,108,336,251,133 }, - { 56,162,403,3,129,775,99,161,17,40,527,33,880,4,14,128,475,12,548,23,102,202,361,117,34,184,383,200,183,196,64,53 }, - { 151,2,29,58,112,45,186,113,5,70,52,1,311,6,315,66,61,7,74,27,631,17,80,87,287,243,209,227,14,491,19,869 }, - { 6,1,74,2,75,29,25,66,26,70,52,138,67,324,357,42,19,220,14,85,87,108,38,451,309,103,24,69,380,135,114,65 }, - { 15,515,700,13,23,0,1,120,753,51,180,2,260,202,5,141,77,102,9,450,115,21,197,165,7,137,110,33,12,269,901,4 }, - { 5,45,17,2,14,46,48,38,181,50,155,3,186,54,61,29,21,227,281,80,540,106,12,400,52,1,58,32,328,171,209,487 }, - { 16,18,265,121,158,35,60,9,39,7,329,105,252,68,24,1,132,167,159,22,0,49,286,101,21,146,23,327,120,709,5,14 }, - { 108,467,283,56,389,650,123,412,33,177,899,475,216,453,269,349,619,65,51,730,403,670,23,196,523,128,84,13,401,789,503,543 }, - { 
514,3,11,377,328,4,361,507,57,403,14,880,130,485,176,215,236,38,152,102,211,56,62,757,54,585,300,556,34,555,40,229 }, - { 3,555,62,266,130,99,507,139,514,12,152,229,215,305,57,40,440,33,403,471,38,56,475,14,361,313,775,328,196,548,123,23 }, - { 120,202,318,15,77,13,1,450,33,269,515,260,5,128,494,51,23,700,102,141,40,753,326,403,817,137,523,21,177,922,342,7 }, - { 15,1,515,23,0,13,700,2,51,753,180,5,165,21,197,12,3,120,115,4,141,6,9,7,457,33,386,202,82,8,31,341 }, - { 15,180,515,82,351,700,10,317,753,115,217,365,141,898,33,901,13,23,110,854,752,77,1,197,4,341,143,36,64,352,102,9 }, - { 104,289,66,707,214,90,712,64,97,173,20,0,414,194,874,43,32,7,568,560,65,38,426,312,715,192,376,74,835,5,324,147 }, - { 84,56,0,554,63,65,453,249,123,643,18,26,847,475,511,403,416,561,524,289,370,73,9,19,45,42,719,194,27,467,33,730 }, - { 21,346,13,350,308,826,197,101,352,68,570,0,165,23,9,841,115,100,509,694,221,230,35,217,569,88,124,749,1,777,212,154 }, - { 16,92,7,24,60,18,35,140,126,14,50,71,46,330,2,75,246,5,121,267,571,1,230,309,220,0,9,64,146,236,54,108 }, - { 82,15,515,898,365,700,180,33,341,753,77,901,10,115,55,351,21,5,1,4,13,102,36,217,2,165,752,120,197,117,11,317 }, - { 16,24,35,18,69,71,140,1,103,7,189,68,0,50,9,108,2,133,60,267,230,46,149,67,167,118,92,14,75,21,191,38 }, - { 60,71,16,18,7,20,43,118,35,68,375,28,608,0,175,566,154,92,14,149,628,33,22,13,2,10,279,23,107,356,55,117 }, - { 187,258,871,295,201,434,219,224,489,384,268,110,261,839,44,699,93,116,36,131,141,228,144,160,940,567,244,406,137,574,98,253 }, - { 66,7,97,172,192,712,232,324,204,74,43,448,387,426,568,20,526,107,104,135,356,729,173,0,22,5,32,95,2,64,500,560 }, - { 15,515,700,753,1,4,0,341,13,3,134,2,5,33,11,77,12,10,23,197,365,901,7,40,217,32,21,6,51,180,961,37 }, - { 0,28,24,9,35,65,16,124,68,55,109,154,7,39,22,149,158,14,175,1,49,252,18,71,2,168,289,419,108,420,67,101 }, - { 7,16,14,92,2,46,140,24,220,35,38,60,75,1,50,18,87,54,5,126,29,52,278,262,314,107,71,21,172,135,330,394 }, - { 
7,92,16,14,172,126,2,60,140,35,135,314,278,46,24,38,232,107,330,66,5,18,150,246,230,97,52,1,121,563,279,21 }, - { 6,26,235,53,297,436,27,19,25,73,113,445,90,214,65,42,64,289,250,611,624,32,45,648,614,17,85,491,34,122,200,416 }, - { 352,141,1,217,854,752,351,180,244,36,110,661,82,258,816,160,295,219,567,224,230,269,922,144,260,268,93,201,137,116,489,202 }, - { 16,60,35,18,126,107,68,191,92,121,7,14,598,20,493,279,167,446,118,0,28,43,463,55,24,212,375,566,9,150,575,21 }, - { 15,1,515,2,4,13,0,700,3,5,23,753,341,77,51,115,33,11,180,10,197,141,6,165,7,901,102,40,9,202,217,12 }, - { 23,51,13,202,21,5,1,120,15,137,128,125,32,2,12,141,33,165,64,515,403,318,700,48,180,7,6,450,115,523,475,260 }, - { 131,716,224,371,219,187,737,616,385,254,9,98,105,924,31,258,836,39,127,578,49,916,44,761,272,137,944,159,0,242,442,22 }, - { 15,515,700,1,753,2,5,0,4,13,3,180,11,141,197,10,341,217,33,134,165,6,77,7,317,12,352,64,365,32,102,40 }, - { 66,74,7,173,174,29,192,2,222,20,226,43,353,52,712,6,0,138,500,204,97,145,64,104,426,673,355,90,25,5,65,87 }, - { 5,259,786,534,590,493,279,49,13,581,465,21,929,35,941,132,147,32,23,612,362,626,107,121,178,0,146,61,48,939,10,18 }, - { 2,14,16,7,278,69,135,140,46,24,267,35,92,38,1,189,29,52,309,60,66,75,71,172,74,357,18,87,67,6,230,5 }, - { 165,13,308,197,391,23,401,15,51,457,180,509,115,569,3,629,961,719,34,758,317,734,14,29,46,2,17,901,38,453,5,217 }, - { 1,22,2,14,0,28,7,168,67,49,65,24,36,95,5,105,55,35,12,46,69,16,114,159,194,50,10,9,158,83,164,109 }, - { 34,453,3,196,130,14,322,11,47,51,377,236,361,4,730,153,514,711,57,440,62,17,161,108,176,59,485,56,162,412,202,117 }, - { 18,16,21,23,48,13,24,35,121,5,156,60,51,1,7,132,141,221,163,115,0,271,447,340,363,202,125,71,2,781,22,698 }, - { 165,13,457,23,197,961,629,569,341,41,12,38,401,901,54,51,115,17,15,509,421,37,62,45,719,57,32,328,117,758,157,99 }, - { 2,1,77,141,33,64,3,102,0,23,13,5,128,10,6,15,180,202,269,40,51,515,7,165,137,117,318,4,700,153,197,352 }, - { 
68,212,0,124,101,9,154,16,562,191,21,149,65,24,35,1,118,167,818,350,520,100,722,841,264,71,13,302,478,23,375,346 }, - { 98,23,48,598,13,293,541,21,125,121,51,807,0,31,35,259,126,7,386,1,223,783,10,107,199,20,221,144,342,963,49,64 }, - { 21,13,5,586,1,23,167,48,33,781,647,49,165,18,51,271,77,32,761,118,0,82,391,22,146,141,459,31,197,156,115,4 }, - { 2,1,5,61,29,7,58,45,14,6,425,32,70,52,290,738,207,21,72,112,66,76,655,17,186,46,64,263,38,0,128,87 }, - { 39,265,9,100,1,333,363,101,18,411,447,254,166,310,31,98,264,30,639,404,156,286,16,93,593,203,272,682,0,905,44,821 }, - { 6,2,1,19,29,51,26,108,25,74,5,23,14,114,13,386,133,103,42,66,453,70,309,138,719,324,65,38,64,96,52,75 }, - { 20,43,356,107,49,858,595,7,414,359,0,5,392,319,97,612,422,819,14,376,173,246,22,470,147,427,230,92,197,33,683,95 }, - { 0,9,68,35,65,67,114,101,28,1,124,175,336,69,154,103,83,24,189,133,39,16,50,7,2,149,55,251,18,345,230,36 }, - { 23,13,51,15,0,1,515,115,165,2,5,12,700,202,4,21,141,457,753,197,10,3,180,120,32,9,318,11,453,64,6,269 }, - { 121,195,60,16,126,107,98,271,146,407,132,35,1,167,199,223,493,191,279,20,18,5,43,7,21,92,48,393,0,362,212,467 }, - { 31,44,299,116,393,144,492,456,268,22,105,0,367,918,384,434,127,489,98,9,963,125,242,948,1,28,206,49,36,51,93,293 }, - { 23,13,457,51,165,401,719,758,197,453,961,629,308,14,15,12,730,3,386,569,391,29,739,515,34,828,832,901,115,514,670,341 }, - { 105,36,131,22,180,115,341,127,169,1,9,31,64,98,44,365,317,141,272,143,160,55,219,86,197,776,239,187,0,535,13,752 }, - { 1,6,2,14,66,25,29,5,108,67,65,114,19,38,26,52,74,7,24,18,69,86,36,388,64,51,17,83,23,46,42,75 }, - { 51,386,23,453,719,13,730,6,457,670,758,19,401,165,2,475,47,26,899,14,108,17,1,5,197,29,894,754,236,74,27,285 }, - { 252,18,9,101,121,16,132,0,419,167,364,60,604,35,265,363,146,271,39,158,68,109,28,329,848,24,647,907,682,159,212,55 }, - { 283,503,128,432,26,193,63,269,789,529,102,122,389,275,678,6,25,318,445,4,342,27,573,605,177,862,643,291,216,57,235,59 }, - { 
2,1,29,75,69,52,14,6,46,74,87,7,220,226,278,38,135,66,267,70,16,262,25,24,380,324,357,140,67,394,97,222 }, - { 97,298,69,7,66,140,189,24,16,267,172,423,60,150,14,314,92,71,81,501,43,35,74,6,517,232,149,607,83,330,18,2 }, - { 475,421,403,899,51,805,523,958,453,817,23,615,401,801,120,326,202,670,494,730,450,386,115,629,260,576,77,365,569,0,165,13 }, - { 7,20,14,128,77,97,112,202,2,177,16,415,269,318,275,66,107,43,141,414,135,38,307,10,58,0,6,291,32,5,4,40 }, - { 24,14,7,0,2,1,22,28,16,65,168,124,35,67,108,109,18,49,10,149,69,158,5,95,289,12,55,6,36,71,46,21 }, - { 26,80,27,73,122,25,19,17,6,42,684,209,445,573,667,106,45,690,4,611,255,680,297,495,65,59,128,119,483,113,64,53 }, - { 107,259,362,376,465,20,470,147,595,534,612,683,660,43,5,49,581,0,858,35,427,246,97,786,178,356,14,21,142,878,7,279 }, - { 131,30,228,190,856,406,224,88,219,530,863,613,778,274,944,816,187,39,100,160,258,31,44,93,1,321,539,36,871,137,435,531 }, - { 113,6,311,25,45,491,80,611,27,26,209,667,17,73,122,42,684,396,19,85,106,5,614,4,2,255,151,29,1,64,648,61 }, - { 15,515,700,753,0,1,23,51,120,2,13,82,5,260,9,4,341,77,180,115,141,10,7,12,450,8,202,901,197,351,165,93 }, - { 219,127,98,258,395,421,924,293,242,201,697,105,276,51,308,23,453,272,401,944,512,137,13,31,284,567,386,365,116,131,964,125 }, - { 15,180,352,141,515,752,217,82,1,317,854,700,351,753,115,341,110,13,260,120,21,36,33,898,23,10,5,365,4,160,901,137 }, - { 129,123,17,257,162,184,205,249,183,769,5,80,3,4,229,130,119,45,90,99,618,106,57,497,12,128,2,84,59,152,27,40 }, - { 33,102,23,77,64,128,51,13,0,202,10,141,40,15,1,22,117,137,2,86,4,403,269,153,515,196,65,11,700,115,99,5 }, - { 7,14,2,16,172,107,46,92,5,135,35,202,294,87,38,232,29,97,20,21,24,1,60,220,66,43,12,0,126,52,54,70 }, - { 403,576,615,523,475,326,805,817,494,421,51,202,120,450,137,453,23,859,260,401,402,77,33,670,0,958,15,197,386,515,165,480 }, - { 141,352,217,137,0,180,202,349,9,269,23,51,115,291,77,372,13,317,120,752,365,351,93,22,2,341,64,10,82,854,28,18 }, - { 
1,23,13,51,202,141,5,165,21,15,120,180,64,2,197,125,33,102,12,7,137,515,48,128,269,318,93,700,0,403,9,4 }, - { 25,151,6,145,122,29,174,45,113,74,4,665,42,138,2,614,416,287,19,348,746,0,66,26,1,7,64,243,311,396,81,624 }, - { 30,190,254,166,100,382,731,829,88,131,264,795,9,93,625,274,438,1,578,613,716,31,44,39,530,36,616,921,265,203,160,77 }, - { 132,5,21,13,1,23,32,195,379,687,156,121,626,296,48,70,850,146,51,82,883,771,35,49,652,407,60,4,260,0,845,33 }, - { 9,254,0,49,272,131,39,159,688,101,105,578,518,158,286,28,327,333,68,224,252,219,344,16,22,1,716,31,30,228,24,890 }, - { 16,7,35,60,18,20,14,68,9,0,28,118,43,92,126,55,107,2,101,154,24,71,5,202,121,109,22,252,21,97,1,621 }, - { 15,515,700,753,13,1,341,2,0,4,3,5,11,23,10,33,117,12,901,197,6,134,77,8,165,317,21,365,217,7,17,40 }, - { 78,19,444,47,26,390,27,453,130,813,108,730,711,65,412,122,51,680,113,235,690,196,630,283,128,236,14,64,73,53,200,445 }, - { 2,7,29,5,61,6,45,1,66,113,112,14,52,315,738,128,32,151,74,16,20,64,70,21,592,0,25,4,425,43,491,222 }, - { 145,112,74,66,6,29,26,70,19,396,25,87,2,287,135,151,138,222,5,226,42,122,7,307,1,644,45,58,113,651,635,632 }, - { 92,16,7,60,126,24,140,35,14,232,18,121,246,71,46,267,172,150,107,314,132,146,230,2,278,108,330,199,236,5,38,572 }, - { 13,115,197,538,569,341,98,55,165,127,365,762,219,286,844,23,170,206,734,638,535,901,169,253,629,0,873,509,180,10,332,258 }, - { 58,151,74,53,287,27,29,396,6,70,2,73,5,52,112,26,651,1,297,113,17,75,19,45,334,445,145,34,315,549,436,331 }, - { 214,289,90,874,104,751,64,65,312,835,204,249,750,194,74,81,875,32,519,288,348,0,174,247,636,715,138,192,784,6,524,280 }, - { 9,39,28,35,30,0,166,49,1,175,439,158,64,346,36,101,67,364,86,88,274,100,168,55,23,10,420,22,190,141,505,180 }, - { 341,901,15,515,700,753,1,365,10,0,569,180,2,197,115,31,165,3,5,4,44,22,317,13,9,951,23,253,116,143,762,93 }, - { 120,202,77,450,260,15,128,318,102,515,494,13,817,700,269,5,403,51,1,33,23,753,82,326,141,342,291,137,21,523,351,32 }, - { 
13,115,241,64,180,32,125,197,165,4,118,22,21,23,16,247,237,28,225,191,95,141,167,5,0,341,288,35,459,18,177,24 }, - { 16,24,35,14,1,2,7,69,18,46,60,50,267,140,71,189,108,38,75,92,0,5,9,230,67,21,309,335,54,236,394,220 }, - { 15,515,700,753,898,180,901,341,197,638,10,165,33,1,115,4,77,365,317,13,102,217,117,0,5,2,253,3,82,569,21,752 }, - { 193,523,18,84,56,730,233,65,4,817,90,33,643,403,91,511,453,240,59,11,214,51,719,196,153,475,32,123,64,847,102,561 }, - { 112,29,151,2,74,6,66,7,222,145,287,45,5,624,52,25,113,416,58,122,19,70,186,204,4,87,644,549,337,884,32,0 }, - { 13,0,23,2,1,15,33,3,77,515,141,5,4,217,10,51,64,180,700,115,6,117,11,7,753,40,102,165,197,22,317,153 }, - { 28,0,1,67,65,9,2,114,83,69,103,50,36,22,55,24,46,14,124,109,35,7,16,38,133,160,389,323,18,12,154,5 }, - { 121,132,18,167,271,146,101,363,621,9,411,647,16,354,520,60,212,932,1,806,55,0,195,446,68,35,31,364,777,252,407,118 }, - { 26,6,85,396,122,624,25,19,42,445,64,648,573,416,174,680,665,214,45,348,90,65,194,145,113,881,138,289,112,436,297,544 }, - { 16,146,18,92,24,199,60,71,121,126,35,108,156,953,271,674,132,7,32,640,360,246,649,118,21,95,5,517,14,9,1,314 }, - { 51,13,23,453,475,730,719,15,457,403,64,115,33,95,4,523,3,12,21,6,899,102,5,128,401,202,11,141,308,515,22,125 }, - { 151,396,6,53,27,113,58,26,73,112,74,287,45,29,297,19,145,70,138,445,315,436,34,2,17,573,5,61,549,491,1,80 }, - { 223,1,888,774,260,98,269,385,349,202,96,141,421,622,730,863,318,697,87,453,393,418,922,834,751,5,163,335,120,291,352,30 }, - { 16,60,92,35,126,121,7,150,246,18,107,1,598,24,167,195,14,97,71,279,98,441,191,199,517,146,356,223,298,271,230,0 }, - { 22,1,105,28,239,170,0,55,95,31,36,301,2,320,98,127,9,49,44,64,35,67,10,86,5,12,109,23,168,13,21,312 }, - { 2,6,5,207,292,76,1,119,45,32,17,29,61,306,790,58,240,106,14,64,214,151,476,710,7,72,84,128,4,179,70,25 }, - { 51,23,221,254,115,13,438,530,125,48,21,39,541,960,386,49,1,613,15,840,228,308,627,131,688,401,5,326,421,158,165,83 }, - { 
1,5,2,0,12,22,21,36,10,14,48,86,23,13,32,54,3,4,28,65,51,50,137,37,208,114,9,38,17,7,281,202 }, - { 363,23,447,182,296,340,1,93,698,478,379,156,284,144,18,269,21,98,141,70,668,411,664,658,110,914,67,937,180,691,335,291 }, - { 17,32,45,498,41,115,180,197,106,62,54,38,546,165,13,155,468,509,341,243,241,217,542,15,57,536,428,51,117,721,292,129 }, - { 32,95,64,246,22,92,180,13,5,652,125,241,638,237,7,49,4,126,21,115,197,296,888,316,0,165,774,23,16,392,1,534 }, - { 15,515,700,753,33,341,13,217,4,141,77,23,180,317,1,10,102,351,82,115,40,5,854,21,137,11,352,901,365,117,197,0 }, - { 15,120,1,82,93,217,515,260,77,141,13,110,700,351,352,23,180,753,21,854,202,317,64,349,269,51,165,137,5,128,291,36 }, - { 13,23,51,141,77,0,33,4,115,64,2,10,102,202,217,128,1,177,269,11,7,22,6,21,32,9,180,40,15,3,165,318 }, - { 478,264,1,520,98,724,9,682,223,664,21,759,13,772,604,100,23,363,411,48,821,5,0,905,909,447,31,265,88,101,166,39 }, - { 20,29,7,2,77,416,6,128,33,5,0,113,104,32,43,13,491,66,23,21,102,51,74,210,202,525,64,318,10,81,174,14 }, - { 2,1,5,14,7,58,61,29,45,290,46,38,52,21,32,270,6,592,425,0,75,155,16,48,17,50,72,70,207,24,263,663 }, - { 80,6,17,209,106,26,483,113,19,469,255,25,378,27,495,833,45,64,161,2,61,667,76,742,32,90,445,5,814,65,887,119 }, - { 98,223,393,1,354,834,195,791,447,697,284,293,360,541,781,156,51,807,18,664,421,411,163,668,48,31,591,765,883,386,948,23 }, - { 679,141,816,36,93,406,876,144,228,137,1,180,669,21,332,251,5,269,116,187,96,351,202,752,317,64,203,831,574,466,855,345 }, - { 15,515,700,13,1,753,2,0,23,341,3,5,4,10,51,11,33,165,6,7,115,197,12,64,180,153,217,77,9,569,901,317 }, - { 13,23,202,51,5,21,403,15,120,64,1,450,128,141,12,523,33,165,494,125,2,515,269,7,48,102,318,95,260,180,453,197 }, - { 16,18,24,60,71,92,146,246,199,35,140,7,9,118,121,108,167,230,126,132,0,640,156,14,68,133,267,360,649,271,64,55 }, - { 269,141,678,177,202,77,128,318,33,947,40,120,291,349,102,137,64,352,210,864,461,498,13,342,196,23,275,450,954,0,205,111 }, - { 
16,24,92,18,71,60,35,7,108,191,167,246,140,14,126,21,1,68,150,118,149,388,399,9,273,0,121,796,230,48,212,517 }, - { 2,14,1,29,46,75,52,70,69,171,38,7,58,163,16,5,24,220,67,112,223,54,50,409,155,35,267,186,151,334,394,140 }, - { 9,252,100,265,166,39,88,404,329,0,1,520,382,812,101,593,264,274,604,676,30,118,68,553,18,664,363,23,639,865,21,411 }, - { 16,18,35,24,0,60,158,7,22,68,14,49,109,159,55,9,28,71,2,10,5,105,1,118,329,13,344,23,92,20,21,126 }, - { 15,13,515,700,23,0,753,1,51,2,4,10,77,5,3,197,115,165,961,202,9,457,180,12,141,22,33,120,6,11,318,31 }, - { 160,93,251,137,317,1,180,36,120,217,345,752,617,352,332,10,96,531,498,318,365,202,141,269,816,341,901,679,143,35,83,968 }, - { 6,25,42,128,19,59,122,4,85,26,611,27,269,233,45,0,343,91,318,80,11,177,283,73,33,614,2,77,64,138,445,216 }, - { 95,64,74,7,32,81,51,204,0,20,237,65,56,38,91,23,207,180,347,343,29,6,511,52,49,10,25,18,554,370,14,312 }, - { 202,120,326,260,450,817,494,318,137,403,128,77,523,553,859,5,704,1,15,23,13,576,7,16,615,51,682,291,515,0,21,234 }, - { 20,43,107,356,362,126,595,92,359,7,422,319,493,16,858,5,392,246,414,683,60,0,35,945,441,21,259,819,49,97,279,173 }, - { 25,42,6,77,33,102,0,122,4,690,29,483,210,27,21,19,2,300,18,648,680,119,117,59,1,10,342,12,26,153,91,684 }, - { 31,44,299,116,125,242,456,599,22,393,0,144,492,28,268,1,9,963,301,105,367,36,127,170,384,434,206,98,918,10,13,93 }, - { 410,521,686,367,662,88,335,321,201,96,98,772,144,1,934,921,443,435,284,274,264,551,120,897,44,100,33,225,744,418,909,960 }, - { 142,178,878,234,132,786,195,202,77,416,147,929,146,522,167,259,687,639,450,271,626,481,590,5,198,212,771,49,0,465,315,427 }, - { 254,39,131,9,272,0,578,716,310,224,30,49,105,827,518,829,166,333,616,228,613,846,101,219,1,31,890,98,159,938,252,100 }, - { 230,699,854,473,450,351,831,137,855,217,352,704,800,202,251,498,160,144,206,203,317,201,253,752,418,141,1,332,82,180,443,36 }, - { 
403,202,475,453,494,23,51,77,318,402,13,33,128,102,137,141,120,342,269,0,450,4,899,576,40,421,275,117,217,177,196,64 }, - { 23,44,98,182,291,144,116,39,110,141,96,82,905,70,367,264,125,93,77,411,120,1,658,202,100,415,107,363,197,30,447,105 }, - { 15,515,1,13,700,23,77,120,0,753,51,180,202,141,260,5,21,115,2,137,128,9,450,197,365,269,12,326,110,102,318,7 }, - { 0,32,18,95,207,577,193,29,61,104,64,784,715,102,693,887,81,91,583,671,403,5,52,474,397,180,138,49,37,344,38,263 }, - { 0,101,9,68,252,16,100,39,166,364,124,24,154,265,212,88,18,35,329,419,28,118,71,30,65,158,191,55,1,694,21,676 }, - { 16,24,191,18,35,71,167,118,149,68,212,9,0,1,21,108,101,92,60,375,302,7,589,755,124,674,350,48,562,246,13,363 }, - { 2,14,69,24,1,67,46,16,38,103,29,267,7,35,189,135,278,71,108,18,83,309,52,6,149,388,75,236,60,0,150,66 }, - { 13,1,23,0,4,2,51,15,180,33,3,115,5,515,141,10,77,700,11,9,197,341,202,165,217,102,22,7,753,317,365,6 }, - { 447,1,698,411,31,363,98,5,919,4,156,125,759,691,13,64,459,354,44,21,48,293,30,914,478,225,82,120,2,922,848,839 }, - { 854,82,351,217,141,180,352,15,515,752,1,700,317,898,753,244,10,21,922,115,77,36,4,260,64,110,372,13,5,365,120,11 }, - { 0,1,4,13,5,2,82,33,3,120,10,23,9,11,77,260,21,102,8,31,40,6,351,51,64,450,22,117,93,110,7,457 }, - { 51,13,403,23,12,475,1,2,21,5,453,523,115,202,817,7,0,99,3,6,450,120,494,64,22,95,49,899,10,37,32,141 }, - { 180,752,352,141,498,864,317,217,9,0,115,237,230,39,30,197,83,1,930,64,35,365,372,13,579,88,702,36,101,901,482,21 }, - { 16,24,0,7,22,18,28,35,14,158,71,2,109,60,1,168,49,154,124,68,10,55,92,118,159,9,5,747,95,105,65,6 }, - { 15,515,700,753,0,23,1,13,341,51,120,77,141,4,137,33,260,82,202,9,180,5,351,2,291,10,11,901,21,115,40,128 }, - { 24,68,35,149,18,16,0,7,9,14,189,108,69,65,67,1,71,2,118,28,140,101,114,336,230,124,175,133,46,55,251,154 }, - { 13,308,197,115,125,9,165,237,391,23,509,569,546,28,49,629,22,338,317,254,749,180,468,159,903,386,217,352,558,39,36,734 }, - { 
64,90,32,217,77,4,141,216,172,312,128,13,86,33,597,147,352,95,5,115,875,22,59,11,102,14,182,437,97,177,874,707 }, - { 1,22,36,0,105,28,2,67,95,49,55,5,239,12,86,9,83,170,312,64,31,21,23,10,164,50,114,159,208,13,7,320 }, - { 9,18,16,0,159,105,101,252,49,310,24,68,35,39,265,272,7,1,60,28,455,890,329,557,118,286,55,137,327,167,5,13 }, - { 129,123,214,249,618,17,5,257,205,184,460,76,2,162,769,245,90,106,128,45,119,1,183,4,3,12,179,64,6,229,99,209 }, - { 51,453,719,457,23,13,730,899,475,386,4,15,11,12,670,196,515,523,961,401,153,3,700,99,753,117,403,32,120,165,57,0 }, - { 173,66,192,204,20,74,104,636,7,43,289,426,825,712,560,214,81,750,65,97,707,0,90,414,64,348,32,500,22,861,95,6 }, - { 13,23,1,51,5,21,141,120,202,15,165,2,515,180,12,125,0,64,82,700,197,269,32,48,260,128,115,93,9,137,33,753 }, - { 200,34,322,78,472,390,27,714,19,14,136,161,453,176,236,444,59,3,62,128,108,57,283,862,73,53,47,17,412,813,4,56 }, - { 33,347,66,204,426,498,56,172,97,95,5,681,546,22,10,0,135,180,4,241,19,174,6,353,263,21,7,370,42,197,27,808 }, - { 131,224,219,187,385,371,258,442,254,737,31,98,836,127,924,944,44,871,908,716,39,827,201,574,116,137,36,1,276,242,578,616 }, - { 1,15,2,180,4,141,13,515,0,5,3,115,700,901,341,23,217,352,753,51,197,77,317,33,365,752,165,21,6,7,269,93 }, - { 370,91,718,74,81,510,397,66,636,240,355,84,138,511,18,278,6,681,701,289,90,5,214,582,64,104,0,643,192,65,750,32 }, - { 15,515,700,753,13,120,1,0,165,2,197,23,260,180,4,82,51,386,8,5,12,10,3,141,351,341,326,9,450,7,64,6 }, - { 32,2,76,5,1,292,72,45,476,214,21,241,29,14,17,48,129,90,179,460,464,123,290,148,519,205,3,263,249,38,710,89 }, - { 13,165,115,17,197,569,23,509,457,45,32,41,106,180,62,38,659,734,155,536,341,629,961,873,587,54,431,37,391,99,405,428 }, - { 68,24,35,16,0,101,9,124,154,71,149,65,18,175,28,118,7,55,302,108,92,14,22,346,1,39,429,252,375,364,10,67 }, - { 0,9,1,68,230,65,35,69,83,23,101,13,141,67,217,352,21,39,16,88,28,124,212,100,115,154,51,64,30,36,10,317 }, - { 
100,265,88,909,410,382,812,593,1,213,321,30,252,230,352,264,9,166,689,39,676,98,21,466,724,639,478,217,13,48,553,101 }, - { 113,61,198,904,43,0,5,37,899,325,20,59,33,523,204,725,817,389,470,329,222,40,174,58,22,453,690,848,122,104,788,105 }, - { 16,24,141,18,7,0,71,140,35,269,75,352,12,9,108,217,5,330,60,64,199,70,22,13,486,246,318,133,65,50,23,498 }, - { 271,167,121,60,18,191,146,199,16,1,446,132,575,212,463,354,126,35,598,566,727,98,107,21,608,955,640,407,5,24,223,68 }, - { 15,515,115,217,700,13,317,753,141,180,33,23,110,120,4,341,82,10,1,260,365,36,64,854,351,21,51,352,137,77,40,0 }, - { 173,7,97,356,43,107,20,387,729,104,426,232,560,595,359,392,414,707,885,81,5,0,66,858,612,49,861,14,22,32,819,230 }, - { 15,515,700,753,13,23,51,82,0,33,165,120,196,4,1,2,197,453,260,351,180,12,40,8,386,110,5,326,9,141,217,457 }, - { 9,100,120,30,77,795,137,82,202,39,264,827,578,127,0,166,373,318,18,326,141,260,1,450,731,31,33,395,217,291,341,254 }, - { 14,2,67,1,24,69,0,28,65,7,46,18,114,108,36,83,38,398,9,16,124,133,103,154,50,55,22,267,29,160,35,547 }, - { 14,7,69,24,66,16,2,267,189,67,71,150,140,97,18,60,172,35,6,1,38,149,388,92,83,135,108,74,462,380,29,36 }, - { 15,515,700,753,1,13,2,0,4,341,5,3,23,365,11,117,180,10,12,33,134,115,77,197,217,165,6,7,317,102,21,9 }, - { 1,22,0,12,5,2,36,28,21,10,86,13,23,49,128,9,95,51,55,96,208,141,48,202,4,137,37,64,105,3,50,7 }, - { 605,630,63,123,736,650,65,108,444,368,561,389,19,25,42,619,122,194,183,27,53,33,84,26,297,813,114,73,256,235,249,216 }, - { 127,39,9,0,31,371,98,254,1,385,395,44,30,836,187,131,100,116,284,578,299,166,28,21,737,16,276,272,23,49,137,935 }, - { 2,61,6,29,45,151,1,655,7,207,32,5,112,425,17,76,833,4,14,64,58,106,119,25,113,128,72,52,70,21,292,790 }, - { 2,5,17,14,3,29,23,27,13,401,46,6,51,58,1,453,45,53,34,52,133,19,236,26,181,114,99,366,151,108,218,38 }, - { 31,22,170,1,301,44,127,98,36,28,55,105,239,0,338,116,512,299,293,125,86,10,242,395,2,13,9,64,841,23,95,685 }, - { 
1,21,2,14,5,32,48,50,38,270,46,76,290,72,45,54,17,0,155,221,263,207,37,281,430,3,89,12,181,408,36,67 }, - { 17,106,119,378,84,240,62,80,383,136,306,3,56,790,742,5,207,504,64,440,32,128,45,2,123,209,14,4,61,57,297,667 }, - { 13,15,1,515,23,2,0,700,4,115,3,51,5,10,753,180,33,341,217,11,165,317,365,197,6,77,40,64,22,9,7,117 }, - { 772,335,96,744,1,367,662,686,652,897,303,264,521,31,225,410,141,520,260,116,64,44,321,98,144,88,919,966,340,269,349,284 }, - { 1,0,5,28,36,2,12,22,83,67,65,50,24,14,9,96,21,218,18,114,48,281,54,10,7,160,181,103,37,23,133,99 }, - { 25,6,145,42,138,81,174,348,525,544,26,74,85,280,287,648,746,91,66,0,29,396,204,64,636,90,122,194,355,104,65,233 }, - { 16,24,0,18,28,158,7,35,49,22,68,159,55,1,14,109,105,2,9,71,65,154,124,95,424,344,60,239,118,577,21,10 }, - { 352,854,699,230,93,689,137,144,217,160,251,36,669,202,351,120,617,855,752,203,332,82,450,180,141,748,831,30,258,201,1,816 }, - { 15,13,515,700,1,2,0,753,23,5,4,3,51,33,10,115,11,317,217,77,180,341,117,165,6,134,197,153,64,9,102,7 }, - { 104,20,43,173,66,319,0,77,202,7,198,5,97,580,355,74,2,204,174,52,712,234,426,155,102,192,32,4,500,337,226,904 }, - { 13,23,15,51,1,515,0,2,5,700,141,4,753,165,115,12,3,21,457,10,180,269,32,6,197,202,9,7,120,11,77,33 }, - { 15,515,700,753,0,1,13,2,23,115,4,317,8,3,5,51,9,341,10,217,22,365,33,457,6,180,77,901,197,120,18,7 }, - { 15,515,700,753,1,0,2,3,4,13,5,141,23,16,82,217,457,10,365,180,9,317,51,21,269,898,64,202,11,12,318,341 }, - { 2,5,1,14,50,38,29,17,114,46,133,3,45,21,58,171,181,36,218,12,6,52,0,48,137,65,361,23,155,4,285,51 }, - { 0,68,9,65,101,124,35,212,16,149,154,100,24,1,114,336,67,589,252,39,71,189,69,562,18,13,30,398,118,88,265,264 }, - { 1,2,14,22,0,7,67,65,28,36,24,46,168,5,86,69,38,16,49,12,289,10,194,50,83,114,95,6,18,23,55,158 }, - { 13,23,15,51,515,0,700,4,1,753,2,10,115,961,457,12,33,11,3,5,197,9,165,77,102,403,453,40,64,22,37,59 }, - { 15,515,700,753,0,1,2,13,23,5,51,901,8,9,180,3,7,82,4,120,12,10,719,341,6,31,141,457,197,22,115,93 }, - { 
100,252,88,101,0,265,9,724,48,1,21,352,213,676,410,382,321,230,30,329,593,909,39,812,553,217,23,689,520,264,166,419 }, - { 2,5,1,58,171,14,46,50,29,52,45,38,186,155,67,54,151,281,334,61,48,96,17,181,103,400,502,227,21,223,12,69 }, - { 23,120,13,1,202,141,51,21,165,128,260,15,5,269,137,64,33,180,82,318,93,197,77,326,515,125,110,700,450,2,32,48 }, - { 341,197,10,901,13,15,8,638,569,515,479,23,180,873,700,165,143,642,0,961,753,951,1,115,509,499,116,12,498,242,82,206 }, - { 15,1,23,13,515,21,120,51,2,141,202,700,5,180,165,0,753,197,12,7,33,260,352,137,269,4,82,128,48,9,110,6 }, - { 2,29,50,58,1,6,5,52,14,262,17,46,27,53,151,34,171,74,324,26,38,309,45,113,19,96,287,396,223,67,73,583 }, - { 13,23,141,51,4,202,0,115,77,2,33,217,5,317,180,64,10,269,3,9,15,21,1,128,102,137,318,11,352,515,22,31 }, - { 1,67,0,24,50,5,14,18,16,69,2,9,103,35,83,12,96,28,54,7,58,223,21,46,281,48,65,181,22,38,36,108 }, - { 13,23,141,51,77,64,202,115,33,102,128,4,0,269,10,21,217,32,180,318,9,137,2,11,22,291,7,177,16,31,165,197 }, - { 317,115,180,365,873,498,217,341,13,752,482,197,569,352,1,901,36,23,457,468,165,346,546,143,509,134,579,876,868,2,332,21 }, - { 184,257,205,229,152,17,57,497,266,432,452,524,5,619,381,32,4,90,2,12,313,128,45,59,245,106,3,471,129,769,339,214 }, - { 13,23,1,0,15,2,4,515,51,3,10,33,5,700,115,180,753,77,11,365,341,217,9,6,197,7,102,165,317,40,22,64 }, - { 626,70,771,687,379,846,767,761,518,878,82,481,31,786,49,591,178,163,407,44,87,13,845,125,590,371,195,120,98,557,937,351 }, - { 264,1,410,909,772,897,686,521,335,478,98,96,691,639,100,44,284,382,31,321,744,88,914,724,662,765,223,9,682,363,0,367 }, - { 13,23,1,2,0,15,51,515,5,10,4,33,115,77,180,700,3,141,217,40,6,753,317,197,64,165,7,11,102,9,341,22 }, - { 141,77,13,64,269,23,115,21,318,217,5,202,102,33,137,2,15,291,177,51,48,180,32,4,515,352,128,7,0,10,96,11 }, - { 13,15,23,515,51,0,700,753,1,2,4,10,33,11,961,453,115,40,457,14,12,3,9,5,165,401,197,77,22,21,64,102 }, - { 
1,22,0,36,2,31,5,12,13,105,28,9,49,86,141,21,23,95,128,55,44,115,170,10,164,98,180,4,137,239,83,51 }, - { 100,101,88,0,252,9,265,30,21,39,759,724,213,329,321,13,419,68,562,382,676,352,694,35,553,410,1,166,909,593,230,23 }, - { 539,88,30,190,321,530,840,144,669,435,957,748,778,100,96,418,203,213,1,131,410,228,466,274,36,382,219,863,613,83,822,352 }, - { 7,97,92,173,298,107,43,314,232,140,16,356,20,387,729,362,126,359,246,14,230,501,426,441,0,5,560,66,104,779,35,60 }, - { 2,14,7,1,58,5,46,16,38,70,75,45,24,155,29,0,21,52,61,163,220,50,69,270,35,48,32,171,18,6,64,54 }, - { 447,411,363,664,647,98,621,1,354,271,223,478,18,777,781,936,360,759,167,132,121,48,21,156,9,195,118,293,23,691,13,264 }, - { 2,1,14,65,36,67,0,7,46,22,69,5,38,24,28,6,83,29,86,114,168,50,124,208,12,18,108,10,194,484,103,16 }, - { 421,386,51,791,730,958,165,801,23,453,697,403,615,13,221,523,24,899,401,326,551,670,576,102,18,33,125,77,566,115,203,197 }, - { 104,319,422,945,0,81,20,43,715,32,784,693,879,7,397,74,306,207,52,681,671,2,61,173,6,636,904,95,887,5,18,192 }, - { 2,29,1,46,14,52,70,262,6,26,50,67,75,96,309,38,103,112,58,19,5,163,145,83,74,220,223,357,24,69,331,25 }, - { 786,929,590,771,687,626,941,178,465,259,70,5,13,21,35,534,107,518,132,49,878,48,146,121,379,279,31,767,147,195,108,125 }, - { 5,2,186,45,17,29,48,50,14,61,46,155,400,1,227,171,52,58,38,54,430,209,80,281,3,106,536,311,181,243,21,502 }, - { 0,32,64,95,817,494,342,403,207,202,194,389,453,365,312,180,316,5,690,237,848,577,450,61,102,523,475,289,49,241,65,482 }, - { 23,13,51,15,202,515,1,0,2,12,115,4,700,5,165,753,77,457,21,141,128,10,3,64,403,32,197,318,9,11,33,117 }, - { 119,2,6,76,5,17,45,292,306,240,32,1,19,84,64,61,4,209,710,80,26,0,106,27,214,25,128,129,29,179,3,113 }, - { 15,515,700,1,753,0,13,23,180,120,51,2,5,33,165,197,9,450,7,260,115,523,4,12,202,141,82,77,21,102,8,6 }, - { 6,138,74,280,222,85,66,226,25,42,87,204,64,337,29,135,95,174,235,26,145,65,19,32,792,294,112,52,256,2,5,22 }, - { 
39,9,0,101,333,158,49,252,310,254,272,68,16,18,159,286,344,455,30,109,627,327,24,105,419,100,364,22,35,1,329,709 }, - { 21,5,32,14,2,1,38,72,76,54,17,3,48,221,270,0,45,46,12,181,37,89,36,50,540,290,430,10,4,741,99,23 }, - { 120,137,202,269,141,260,318,450,922,494,77,291,82,15,5,351,128,1,515,326,64,854,700,352,342,21,753,678,349,32,523,90 }, - { 16,24,68,35,71,18,149,118,191,167,9,0,212,124,65,246,7,67,140,189,399,101,133,60,1,108,267,114,69,92,695,154 }, - { 28,0,9,67,1,22,109,36,55,65,194,114,39,83,49,69,2,35,103,50,158,208,86,420,168,289,505,24,7,185,5,323 }, - { 147,325,198,427,142,178,202,798,5,376,20,318,259,43,120,450,77,234,534,904,470,465,878,725,329,14,315,0,260,858,70,61 }, - { 28,0,65,9,109,1,55,67,35,22,24,39,289,7,175,14,114,2,158,124,420,194,68,16,336,36,49,69,168,570,154,505 }, - { 141,180,13,115,1,23,4,269,2,202,0,317,217,51,15,5,21,352,77,318,3,752,197,10,165,365,137,341,9,515,33,64 }, - { 15,515,700,0,753,1,2,13,23,9,51,5,4,901,33,7,8,3,12,93,180,120,197,6,82,341,10,141,22,260,457,115 }, - { 6,74,66,1,25,75,324,380,278,26,138,85,135,500,87,42,220,841,97,350,29,19,70,226,38,21,52,606,235,889,2,14 }, - { 13,23,51,12,4,15,453,0,457,1,403,165,115,3,11,2,64,5,401,10,515,37,202,33,40,32,99,475,197,700,308,17 }, - { 788,180,5,83,693,319,314,4,32,21,17,11,817,3,510,498,33,12,24,104,814,120,64,117,306,804,523,450,288,160,102,43 }, - { 15,1,515,13,2,700,23,0,4,753,3,341,5,51,33,11,10,6,77,7,115,102,180,165,141,9,197,217,901,40,12,64 }, - { 101,18,9,167,520,16,0,118,60,212,604,364,694,24,55,252,68,917,264,35,1,121,146,363,39,100,806,5,21,166,191,28 }, - { 13,23,0,33,51,141,77,4,64,2,115,217,9,102,7,202,21,10,180,3,15,128,5,269,6,32,11,16,165,352,22,317 }, - { 66,135,97,74,172,6,278,7,204,324,138,174,29,85,2,87,25,140,92,192,52,38,802,69,448,500,808,620,22,1,280,232 }, - { 2,24,69,6,97,7,1,0,14,298,423,66,67,29,150,25,189,267,124,74,607,18,36,81,172,33,83,38,52,273,71,809 }, - { 
0,9,158,39,68,49,109,16,24,333,35,344,101,22,159,254,272,30,124,65,28,18,793,154,310,252,327,105,627,419,286,55 }, - { 15,515,700,10,753,33,77,180,4,341,1,197,13,115,365,23,901,317,5,102,11,217,165,117,141,40,2,3,253,21,134,55 }, - { 100,166,382,478,265,264,88,39,98,1,404,274,9,593,724,921,639,438,363,682,411,31,30,812,96,447,821,905,252,0,223,435 }, - { 19,283,436,53,297,26,813,432,27,128,42,25,390,503,122,736,73,123,605,63,389,529,630,250,690,65,381,444,6,269,108,216 }, - { 33,202,13,128,494,0,51,141,269,1,4,2,102,180,15,137,65,95,6,450,77,40,117,59,457,36,196,817,134,86,49,515 }, - { 18,16,60,68,101,167,191,118,35,121,9,212,55,0,126,1,24,647,199,146,520,107,628,621,363,71,21,28,346,92,806,727 }, - { 352,230,217,531,160,93,36,669,748,854,689,258,137,871,728,699,752,251,574,202,373,351,228,120,717,260,144,219,268,82,816,1 }, - { 15,515,700,753,13,23,0,51,8,1,4,82,165,77,110,33,10,180,5,202,11,22,120,12,9,197,115,93,403,141,40,351 }, - { 0,9,101,217,35,88,352,100,39,175,30,68,562,752,13,317,252,115,180,197,64,1,83,141,65,213,165,230,194,36,28,265 }, - { 32,64,5,470,288,90,21,147,0,95,356,22,20,519,835,312,819,18,247,182,11,97,13,4,387,49,43,298,316,48,107,7 }, - { 16,14,24,1,7,2,35,0,5,50,18,69,46,12,58,75,9,67,70,163,21,54,38,48,223,502,281,37,140,60,28,10 }, - { 18,265,9,252,39,195,354,411,1,16,132,101,121,682,167,203,5,363,146,593,35,333,21,271,60,13,100,0,156,327,7,520 }, - { 4,13,1,115,141,23,2,180,5,0,51,3,217,202,77,15,33,269,341,318,317,21,165,515,11,10,197,365,9,137,64,352 }, - { 7,24,16,14,71,35,18,92,140,189,108,149,68,60,69,150,2,230,97,66,0,458,67,1,65,251,38,314,388,267,36,46 }, - { 1,22,31,36,0,2,44,5,141,105,180,170,12,64,13,98,86,55,23,21,28,164,115,127,10,125,128,4,9,239,352,197 }, - { 66,7,97,2,192,20,52,43,135,74,560,107,104,0,750,147,414,29,580,173,324,376,226,194,77,174,204,38,356,64,16,470 }, - { 0,101,9,68,35,124,24,65,39,16,252,100,154,166,28,364,149,694,30,88,55,346,1,419,71,439,265,289,22,21,175,158 }, - { 
39,9,100,30,127,0,737,856,31,836,827,254,98,931,166,88,93,1,44,190,131,228,120,395,625,385,863,264,219,373,110,28 }, - { 5,17,2,3,21,45,14,155,48,32,38,1,328,181,186,46,23,51,12,61,227,29,106,54,99,133,62,832,13,37,514,543 }, - { 131,613,30,224,228,716,274,100,827,406,219,856,39,190,31,88,1,166,9,44,829,863,931,93,0,187,625,924,127,98,137,254 }, - { 352,217,64,141,752,269,180,864,437,372,954,115,498,177,77,349,317,318,579,291,947,197,247,0,23,717,237,304,128,457,776,678 }, - { 141,217,180,317,352,115,15,341,1,23,13,365,515,752,2,64,0,5,498,700,372,165,51,237,753,77,244,197,137,4,21,253 }, - { 5,2,17,14,1,45,3,38,21,29,181,58,46,48,50,133,114,171,61,155,32,6,186,281,361,12,36,54,4,13,52,514 }, - { 6,2,119,25,790,4,45,483,655,113,1,29,76,26,32,19,887,17,128,0,292,833,59,61,106,64,77,814,14,151,84,42 }, - { 13,23,2,0,51,4,1,115,141,3,5,180,33,217,77,9,202,11,7,15,10,6,317,64,21,197,515,165,102,128,22,269 }, - { 13,23,51,202,21,5,1,15,141,165,120,2,115,12,32,0,515,128,318,64,125,700,4,403,197,453,180,457,3,7,10,6 }, - { 9,39,31,30,0,127,1,44,100,131,98,187,385,276,88,442,219,908,254,116,49,166,935,28,201,36,141,827,137,299,284,21 }, - { 22,49,28,109,9,185,105,95,1,131,159,272,36,67,86,254,39,55,35,0,505,31,83,169,208,327,286,98,168,535,312,708 }, - { 5,1,2,21,0,12,48,22,10,14,36,3,32,17,23,54,86,38,4,51,13,37,137,50,65,281,114,45,28,99,58,202 }, - { 141,269,352,82,217,351,180,854,372,922,752,1,15,260,317,318,515,202,64,700,120,349,954,753,77,35,67,717,898,137,365,115 }, - { 144,203,613,418,326,406,96,669,137,679,1,228,494,822,840,317,36,83,855,160,817,859,856,816,217,831,345,93,876,77,44,251 }, - { 15,13,515,23,700,753,51,1,33,0,202,21,2,5,180,141,120,165,217,82,12,117,4,352,269,197,115,32,3,9,134,260 }, - { 13,23,115,1,0,51,4,77,2,33,15,141,10,5,341,180,515,3,217,202,9,365,317,64,700,102,11,165,197,22,753,7 }, - { 531,943,373,160,728,93,206,260,261,559,964,269,717,535,332,384,365,295,110,533,141,10,180,352,244,137,120,55,959,564,36,253 }, - { 
1,31,36,170,22,55,44,10,86,64,127,0,2,98,301,164,740,338,237,143,5,125,116,13,242,141,299,180,23,169,105,12 }, - { 13,23,141,77,51,4,64,32,33,202,115,269,102,128,21,0,177,180,318,90,40,10,7,5,137,15,217,352,9,291,59,22 }, - { 202,120,260,318,77,15,450,269,1,82,33,23,141,13,51,515,351,128,700,5,64,326,137,21,102,110,753,494,93,523,817,165 }, - { 523,899,102,33,730,15,23,403,719,117,153,13,515,51,475,4,700,5,453,817,196,753,494,40,202,120,1,2,450,457,17,421 }, - { 202,403,494,450,120,817,523,475,318,453,33,402,128,77,13,51,260,576,342,102,15,23,515,4,700,5,82,753,326,210,137,615 }, - { 2,29,112,66,7,52,70,151,58,87,135,5,74,226,307,6,14,186,1,45,549,172,644,25,113,287,46,155,334,64,294,97 }, - { 1,77,349,291,260,120,652,102,5,39,64,269,9,33,340,342,13,98,888,698,23,296,100,318,51,202,87,137,638,128,50,850 }, - { 1,13,15,2,0,4,23,515,5,141,180,3,700,341,115,51,753,269,77,901,197,352,217,33,21,11,365,6,165,202,7,317 }, - { 7,14,16,2,46,5,70,58,1,38,24,35,92,163,0,75,21,18,50,54,140,12,87,220,155,69,171,23,60,9,13,307 }, - { 2,29,66,226,135,7,87,74,52,278,6,75,222,220,294,70,97,1,145,25,172,262,324,38,69,112,331,92,5,14,140,26 }, - { 15,515,700,753,33,77,117,4,1,102,134,40,153,11,13,196,217,21,5,51,23,115,32,3,2,202,141,137,128,291,48,177 }, - { 15,217,82,515,351,141,317,1,13,700,260,77,110,120,115,854,23,753,180,51,21,36,137,922,5,64,365,352,291,202,93,341 }, - { 31,190,30,373,120,110,863,88,44,127,908,856,260,318,82,98,93,187,836,717,935,39,442,131,141,254,228,219,1,968,77,116 }, - { 23,13,308,9,165,115,51,21,401,125,49,39,197,391,159,254,217,743,28,438,773,629,558,386,341,95,32,317,876,679,109,166 }, - { 49,5,43,165,7,0,21,104,125,22,173,422,64,13,623,102,20,18,314,95,91,141,23,31,193,51,391,900,779,558,92,232 }, - { 1,4,13,2,15,0,23,515,77,3,341,33,5,700,115,51,202,753,141,180,11,10,102,217,6,901,40,7,197,318,317,365 }, - { 93,843,295,120,36,160,206,261,10,137,567,110,384,141,943,268,201,332,258,55,1,180,64,116,44,144,699,203,282,31,260,373 }, - { 
81,7,192,426,43,173,172,104,879,91,5,712,715,526,6,97,568,95,448,66,33,861,560,32,49,20,0,636,232,825,2,22 }, - { 475,403,51,453,33,102,13,23,494,202,0,196,15,77,153,18,4,117,515,450,318,22,730,128,700,421,65,753,269,402,134,817 }, - { 141,269,260,318,202,120,352,349,82,351,1,5,854,137,64,291,15,922,180,851,32,77,515,372,21,700,7,217,13,947,33,753 }, - { 15,515,700,753,1,13,0,2,4,23,3,5,180,115,197,12,51,165,217,10,961,9,6,141,352,21,8,7,33,77,457,120 }, - { 39,166,9,30,0,101,274,404,252,333,190,100,158,438,310,88,68,265,656,21,1,530,329,344,49,539,625,254,13,131,48,419 }, - { 0,1,28,9,22,12,65,83,67,36,5,2,50,55,96,109,16,13,24,23,21,238,49,18,285,160,128,39,69,114,7,323 }, - { 15,515,700,1,0,753,23,2,13,51,5,180,115,6,3,9,197,12,457,120,7,165,901,82,4,21,8,141,31,33,719,341 }, - { 9,39,30,0,28,166,22,49,180,1,352,35,317,158,88,141,498,131,115,345,752,128,228,217,100,83,219,930,13,251,365,36 }, - { 5,61,45,2,80,29,311,209,6,17,58,1,151,106,454,667,243,70,52,496,287,592,255,738,64,74,483,14,27,32,112,19 }, - { 31,125,22,44,299,456,685,242,599,116,170,28,0,1,492,393,506,144,558,10,268,301,239,23,13,36,963,367,55,206,105,95 }, - { 187,258,926,574,839,93,228,860,406,219,871,160,137,531,224,116,120,902,669,201,36,131,44,144,843,533,318,384,442,1,434,268 }, - { 7,107,75,16,87,9,64,177,24,18,291,77,349,141,60,232,23,0,51,269,132,14,5,21,70,32,678,112,126,121,71,947 }, - { 15,515,700,753,1,0,2,13,5,3,23,180,4,115,901,51,6,8,961,9,7,10,12,82,197,22,141,341,33,120,365,457 }, - { 13,23,51,1,5,202,2,12,15,21,165,141,0,115,3,4,32,515,197,10,180,318,128,120,64,700,6,7,403,269,457,137 }, - { 1,2,0,77,64,3,141,13,33,15,23,10,6,102,5,515,180,4,117,7,700,165,11,217,269,40,753,115,128,17,197,134 }, - { 345,531,332,269,260,317,717,752,373,351,180,352,728,82,10,365,160,533,217,143,498,251,244,93,341,901,36,1,141,898,55,864 }, - { 16,7,33,189,92,77,388,60,140,35,102,24,14,1,230,21,150,117,733,314,18,915,71,13,108,134,5,64,69,2,98,22 }, - { 
142,202,234,178,5,786,77,49,70,0,416,450,639,878,1,48,21,929,147,259,315,455,198,120,12,481,163,113,846,329,318,22 }, - { 81,715,192,0,173,712,681,104,636,91,74,20,750,370,7,718,95,879,22,43,825,560,422,64,207,49,172,18,397,10,426,319 }, - { 13,23,15,51,515,0,700,753,4,1,961,2,10,115,457,11,33,453,3,5,9,40,12,197,165,77,401,475,64,102,22,569 }, - { 64,297,5,445,95,61,250,311,80,34,17,312,45,2,86,472,58,14,180,53,22,151,869,738,247,237,29,1,128,165,21,288 }, - { 16,24,18,71,7,35,118,92,14,154,60,68,0,149,28,302,124,150,55,175,2,9,97,1,429,20,108,273,22,65,43,126 }, - { 195,360,156,771,132,163,626,687,591,371,883,146,121,846,70,586,379,13,293,98,407,48,761,296,354,18,31,1,55,49,21,105 }, - { 202,13,77,23,318,33,51,0,4,141,5,21,217,32,291,102,64,128,15,10,9,494,269,137,515,403,1,31,117,700,120,317 }, - { 7,192,97,81,172,66,426,173,43,715,712,232,861,879,104,330,568,298,74,893,885,526,387,825,92,140,91,14,636,6,5,448 }, - { 2,1,14,6,67,7,65,69,24,36,66,124,108,83,38,29,22,86,0,18,484,5,28,46,12,10,25,302,150,16,650,74 }, - { 33,77,13,202,102,4,0,23,128,51,141,64,318,22,403,269,137,10,15,40,494,117,32,59,11,153,1,21,177,196,515,115 }, - { 269,141,318,77,349,291,217,202,33,15,372,304,515,22,102,177,351,700,352,120,5,137,10,317,260,753,64,851,854,403,49,21 }, - { 6,74,66,85,138,25,87,42,135,26,226,222,280,29,75,500,220,278,792,70,19,2,1,294,204,64,32,145,853,112,52,174 }, - { 9,0,105,39,16,18,1,101,272,31,127,98,24,518,333,252,310,28,68,737,846,371,158,916,938,49,30,7,286,35,301,455 }, - { 2,1,14,67,24,46,83,108,69,29,38,103,114,36,6,133,18,0,28,7,65,52,236,75,50,398,5,309,135,16,278,160 }, - { 22,1,28,105,49,95,0,2,67,55,36,239,168,159,65,35,14,170,320,164,9,7,10,5,114,12,83,64,194,109,24,301 }, - { 7,66,172,97,92,140,232,568,298,14,192,314,16,380,135,324,2,330,74,38,357,448,126,69,35,5,107,6,387,60,204,572 }, - { 2,29,1,14,5,6,46,133,114,50,52,26,218,108,19,13,366,236,27,45,70,17,58,23,86,51,137,65,112,38,25,12 }, - { 
6,1,74,25,2,26,29,66,42,19,75,14,388,67,108,70,52,85,103,65,38,138,357,133,114,594,324,516,603,96,309,69 }, - { 22,9,28,1,36,49,109,105,86,95,131,31,169,39,0,141,272,159,44,55,98,180,13,30,185,115,83,128,352,137,64,208 }, - { 203,822,326,23,77,859,403,494,576,39,473,182,33,1,691,100,18,217,13,817,411,447,363,102,93,966,96,478,291,704,310,120 }, - { 15,515,700,753,33,77,117,4,102,134,115,153,13,1,40,217,11,196,341,2,5,3,23,317,365,0,21,291,32,51,12,569 }, - { 15,515,700,753,13,0,23,1,8,82,51,165,197,120,180,2,9,33,4,110,5,12,10,260,351,386,141,7,457,475,93,901 }, - { 1,23,13,15,51,0,21,2,515,5,141,180,120,165,700,202,197,4,753,12,33,9,7,82,115,93,3,352,260,6,110,48 }, - { 15,515,700,753,0,1,2,5,3,4,8,13,180,341,10,23,7,6,9,51,77,197,961,115,165,82,120,31,22,202,457,217 }, - { 559,661,922,564,141,533,10,317,373,110,143,269,244,260,332,261,93,642,752,295,351,876,531,843,180,206,728,384,352,1,434,120 }, - { 15,515,1,0,700,2,13,23,753,5,51,180,3,165,12,6,197,115,4,9,7,21,719,8,457,82,141,120,33,22,901,10 }, - { 198,234,0,325,5,77,202,416,20,147,32,43,639,315,49,61,450,455,142,21,113,230,22,318,725,342,207,13,95,904,494,10 }, - { 447,264,363,9,411,676,682,1,156,664,821,478,166,354,812,39,100,905,382,897,98,18,759,404,31,101,724,5,265,223,88,13 }, - { 24,14,69,16,35,18,2,7,108,189,71,67,267,149,1,46,68,83,38,140,0,236,251,9,388,60,133,103,65,28,29,50 }, - { 16,7,35,20,14,18,109,2,43,120,107,60,1,121,326,907,553,77,13,147,23,82,68,260,0,403,5,24,202,126,265,199 }, - { 30,131,187,276,31,44,613,442,39,9,190,228,1,839,116,935,908,219,127,88,244,224,110,137,93,201,98,141,36,567,0,856 }, - { 98,223,1,393,812,265,100,421,593,834,697,48,51,410,791,382,21,88,31,284,9,125,96,293,230,23,213,217,656,689,541,5 }, - { 98,51,127,219,616,258,105,293,395,421,924,512,31,308,23,201,116,44,301,272,763,276,125,13,453,170,401,295,261,944,115,567 }, - { 253,110,951,352,811,206,332,180,141,244,282,10,854,417,642,638,559,752,143,911,260,55,93,533,499,498,661,120,351,959,564,341 }, - { 
49,9,159,254,272,158,0,131,28,39,627,105,327,286,22,518,688,578,68,347,374,101,224,424,95,35,219,24,16,364,65,344 }, - { 105,22,131,272,98,286,327,109,374,239,28,95,320,219,9,224,55,127,187,36,578,169,64,185,538,1,159,10,371,634,49,616 }, - { 691,478,340,1,658,914,724,363,744,698,156,772,411,296,682,447,9,284,335,98,264,303,909,21,354,410,225,13,664,686,88,919 }, - { 16,24,35,18,71,7,140,108,189,267,92,60,14,230,68,69,9,1,149,46,246,191,388,167,2,0,118,236,133,21,674,5 }, - { 9,0,127,31,98,371,395,39,737,49,1,44,385,272,512,28,293,242,836,761,254,299,101,16,187,22,116,158,159,131,18,21 }, - { 16,18,68,0,60,35,9,101,252,28,118,24,419,55,7,109,604,71,39,121,22,364,14,158,191,167,925,126,329,21,92,49 }, - { 116,268,203,93,206,692,551,31,417,940,499,8,473,44,202,523,959,0,120,137,559,22,450,403,576,10,728,299,13,326,51,1 }, - { 225,459,744,1,919,914,691,330,622,21,141,223,5,284,934,335,88,538,340,82,385,839,363,120,478,98,48,30,64,32,686,166 }, - { 5,1,2,0,14,36,21,281,12,48,50,67,22,28,54,83,24,218,38,10,181,9,32,18,65,58,45,114,430,17,99,37 }, - { 137,450,202,704,120,260,326,318,968,269,851,403,291,77,23,141,182,310,494,373,351,457,82,890,349,110,60,128,817,678,105,96 }, - { 15,515,700,753,4,33,13,23,77,5,40,11,102,93,1,21,110,51,82,117,141,2,10,8,32,64,120,31,202,3,217,115 }, - { 15,13,1,23,515,0,51,2,700,5,753,21,180,141,165,3,12,115,197,4,7,6,457,9,352,202,33,8,719,120,77,341 }, - { 219,127,258,98,276,201,131,395,944,293,116,284,567,31,242,105,137,935,295,44,403,860,51,224,576,456,9,371,578,475,202,512 }, - { 16,7,18,35,60,0,14,20,118,28,68,22,2,24,1,92,158,107,5,49,154,126,109,12,43,10,55,6,677,71,21,168 }, - { 93,728,531,160,559,373,574,120,295,860,533,269,717,260,926,902,258,318,36,201,261,434,851,137,617,141,187,352,843,384,332,251 }, - { 28,0,1,9,22,109,83,39,49,12,36,67,55,5,96,2,128,30,158,69,21,23,160,208,35,13,65,323,50,141,194,238 }, - { 23,13,51,1,5,15,141,21,0,2,165,515,202,700,12,197,180,120,32,115,4,753,64,9,7,269,6,3,125,386,48,453 }, - { 
5,21,13,49,14,20,7,23,43,32,1,0,652,48,713,22,38,2,16,132,955,107,12,279,24,888,197,640,70,303,18,638 }, - { 9,0,28,39,1,30,35,101,22,67,83,141,49,175,36,68,55,88,13,251,10,69,23,158,180,115,64,100,217,65,345,166 }, - { 260,120,82,269,5,450,351,1,202,141,854,13,77,922,32,33,137,4,23,125,291,21,15,515,165,349,177,700,318,326,180,753 }, - { 121,16,18,35,363,101,60,20,107,14,68,259,621,55,604,43,7,252,9,364,126,0,167,191,5,407,132,28,199,419,146,10 }, - { 13,23,51,1,15,0,2,141,5,515,12,21,700,115,165,180,4,753,3,197,202,32,9,120,7,8,6,11,37,10,457,269 }, - { 201,144,206,443,418,203,435,96,335,459,187,1,88,332,330,321,269,934,30,372,822,521,268,326,44,523,382,141,410,264,494,473 }, - { 31,44,276,201,116,131,284,662,567,144,9,489,98,295,268,434,0,30,137,39,93,1,187,22,219,918,110,299,141,36,224,384 }, - { 520,478,664,1,264,604,9,167,777,759,411,0,806,724,48,21,101,68,647,936,363,223,118,682,410,18,100,16,252,98,265,13 }, - { 23,70,21,87,60,75,120,182,163,379,92,18,7,937,71,121,446,132,24,98,931,126,107,77,795,195,115,44,411,146,51,850 }, - { 39,9,0,737,127,31,846,98,1,827,105,310,371,30,254,100,44,18,395,242,272,101,385,916,836,16,265,131,938,93,166,557 }, - { 279,20,43,126,107,7,92,16,356,362,60,595,246,359,598,35,0,683,939,653,121,97,125,441,399,392,150,199,48,230,14,649 }, - { 206,417,93,959,499,728,8,559,120,473,137,141,10,564,31,260,44,450,203,341,253,244,373,116,143,638,268,180,352,110,318,940 }, - { 259,465,147,132,590,687,534,199,581,146,941,427,107,640,279,178,121,5,195,150,522,955,198,35,786,929,798,142,1,21,325,626 }, - { 523,15,120,450,202,515,403,51,817,700,13,753,23,457,33,899,128,64,730,102,494,342,115,719,453,196,49,99,318,421,308,5 }, - { 141,559,10,244,365,564,661,180,253,143,752,110,55,317,533,341,901,93,373,206,535,160,82,922,260,36,531,964,352,332,261,197 }, - { 219,258,98,127,276,964,943,137,843,535,201,935,131,860,261,295,284,567,206,44,116,31,253,492,203,332,160,615,36,93,55,692 }, - { 
0,319,422,207,945,693,577,887,32,804,95,344,104,904,61,20,5,43,7,725,113,510,306,102,49,263,153,426,33,83,22,9 }, - { 1,22,0,5,12,2,36,21,28,86,49,105,9,10,23,13,141,95,31,55,128,37,51,4,83,202,3,64,96,7,32,44 }, - { 15,515,13,700,1,0,753,2,23,3,4,5,51,10,115,197,6,33,12,9,165,7,8,77,11,961,180,269,141,22,120,457 }, - { 15,1,23,120,77,13,515,51,141,202,700,180,110,137,260,753,326,5,128,102,0,21,2,165,269,33,197,450,318,217,93,115 }, - { 15,515,1,700,0,753,13,2,23,180,51,5,120,4,9,115,197,12,7,165,21,33,6,82,3,8,523,901,31,141,457,260 }, - { 16,18,24,7,92,35,60,75,9,13,71,14,0,108,50,21,126,121,1,140,23,5,132,146,2,12,128,10,64,141,70,87 }, - { 180,341,901,15,515,1,365,700,0,2,197,753,115,4,10,13,752,5,3,8,165,317,141,23,143,873,44,31,569,55,93,6 }, - { 9,0,175,35,101,28,39,67,68,1,65,83,30,69,364,336,22,114,55,124,194,158,100,289,252,166,64,345,103,36,50,88 }, - { 64,165,180,197,115,247,217,237,21,13,32,316,22,141,352,72,288,304,95,225,76,391,386,16,468,90,49,35,365,640,372,23 }, - { 15,515,700,13,753,1,0,2,23,4,5,3,115,51,141,197,12,10,180,961,7,9,21,33,217,6,8,165,457,11,77,341 }, - { 132,121,199,146,60,279,493,640,407,598,126,195,534,581,955,590,107,5,150,35,522,49,259,16,18,360,156,0,147,362,21,167 }, - { 0,1,28,9,22,5,36,12,65,24,67,96,2,83,18,50,114,55,21,16,7,10,23,14,13,160,137,51,48,218,103,69 }, - { 93,120,957,77,30,968,459,110,137,160,613,102,202,352,373,141,31,372,217,330,190,318,269,260,203,44,28,473,228,177,863,704 }, - { 15,515,700,0,753,1,13,23,2,51,5,9,120,82,4,7,901,197,10,8,260,180,341,12,33,6,3,523,165,102,115,141 }, - { 206,417,8,141,499,44,244,93,31,10,137,253,559,116,728,144,120,564,269,638,203,352,143,260,341,752,268,717,951,180,160,110 }, - { 530,254,228,1,96,21,406,39,827,31,669,840,613,829,137,679,166,98,23,51,960,438,131,93,48,224,219,317,310,36,876,190 }, - { 15,515,700,753,13,457,0,197,719,1,165,82,23,8,120,730,2,10,12,180,134,5,9,141,260,4,351,51,115,3,341,899 }, - { 
0,16,68,9,24,28,18,35,252,109,39,419,124,158,154,55,101,71,22,118,60,7,49,65,333,14,1,10,329,364,677,346 }, - { 1,15,13,23,515,51,120,0,700,180,2,165,5,753,141,197,21,33,202,102,260,4,9,12,7,326,137,450,115,6,82,110 }, - { 535,253,352,564,110,365,82,180,341,10,854,533,55,898,244,901,873,141,752,143,642,559,498,317,36,951,115,964,638,282,661,197 }, - { 31,44,125,338,116,64,242,36,1,10,55,22,456,237,180,13,299,164,506,86,23,165,558,143,0,762,492,479,844,546,93,8 }, - { 13,23,4,1,202,2,0,51,115,77,141,180,5,15,217,3,33,11,515,317,9,10,102,21,700,341,365,318,269,64,32,128 }, - { 9,39,0,166,68,101,28,364,30,158,562,35,175,65,333,154,49,404,706,124,21,252,274,168,190,289,100,570,16,1,310,346 }, - { 15,515,700,753,341,13,0,23,1,33,141,4,260,82,77,51,351,180,9,5,115,137,10,217,11,120,102,40,349,269,202,854 } -#else - #include "rgbcx_table4.h" -#endif - }; - - static uint8_t g_best_total_orderings3[NUM_UNIQUE_TOTAL_ORDERINGS3][32] = - { - { 12,1,3,5,27,2,4,38,8,7,16,18,6,10,41,79,40,23,46,9,20,88,22,37,14,19,24,126,99,119,35,11 }, - { 7,64,116,14,94,30,8,42,1,108,47,55,137,10,134,95,96,115,69,32,63,29,90,113,11,148,16,103,19,9,34,25 }, - { 12,1,0,5,3,7,4,27,8,6,38,40,41,16,18,46,9,10,20,23,79,62,14,22,88,99,37,126,92,19,120,11 }, - { 16,88,27,18,46,48,126,107,79,19,59,38,37,65,23,66,0,2,3,43,12,151,28,25,5,87,72,40,1,20,52,92 }, - { 79,48,88,16,27,65,18,38,46,19,37,4,72,33,126,41,52,0,12,92,5,1,2,107,3,77,23,91,43,51,22,74 }, - { 1,8,41,122,10,22,2,0,87,24,37,120,38,7,39,4,5,3,9,92,62,59,23,16,104,11,27,79,19,26,25,32 }, - { 2,76,99,28,40,86,93,21,138,60,6,0,17,128,145,119,98,144,141,82,147,54,67,75,5,12,27,132,146,1,38,14 }, - { 47,7,64,90,1,118,116,85,57,14,30,94,50,45,137,134,8,42,69,139,55,68,58,108,95,29,10,115,0,32,2,11 }, - { 49,8,10,30,124,11,32,113,130,58,125,9,100,53,104,115,131,103,24,7,1,39,45,36,139,0,137,22,90,44,114,105 }, - { 9,38,72,125,49,41,84,11,13,5,27,0,16,92,8,2,65,105,10,18,48,29,127,131,36,14,1,46,111,79,130,12 }, - { 
130,8,10,100,104,131,49,32,53,39,30,36,113,24,11,22,124,44,83,58,7,103,1,4,9,125,5,0,91,33,115,74 }, - { 114,11,58,8,120,49,9,124,142,111,41,30,10,0,97,130,62,84,38,5,72,125,92,127,100,27,139,113,13,132,32,1 }, - { 60,46,28,27,40,20,0,17,18,2,126,16,6,38,86,23,79,54,1,93,5,88,41,14,21,111,7,48,3,84,72,62 }, - { 72,92,38,65,84,48,41,79,27,16,29,111,88,5,18,46,1,0,152,14,37,19,77,42,132,7,22,13,119,56,12,2 }, - { 7,55,1,95,29,56,64,116,143,8,14,30,47,94,152,90,65,67,10,133,42,72,146,84,16,48,6,0,25,108,77,21 }, - { 27,23,20,5,0,79,38,2,3,1,59,46,4,41,33,86,37,87,88,92,7,126,43,8,22,152,151,150,149,148,147,146 }, - { 12,0,1,2,7,6,3,5,28,4,8,14,60,40,17,19,21,86,126,93,10,18,9,29,48,99,65,25,84,119,72,41 }, - { 60,40,99,2,54,12,0,1,19,28,98,93,6,138,21,5,27,17,151,14,76,46,16,18,38,29,86,144,107,7,25,41 }, - { 12,0,1,2,3,5,6,7,4,28,8,60,14,40,16,17,21,10,19,9,86,38,126,41,93,27,29,48,62,84,79,99 }, - { 0,1,2,10,5,8,3,25,4,29,32,34,63,7,77,26,16,48,65,56,14,22,129,103,72,24,18,152,140,53,96,42 }, - { 46,126,18,54,12,16,1,0,5,2,27,98,20,23,6,3,88,48,28,7,19,8,4,60,151,38,37,21,79,14,65,40 }, - { 76,6,141,86,119,2,138,67,28,145,0,93,17,1,40,60,146,99,147,14,21,144,132,7,5,29,55,27,16,75,19,12 }, - { 71,5,51,39,22,80,0,43,10,122,8,62,41,24,104,87,35,37,2,91,33,120,36,38,1,131,9,100,130,66,3,4 }, - { 126,18,46,27,20,16,88,23,12,79,54,59,48,0,73,1,37,151,5,19,28,38,2,66,60,3,65,98,14,26,6,43 }, - { 22,10,8,5,0,71,35,80,104,39,24,51,100,1,62,32,2,130,11,41,7,9,53,43,49,83,122,120,30,44,37,38 }, - { 1,34,14,129,53,63,42,26,121,148,7,44,96,10,0,24,100,32,64,116,140,22,5,19,29,103,135,108,8,61,39,83 }, - { 1,7,34,63,44,25,135,14,24,108,22,0,83,94,5,129,35,101,47,121,2,19,42,53,6,110,103,8,148,10,16,123 }, - { 12,28,16,60,18,1,6,21,14,0,86,19,2,48,93,17,38,29,7,5,65,126,46,72,41,79,84,119,40,56,54,88 }, - { 0,2,12,27,5,46,38,40,41,79,88,99,3,23,1,62,20,4,22,37,92,35,18,8,16,24,10,60,7,120,98,54 }, - { 
1,7,14,56,8,0,84,67,10,2,133,72,42,111,5,30,21,4,9,3,25,94,16,116,47,11,65,18,132,90,55,64 }, - { 30,8,124,139,45,11,58,90,113,137,7,115,10,32,1,49,94,85,9,47,108,103,0,97,63,14,50,114,53,106,100,25 }, - { 65,38,48,27,16,79,72,18,88,19,46,77,84,92,37,41,0,29,1,14,12,111,2,5,31,36,87,74,105,40,28,51 }, - { 10,8,30,113,130,100,53,32,115,103,104,7,1,121,39,49,131,44,24,36,63,137,34,45,22,90,108,83,26,11,94,139 }, - { 51,52,43,33,5,74,16,37,71,91,38,3,36,87,48,22,4,0,122,41,39,18,66,27,79,24,65,88,59,23,62,92 }, - { 1,7,63,53,108,121,94,44,103,100,14,10,129,47,32,26,24,25,148,42,135,22,0,61,83,8,39,104,5,64,115,34 }, - { 1,8,10,7,5,0,80,32,62,2,24,44,53,83,9,41,30,22,100,11,14,25,120,4,26,6,3,16,122,34,19,35 }, - { 74,4,36,48,33,91,39,79,22,16,65,5,131,38,24,71,27,52,0,105,51,18,88,104,3,31,10,37,72,19,41,130 }, - { 59,43,38,79,23,27,92,51,0,16,46,5,18,88,41,37,66,3,87,20,48,2,122,4,22,12,1,126,19,65,33,24 }, - { 12,28,1,27,0,16,2,46,65,60,21,3,5,18,6,19,48,14,4,7,79,88,86,29,22,72,93,40,23,8,17,41 }, - { 22,91,39,33,24,71,5,131,36,10,51,0,130,8,104,2,35,125,9,43,52,49,83,80,100,41,122,3,37,38,4,16 }, - { 12,0,1,2,5,3,4,8,7,27,18,38,10,6,16,46,9,20,41,23,126,79,22,14,19,99,88,54,37,48,62,35 }, - { 12,27,1,2,3,0,46,4,38,16,8,28,7,79,18,5,84,6,88,10,14,21,23,20,40,22,60,19,9,29,72,65 }, - { 1,14,7,55,95,29,8,94,30,56,10,108,77,116,152,64,32,48,63,42,143,148,16,25,137,65,11,0,115,9,19,72 }, - { 37,79,66,38,16,52,48,59,43,27,87,33,41,4,23,51,3,5,88,18,92,46,73,122,22,71,20,0,65,19,2,120 }, - { 24,32,83,22,53,1,8,10,7,30,35,5,103,0,100,101,121,113,34,123,63,2,44,25,71,115,80,14,26,108,51,39 }, - { 97,45,111,58,85,139,0,90,47,7,120,106,142,30,50,132,41,62,84,1,119,114,14,56,117,8,38,29,2,64,116,5 }, - { 12,28,16,18,1,60,6,14,2,21,0,86,126,19,48,93,7,27,17,29,5,65,54,38,72,79,84,88,119,145,8,111 }, - { 118,47,64,116,57,85,7,14,50,1,42,0,45,68,86,69,2,111,134,28,90,55,16,29,56,48,84,144,60,30,112,41 }, - { 
12,1,2,0,7,6,28,5,3,4,8,14,60,21,18,40,17,86,10,9,16,29,19,93,126,79,38,84,72,27,111,119 }, - { 11,8,49,130,10,125,9,124,100,114,131,30,58,104,32,39,24,113,36,105,0,41,22,120,5,53,111,38,142,44,83,35 }, - { 50,70,47,118,85,57,106,0,45,7,64,90,81,14,2,134,28,62,86,55,69,1,78,119,68,56,18,67,16,60,29,21 }, - { 43,37,33,87,51,41,66,5,122,38,22,59,92,0,23,91,27,16,71,79,18,52,120,4,3,24,46,20,73,39,62,36 }, - { 79,48,4,16,27,88,43,33,18,38,65,37,46,3,19,51,52,22,66,87,74,5,41,91,23,59,0,71,122,72,20,92 }, - { 32,100,10,8,30,104,24,44,39,113,83,103,1,7,22,53,115,63,135,121,26,35,34,5,0,108,137,90,91,45,2,130 }, - { 0,1,2,5,16,12,6,7,14,3,19,18,29,20,4,21,40,8,17,35,23,48,126,22,25,56,26,10,98,27,38,65 }, - { 143,67,56,146,1,7,133,55,64,141,134,69,6,47,14,29,84,21,111,147,57,16,95,72,118,132,50,0,2,18,119,42 }, - { 1,7,67,14,133,111,8,84,0,21,2,47,64,132,55,10,95,147,119,42,16,5,72,56,4,3,6,29,9,25,18,30 }, - { 68,57,69,112,144,86,102,2,134,55,0,70,118,64,75,47,14,28,93,143,67,7,50,149,1,21,29,56,119,95,60,78 }, - { 58,97,114,30,124,45,11,139,8,90,0,142,7,10,41,113,84,62,49,111,85,1,9,5,137,120,32,14,2,117,47,38 }, - { 23,66,18,79,38,20,43,27,16,88,46,59,126,37,87,12,73,92,3,5,48,0,19,54,2,51,28,1,41,65,122,22 }, - { 0,12,2,27,5,40,46,38,1,41,3,79,88,23,99,4,20,62,22,54,92,18,8,37,16,35,10,7,19,120,144,24 }, - { 1,14,25,26,0,7,44,34,129,42,24,5,135,22,19,148,6,96,83,2,29,16,63,35,101,64,140,136,116,110,3,10 }, - { 12,1,2,27,3,4,38,5,7,8,18,16,46,6,0,40,41,10,79,23,88,9,20,22,14,19,37,92,48,126,28,21 }, - { 7,1,10,32,108,103,94,47,8,53,25,14,34,115,100,129,121,130,148,42,64,116,63,26,44,0,24,30,113,4,104,22 }, - { 47,134,7,14,55,69,64,95,1,29,85,118,56,116,45,57,102,143,50,90,42,30,16,94,0,8,67,75,133,2,18,48 }, - { 12,1,2,0,7,6,28,8,14,5,3,4,40,21,17,18,60,86,16,93,126,10,9,29,99,38,119,25,19,54,27,84 }, - { 59,16,27,18,23,88,79,37,46,66,38,20,73,126,3,43,48,87,92,51,41,12,19,5,52,107,65,0,151,122,54,2 }, - { 
1,21,147,7,119,14,76,132,55,0,86,145,2,6,69,67,16,143,111,138,17,28,29,60,18,93,8,19,40,56,84,5 }, - { 144,86,112,2,68,102,69,0,149,93,75,28,57,55,145,60,21,67,99,134,143,40,146,119,82,110,62,6,29,26,78,14 }, - { 102,57,55,69,143,75,146,67,56,68,134,2,29,141,0,21,6,14,133,118,64,1,7,95,47,84,111,28,147,82,72,119 }, - { 0,70,57,119,50,145,2,86,28,118,69,78,149,47,60,68,67,55,93,81,134,21,14,62,64,7,5,1,132,85,41,16 }, - { 51,5,43,71,122,87,41,37,91,39,0,22,33,36,38,24,66,120,62,2,80,16,92,10,59,4,27,23,35,79,8,3 }, - { 12,1,2,0,7,6,28,5,8,14,3,21,40,4,60,17,86,18,16,93,10,9,126,119,99,29,19,41,38,27,25,92 }, - { 27,18,46,126,23,16,88,79,20,151,59,73,48,38,0,54,12,2,37,1,19,5,28,60,66,41,3,109,86,65,40,6 }, - { 48,79,4,33,16,74,65,38,88,27,91,52,18,36,22,19,46,0,37,3,51,5,71,39,72,43,24,41,92,87,2,10 }, - { 86,2,144,93,28,112,141,6,102,21,99,60,75,0,68,82,69,146,67,149,55,40,145,76,111,147,56,119,110,143,26,132 }, - { 6,138,2,99,86,17,40,93,28,21,145,141,0,60,119,147,128,76,67,54,1,12,5,27,144,14,38,98,146,41,29,19 }, - { 1,8,0,10,2,29,7,5,3,56,4,25,14,152,63,32,65,72,96,42,34,108,48,9,26,16,84,103,67,148,22,129 }, - { 149,145,0,86,2,28,93,144,62,60,119,101,21,41,5,35,78,99,26,40,12,68,57,67,110,120,69,18,55,76,132,70 }, - { 12,28,16,1,48,19,6,60,2,14,18,21,0,27,46,65,86,29,5,7,72,93,40,3,17,84,56,88,126,4,38,8 }, - { 1,8,5,10,7,24,2,62,0,41,22,122,120,9,4,3,32,87,11,37,38,83,100,44,25,104,16,26,39,80,14,6 }, - { 0,119,62,86,145,149,28,132,93,2,120,67,60,41,35,5,144,21,123,38,111,81,84,56,12,44,24,50,92,55,40,22 }, - { 2,93,99,28,40,144,60,0,86,150,76,21,149,98,6,25,1,61,82,26,12,5,54,141,7,18,145,16,27,138,110,38 }, - { 24,8,10,22,32,35,100,5,1,53,0,7,71,80,30,123,83,104,51,11,2,39,44,113,9,62,25,103,34,101,43,41 }, - { 12,1,2,0,7,6,28,5,40,60,8,16,3,18,14,4,86,21,17,93,41,10,9,99,27,119,38,19,126,22,48,145 }, - { 45,47,50,7,85,90,97,1,64,139,116,118,30,58,14,106,70,111,0,57,94,42,137,142,29,120,8,56,18,134,84,41 }, - { 
12,0,2,5,27,38,1,46,41,40,79,144,3,22,88,23,28,60,99,62,6,24,26,7,4,16,10,35,37,18,14,20 }, - { 37,38,59,92,0,5,23,51,79,41,27,22,2,3,87,16,46,4,1,43,20,33,18,88,24,71,8,10,48,19,126,122 }, - { 12,28,16,60,1,18,6,21,19,14,48,0,2,86,93,5,46,29,17,27,65,7,3,72,38,126,119,40,84,37,56,4 }, - { 0,2,5,1,16,6,27,28,18,38,60,7,14,21,46,40,86,41,19,48,93,8,3,79,22,4,10,37,62,23,24,111 }, - { 85,7,90,30,47,139,45,50,94,58,137,1,8,64,14,116,118,115,113,11,124,108,0,10,97,57,32,70,42,106,29,114 }, - { 33,36,22,71,51,5,91,39,0,52,43,24,131,74,16,37,38,122,41,3,87,48,4,104,35,80,10,2,105,62,27,18 }, - { 12,1,27,2,0,16,3,28,46,18,4,6,5,72,21,79,38,7,14,60,88,8,65,19,48,29,23,40,22,20,86,126 }, - { 0,12,2,27,5,38,46,41,1,40,79,3,88,23,22,99,20,37,62,4,18,6,16,35,60,28,24,7,92,8,14,10 }, - { 7,47,1,30,137,8,116,94,90,64,14,115,108,118,57,10,148,113,42,85,32,11,63,50,103,45,124,134,55,9,69,34 }, - { 55,7,1,29,56,143,64,47,67,133,14,146,95,72,84,8,116,111,6,134,141,21,65,0,69,30,16,45,85,42,50,10 }, - { 14,1,42,8,10,29,108,63,55,148,95,32,7,19,25,115,103,34,56,129,77,0,16,152,94,30,113,26,2,5,48,4 }, - { 111,120,142,97,58,0,41,45,62,132,114,84,139,30,5,8,38,2,7,85,119,90,117,1,124,11,56,47,28,27,35,72 }, - { 1,0,14,2,6,5,16,19,7,29,42,18,3,25,12,35,21,8,26,17,40,4,20,48,109,99,22,96,55,101,10,61 }, - { 12,0,1,5,3,2,4,7,27,8,38,6,40,18,16,10,20,46,9,41,23,22,79,14,62,19,37,126,88,11,92,48 }, - { 10,8,104,39,24,32,22,83,44,100,30,130,53,91,113,5,11,1,35,33,7,49,0,2,103,71,36,124,9,80,131,34 }, - { 1,7,0,14,8,34,5,25,35,26,6,63,10,123,2,16,103,19,44,32,135,121,108,80,62,30,115,94,149,144,53,18 }, - { 75,68,146,141,102,67,2,21,6,57,69,143,0,55,82,86,28,144,147,29,93,112,56,119,133,14,76,60,84,134,111,145 }, - { 10,32,115,7,8,53,1,108,30,113,94,137,100,63,90,34,130,103,121,47,44,25,104,39,24,26,85,14,49,36,22,131 }, - { 39,24,10,22,8,130,91,104,83,49,5,33,100,11,0,35,32,131,71,36,9,44,53,2,80,51,30,1,41,7,43,62 }, - { 
38,36,65,105,27,72,31,79,41,131,5,48,125,39,0,16,92,46,22,13,18,84,24,37,88,2,33,74,91,71,130,49 }, - { 0,106,62,50,45,119,85,81,132,28,2,86,41,47,38,60,35,117,5,29,7,30,145,90,55,70,14,111,18,67,93,56 }, - { 0,2,5,1,3,25,19,26,4,34,29,10,22,16,8,7,24,14,48,65,53,18,6,77,44,56,72,61,121,21,136,40 }, - { 7,1,94,8,47,115,10,32,113,103,30,108,137,63,14,64,116,148,129,42,90,25,34,118,53,57,11,49,85,9,96,50 }, - { 14,0,1,26,19,5,42,2,25,24,29,22,6,44,61,16,7,96,136,3,140,34,35,55,135,18,48,77,83,4,8,10 }, - { 1,7,14,0,25,6,34,5,26,16,63,2,19,8,35,101,108,29,94,10,18,42,123,144,129,47,61,21,3,62,149,4 }, - { 12,0,2,1,28,5,6,120,7,60,40,16,18,86,27,14,21,93,8,62,41,38,3,17,4,119,99,48,19,126,10,9 }, - { 86,144,93,2,28,149,0,60,99,112,110,145,40,21,102,26,75,62,69,1,12,101,119,25,76,67,7,68,55,5,6,14 }, - { 8,30,10,32,113,49,115,137,124,103,45,90,7,139,11,1,58,53,130,94,108,100,9,63,85,125,34,47,0,24,44,104 }, - { 120,142,111,41,58,114,97,0,11,62,84,124,5,30,8,38,132,127,27,139,92,10,72,45,49,9,28,2,29,56,16,1 }, - { 8,113,30,137,7,32,10,90,94,115,1,103,108,63,47,85,49,53,11,45,34,50,14,25,9,124,100,130,139,121,42,26 }, - { 64,7,14,47,134,55,1,42,95,69,116,90,94,30,8,29,56,137,45,108,85,10,57,16,102,143,118,19,63,32,11,50 }, - { 62,132,0,119,120,41,111,86,35,28,5,84,56,38,2,93,145,60,67,12,92,27,29,72,55,117,21,24,133,149,22,45 }, - { 57,68,69,118,134,64,50,47,55,14,7,2,102,144,0,112,70,86,85,1,95,29,116,143,42,75,16,56,28,45,21,48 }, - { 0,12,2,1,5,28,6,40,60,27,7,38,16,14,86,18,93,41,62,46,99,35,8,23,3,17,22,21,10,19,79,20 }, - { 12,1,2,27,16,3,38,111,4,0,18,5,7,46,40,8,79,6,14,28,88,10,48,41,19,84,21,9,22,23,20,72 }, - { 53,103,32,7,1,100,22,63,71,44,10,115,108,24,92,104,26,30,122,94,8,39,83,34,137,135,90,91,121,5,87,47 }, - { 87,37,41,0,22,38,2,92,1,24,4,8,3,59,10,5,39,23,71,79,122,27,16,46,33,7,91,20,18,51,9,120 }, - { 1,7,8,10,0,5,35,32,53,44,14,30,2,80,25,34,6,62,26,103,16,19,63,9,149,24,121,41,22,11,113,83 }, - { 
11,58,8,30,124,49,10,113,9,114,139,45,97,32,7,137,90,1,0,130,115,125,100,24,5,94,53,41,14,13,35,38 }, - { 125,105,9,36,131,49,8,130,39,11,10,5,22,38,41,104,0,31,13,24,27,16,2,72,65,91,48,32,84,18,100,74 }, - { 12,1,0,2,6,3,7,5,4,8,14,28,16,60,18,10,21,17,19,9,40,27,86,93,29,38,54,11,25,48,46,41 }, - { 84,41,38,72,92,29,111,5,65,120,79,0,27,56,48,14,132,16,119,22,86,88,46,28,62,12,1,2,93,18,24,127 }, - { 99,28,40,60,2,93,138,0,98,17,86,54,76,12,27,1,21,144,128,38,5,14,46,18,25,16,109,6,41,145,7,29 }, - { 1,63,10,32,148,14,103,34,42,7,8,108,116,53,64,96,25,121,26,94,140,0,29,19,55,24,100,136,5,4,44,115 }, - { 131,100,130,49,10,8,36,104,39,0,48,41,11,38,4,24,27,22,16,44,79,5,33,2,53,9,125,74,91,120,32,83 }, - { 36,39,131,74,4,91,22,33,125,104,130,48,10,24,16,5,49,8,100,105,79,0,9,65,71,2,18,83,31,11,19,44 }, - { 0,12,2,1,6,5,7,28,40,60,16,14,18,62,86,27,93,8,17,38,21,41,35,99,3,19,10,23,22,4,9,48 }, - { 1,7,67,14,21,147,111,55,132,119,0,8,2,76,64,16,47,84,6,18,86,95,145,10,42,29,133,5,56,134,17,72 }, - { 69,55,47,134,102,143,7,57,118,95,14,64,29,56,1,50,75,67,146,2,0,133,68,16,21,6,141,85,116,18,72,65 }, - { 1,44,7,24,83,63,34,103,22,121,53,32,25,35,0,115,108,5,14,8,10,101,94,30,2,123,110,26,137,47,90,19 }, - { 14,1,25,42,34,0,26,96,19,29,140,5,53,10,2,121,3,24,44,22,55,77,129,7,63,16,8,4,6,61,100,48 }, - { 30,90,7,8,137,94,85,1,47,113,115,108,45,139,124,11,10,32,50,58,103,14,63,64,9,116,49,42,25,148,0,53 }, - { 40,99,2,60,28,17,0,54,93,98,86,138,6,12,21,76,1,5,27,144,128,38,19,46,14,41,145,7,16,67,3,109 }, - { 45,58,30,139,90,7,85,137,97,8,124,47,1,11,106,114,50,94,0,113,10,115,14,32,9,64,108,41,49,29,62,116 }, - { 14,42,10,1,63,96,32,25,34,8,129,29,0,103,55,19,26,53,77,5,95,2,4,7,3,16,148,56,18,24,121,108 }, - { 21,2,75,86,6,76,144,28,119,99,93,147,141,67,102,145,60,132,146,128,0,82,40,138,55,111,143,17,133,112,69,14 }, - { 111,120,41,62,84,132,0,5,38,119,56,92,72,142,27,28,29,35,58,80,2,86,65,79,12,14,1,24,145,16,21,48 }, - { 
146,67,141,69,133,21,6,143,57,55,111,147,56,1,14,132,7,2,134,102,0,119,29,84,76,64,86,72,28,68,47,75 }, - { 12,1,0,5,27,3,7,4,38,8,6,41,16,40,46,10,18,79,2,9,23,86,20,22,62,14,37,88,92,19,24,11 }, - { 0,12,2,1,27,5,38,28,60,6,40,7,16,46,18,14,41,99,93,62,3,79,86,23,149,8,22,35,88,17,19,10 }, - { 141,6,21,67,147,102,146,2,76,119,132,69,55,111,86,75,28,133,143,0,1,145,14,128,56,99,17,60,29,93,84,68 }, - { 21,76,1,119,86,145,2,0,14,7,6,138,146,55,17,28,132,93,67,40,60,143,29,147,111,16,69,141,5,56,19,133 }, - { 1,8,108,14,7,116,64,42,10,63,94,32,115,103,113,96,30,34,55,47,95,148,29,140,129,25,134,53,69,26,19,11 }, - { 12,1,3,5,4,2,0,7,8,38,27,16,18,6,10,20,41,40,79,46,9,23,22,88,92,37,14,24,62,19,48,99 }, - { 1,14,7,0,6,25,5,16,19,2,42,26,29,35,61,8,18,129,101,21,3,110,34,148,96,10,17,4,22,40,12,20 }, - { 0,2,5,1,3,19,22,26,16,24,29,7,14,6,4,25,18,44,8,48,12,61,20,21,10,35,65,56,23,40,17,107 }, - { 1,7,8,29,56,0,10,14,2,42,72,5,4,65,3,30,84,94,67,9,25,133,111,11,32,108,16,63,21,96,26,48 } - }; - - static inline uint32_t iabs(int32_t i) { return (i < 0) ? static_cast(-i) : static_cast(i); } - static inline uint64_t iabs(int64_t i) { return (i < 0) ? static_cast(-i) : static_cast(i); } - - static inline uint8_t to_5(uint32_t v) { v = v * 31 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } - static inline uint8_t to_6(uint32_t v) { v = v * 63 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } - - template inline S maximum(S a, S b) { return (a > b) ? a : b; } - template inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); } - template inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); } - - template inline S minimum(S a, S b) { return (a < b) ? 
a : b; } - template inline S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); } - template inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); } - - template inline T square(T a) { return a * a; } - - static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } - static inline uint8_t clamp255(int32_t i) { return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); } - - template inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); } - static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } - - static inline int squarei(int a) { return a * a; } - static inline int absi(int a) { return (a < 0) ? -a : a; } - - template inline F lerp(F a, F b, F s) { return a + (b - a) * s; } - - enum class eNoClamp { cNoClamp }; - - struct color32 - { - union - { - struct - { - uint8_t r; - uint8_t g; - uint8_t b; - uint8_t a; - }; - - uint8_t c[4]; - - uint32_t m; - }; - - color32() { } - - color32(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); } - color32(eNoClamp unused, uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { (void)unused; set_noclamp_rgba(vr, vg, vb, va); } - - void set(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { c[0] = static_cast(vr); c[1] = static_cast(vg); c[2] = static_cast(vb); c[3] = static_cast(va); } - - void set_noclamp_rgb(uint32_t vr, uint32_t vg, uint32_t vb) { c[0] = static_cast(vr); c[1] = static_cast(vg); c[2] = static_cast(vb); } - void set_noclamp_rgba(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); } - - void set_clamped(int vr, int vg, int vb, int va) { c[0] = clamp255(vr); c[1] = clamp255(vg); c[2] = clamp255(vb); c[3] = clamp255(va); } - - uint8_t operator[] (uint32_t idx) const { assert(idx < 4); return 
c[idx]; } - uint8_t &operator[] (uint32_t idx) { assert(idx < 4); return c[idx]; } - - bool operator== (const color32&rhs) const { return m == rhs.m; } - - void set_rgb(const color32& other) { c[0] = static_cast(other.c[0]); c[1] = static_cast(other.c[1]); c[2] = static_cast(other.c[2]); } - - static color32 comp_min(const color32& a, const color32& b) { return color32(eNoClamp::cNoClamp, std::min(a[0], b[0]), std::min(a[1], b[1]), std::min(a[2], b[2]), std::min(a[3], b[3])); } - static color32 comp_max(const color32& a, const color32& b) { return color32(eNoClamp::cNoClamp, std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); } - }; - + // Rate Distortion Optimization (RDO) enum dxt_constants { cDXT1SelectorBits = 2U, cDXT1SelectorValues = 1U << cDXT1SelectorBits, cDXT1SelectorMask = cDXT1SelectorValues - 1U, @@ -1491,7 +294,7 @@ namespace rgbcx uint8_t m_low_color[cTotalEndpointBytes]; uint8_t m_high_color[cTotalEndpointBytes]; uint8_t m_selectors[cTotalSelectorBytes]; - + inline uint32_t get_low_color() const { return m_low_color[0] | (m_low_color[1] << 8U); } inline uint32_t get_high_color() const { return m_high_color[0] | (m_high_color[1] << 8U); } inline bool is_3color() const { return get_low_color() <= get_high_color(); } @@ -1500,6 +303,25 @@ namespace rgbcx inline uint32_t get_selector(uint32_t x, uint32_t y) const { assert((x < 4U) && (y < 4U)); return (m_selectors[y] >> (x * cDXT1SelectorBits)) & cDXT1SelectorMask; } inline void set_selector(uint32_t x, uint32_t y, uint32_t val) { assert((x < 4U) && (y < 4U) && (val < 4U)); m_selectors[y] &= (~(cDXT1SelectorMask << (x * cDXT1SelectorBits))); m_selectors[y] |= (val << (x * cDXT1SelectorBits)); } + inline uint32_t get_endpoint_bits() const { return m_low_color[0] | (m_low_color[1] << 8) | (m_high_color[0] << 16) | (m_high_color[1] << 24); } + inline void set_endpoint_bits(uint32_t s) { m_low_color[0] = (uint8_t)s; m_low_color[1] = (uint8_t)(s >> 8); m_high_color[0] = 
(uint8_t)(s >> 16); m_high_color[1] = (uint8_t)(s >> 24); } + + inline uint32_t get_selector_bits() const { return m_selectors[0] | (m_selectors[1] << 8) | (m_selectors[2] << 16) | (m_selectors[3] << 24); } + inline void set_selector_bits(uint32_t s) { m_selectors[0] = (uint8_t)s; m_selectors[1] = (uint8_t)(s >> 8); m_selectors[2] = (uint8_t)(s >> 16); m_selectors[3] = (uint8_t)(s >> 24); } + + inline bool any_selectors_transparent() const + { + uint32_t sel_bits = get_selector_bits(); + for (uint32_t i = 0; i < 16; i++) + { + if ((sel_bits & 3) == 3) + return true; + + sel_bits >>= 2; + } + return false; + } + static inline uint16_t pack_color(const color32& color, bool scaled, uint32_t bias = 127U) { uint32_t r = color.r, g = color.g, b = color.b; @@ -1533,2650 +355,78 @@ namespace rgbcx } }; - static const uint32_t TOTAL_ORDER_4_0_16 = 15; - static const uint32_t TOTAL_ORDER_4_1_16 = 700; - static const uint32_t TOTAL_ORDER_4_2_16 = 753; - static const uint32_t TOTAL_ORDER_4_3_16 = 515; - static uint16_t g_total_ordering4_hash[4096]; - static float g_selector_factors4[NUM_UNIQUE_TOTAL_ORDERINGS4][3]; - - static const uint32_t TOTAL_ORDER_3_0_16 = 12; - static const uint32_t TOTAL_ORDER_3_1_16 = 15; - static const uint32_t TOTAL_ORDER_3_2_16 = 89; - static uint16_t g_total_ordering3_hash[256]; - static float g_selector_factors3[NUM_UNIQUE_TOTAL_ORDERINGS3][3]; - - struct hist4 - { - uint8_t m_hist[4]; - - hist4() - { - memset(m_hist, 0, sizeof(m_hist)); - } - - hist4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) - { - m_hist[0] = (uint8_t)i; - m_hist[1] = (uint8_t)j; - m_hist[2] = (uint8_t)k; - m_hist[3] = (uint8_t)l; - } - - inline bool operator== (const hist4 &h) const - { - if (m_hist[0] != h.m_hist[0]) return false; - if (m_hist[1] != h.m_hist[1]) return false; - if (m_hist[2] != h.m_hist[2]) return false; - if (m_hist[3] != h.m_hist[3]) return false; - return true; - } - - inline bool any_16() const - { - return (m_hist[0] == 16) || (m_hist[1] == 16) || 
(m_hist[2] == 16) || (m_hist[3] == 16); - } - - inline uint32_t lookup_total_ordering_index() const - { - if (m_hist[0] == 16) - return TOTAL_ORDER_4_0_16; - else if (m_hist[1] == 16) - return TOTAL_ORDER_4_1_16; - else if (m_hist[2] == 16) - return TOTAL_ORDER_4_2_16; - else if (m_hist[3] == 16) - return TOTAL_ORDER_4_3_16; - - // Must sum to 16, so m_hist[3] isn't needed. - return g_total_ordering4_hash[m_hist[0] | (m_hist[1] << 4) | (m_hist[2] << 8)]; - } - }; - - struct hist3 - { - uint8_t m_hist[3]; - - hist3() - { - memset(m_hist, 0, sizeof(m_hist)); - } - - hist3(uint32_t i, uint32_t j, uint32_t k) - { - m_hist[0] = (uint8_t)i; - m_hist[1] = (uint8_t)j; - m_hist[2] = (uint8_t)k; - } - - inline bool operator== (const hist3 &h) const - { - if (m_hist[0] != h.m_hist[0]) return false; - if (m_hist[1] != h.m_hist[1]) return false; - if (m_hist[2] != h.m_hist[2]) return false; - return true; - } - - inline bool any_16() const - { - return (m_hist[0] == 16) || (m_hist[1] == 16) || (m_hist[2] == 16); - } - - inline uint32_t lookup_total_ordering_index() const - { - if (m_hist[0] == 16) - return TOTAL_ORDER_3_0_16; - else if (m_hist[1] == 16) - return TOTAL_ORDER_3_1_16; - else if (m_hist[2] == 16) - return TOTAL_ORDER_3_2_16; - - // Must sum to 16, so m_hist[2] isn't needed. 
- return g_total_ordering3_hash[m_hist[0] | (m_hist[1] << 4)]; - } - }; - - struct bc1_match_entry + struct bc4_block { - uint8_t m_hi; - uint8_t m_lo; - uint8_t m_e; - }; - - static bc1_approx_mode g_bc1_approx_mode; - static bc1_match_entry g_bc1_match5_equals_1[256], g_bc1_match6_equals_1[256]; - static bc1_match_entry g_bc1_match5_half[256], g_bc1_match6_half[256]; - - static inline int scale_5_to_8(int v) { return (v << 3) | (v >> 2); } - static inline int scale_6_to_8(int v) { return (v << 2) | (v >> 4); } - - // v0, v1 = unexpanded DXT1 endpoint values (5/6-bits) - // c0, c1 = expanded DXT1 endpoint values (8-bits) - static inline int interp_5_6_ideal(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 2 + c1) / 3; } - static inline int interp_5_6_ideal_round(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 2 + c1 + 1) / 3; } - static inline int interp_half_5_6_ideal(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 + c1) / 2; } - - static inline int interp_5_nv(int v0, int v1) { assert(v0 < 32 && v1 < 32); return ((2 * v0 + v1) * 22) / 8; } - static inline int interp_6_nv(int c0, int c1) { assert(c0 < 256 && c1 < 256); const int gdiff = c1 - c0; return (256 * c0 + (gdiff / 4) + 128 + gdiff * 80) / 256; } + enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 }; + uint8_t m_endpoints[2]; - static inline int interp_half_5_nv(int v0, int v1) { assert(v0 < 32 && v1 < 32); return ((v0 + v1) * 33) / 8; } - static inline int interp_half_6_nv(int c0, int c1) { assert(c0 < 256 && c1 < 256); const int gdiff = c1 - c0; return (256 * c0 + gdiff/4 + 128 + gdiff * 128) / 256; } + uint8_t m_selectors[cTotalSelectorBytes]; - static inline int interp_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 43 + c1 * 21 + 32) >> 6; } - static inline int interp_half_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 + c1 + 1) >> 1; } + inline uint32_t get_low_alpha() const { return 
m_endpoints[0]; } + inline uint32_t get_high_alpha() const { return m_endpoints[1]; } + inline bool is_alpha6_block() const { return get_low_alpha() <= get_high_alpha(); } - static inline int interp_5(int v0, int v1, int c0, int c1, bc1_approx_mode mode) - { - assert(scale_5_to_8(v0) == c0 && scale_5_to_8(v1) == c1); - switch (mode) + inline uint64_t get_selector_bits() const { - case bc1_approx_mode::cBC1NVidia: return interp_5_nv(v0, v1); - case bc1_approx_mode::cBC1AMD: return interp_5_6_amd(c0, c1); - default: - case bc1_approx_mode::cBC1Ideal: return interp_5_6_ideal(c0, c1); - case bc1_approx_mode::cBC1IdealRound4: return interp_5_6_ideal_round(c0, c1); + return ((uint64_t)((uint32_t)m_selectors[0] | ((uint32_t)m_selectors[1] << 8U) | ((uint32_t)m_selectors[2] << 16U) | ((uint32_t)m_selectors[3] << 24U))) | + (((uint64_t)m_selectors[4]) << 32U) | + (((uint64_t)m_selectors[5]) << 40U); } - } - static inline int interp_6(int v0, int v1, int c0, int c1, bc1_approx_mode mode) - { - (void)v0; (void)v1; - assert(scale_6_to_8(v0) == c0 && scale_6_to_8(v1) == c1); - switch (mode) + inline void set_selector_bits(uint64_t v) { - case bc1_approx_mode::cBC1NVidia: return interp_6_nv(c0, c1); - case bc1_approx_mode::cBC1AMD: return interp_5_6_amd(c0, c1); - default: - case bc1_approx_mode::cBC1Ideal: return interp_5_6_ideal(c0, c1); - case bc1_approx_mode::cBC1IdealRound4: return interp_5_6_ideal_round(c0, c1); + for (uint32_t i = 0; i < 6; i++) + { + m_selectors[i] = (uint8_t)v; + v >>= 8; + } } - } - static inline int interp_half_5(int v0, int v1, int c0, int c1, bc1_approx_mode mode) - { - assert(scale_5_to_8(v0) == c0 && scale_5_to_8(v1) == c1); - switch (mode) + inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const { - case bc1_approx_mode::cBC1NVidia: return interp_half_5_nv(v0, v1); - case bc1_approx_mode::cBC1AMD: return interp_half_5_6_amd(c0, c1); - case bc1_approx_mode::cBC1Ideal: - case bc1_approx_mode::cBC1IdealRound4: - default: 
- return interp_half_5_6_ideal(c0, c1); + assert((x < 4U) && (y < 4U)); + return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1); } - } - static inline int interp_half_6(int v0, int v1, int c0, int c1, bc1_approx_mode mode) - { - (void)v0; (void)v1; - assert(scale_6_to_8(v0) == c0 && scale_6_to_8(v1) == c1); - switch (mode) + static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h) { - case bc1_approx_mode::cBC1NVidia: return interp_half_6_nv(c0, c1); - case bc1_approx_mode::cBC1AMD: return interp_half_5_6_amd(c0, c1); - case bc1_approx_mode::cBC1Ideal: - case bc1_approx_mode::cBC1IdealRound4: - default: - return interp_half_5_6_ideal(c0, c1); + pDst[0] = static_cast(l); + pDst[1] = static_cast(h); + pDst[2] = static_cast((l * 4 + h) / 5); + pDst[3] = static_cast((l * 3 + h * 2) / 5); + pDst[4] = static_cast((l * 2 + h * 3) / 5); + pDst[5] = static_cast((l + h * 4) / 5); + pDst[6] = 0; + pDst[7] = 255; + return 6; } - } - static void prepare_bc1_single_color_table_half(bc1_match_entry* pTable, const uint8_t* pExpand, int size, bc1_approx_mode mode) - { - for (int i = 0; i < 256; i++) + static inline uint32_t get_block_values8(uint8_t* pDst, uint32_t l, uint32_t h) { - int lowest_e = 256; - for (int lo = 0; lo < size; lo++) - { - const int lo_e = pExpand[lo]; - - for (int hi = 0; hi < size; hi++) - { - const int hi_e = pExpand[hi]; - - const int v = (size == 32) ? interp_half_5(hi, lo, hi_e, lo_e, mode) : interp_half_6(hi, lo, hi_e, lo_e, mode); - - int e = iabs(v - i); - - // We only need to factor in 3% error in BC1 ideal mode. - if ((mode == bc1_approx_mode::cBC1Ideal) || (mode == bc1_approx_mode::cBC1IdealRound4)) - e += (iabs(hi_e - lo_e) * 3) / 100; - - // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation. 
- if ((e < lowest_e) || ((e == lowest_e) && (lo == hi))) - { - pTable[i].m_hi = static_cast(hi); - pTable[i].m_lo = static_cast(lo); - - assert(e <= UINT8_MAX); - pTable[i].m_e = static_cast(e); - - lowest_e = e; - } - - } // hi - } // lo + pDst[0] = static_cast(l); + pDst[1] = static_cast(h); + pDst[2] = static_cast((l * 6 + h) / 7); + pDst[3] = static_cast((l * 5 + h * 2) / 7); + pDst[4] = static_cast((l * 4 + h * 3) / 7); + pDst[5] = static_cast((l * 3 + h * 4) / 7); + pDst[6] = static_cast((l * 2 + h * 5) / 7); + pDst[7] = static_cast((l + h * 6) / 7); + return 8; } - } - static void prepare_bc1_single_color_table(bc1_match_entry* pTable, const uint8_t* pExpand, int size, bc1_approx_mode mode) - { - for (int i = 0; i < 256; i++) + static inline uint32_t get_block_values(uint8_t* pDst, uint32_t l, uint32_t h) { - int lowest_e = 256; - for (int lo = 0; lo < size; lo++) - { - const int lo_e = pExpand[lo]; - - for (int hi = 0; hi < size; hi++) - { - const int hi_e = pExpand[hi]; - - const int v = (size == 32) ? interp_5(hi, lo, hi_e, lo_e, mode) : interp_6(hi, lo, hi_e, lo_e, mode); - - int e = iabs(v - i); - - if ((mode == bc1_approx_mode::cBC1Ideal) || (mode == bc1_approx_mode::cBC1IdealRound4)) - e += (iabs(hi_e - lo_e) * 3) / 100; - - // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation. - if ((e < lowest_e) || ((e == lowest_e) && (lo == hi))) - { - pTable[i].m_hi = static_cast(hi); - pTable[i].m_lo = static_cast(lo); - - assert(e <= UINT8_MAX); - pTable[i].m_e = static_cast(e); - - lowest_e = e; - } - - } // hi - } // lo + if (l > h) + return get_block_values8(pDst, l, h); + else + return get_block_values6(pDst, l, h); } - } - - // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w)) - // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier. 
- static const uint32_t g_weight_vals4[4] = { 0x000009, 0x010204, 0x040201, 0x090000 }; - - // multiplier is 4 for 3-color - static const uint32_t g_weight_vals3[3] = { 0x000004, 0x040000, 0x010101 }; - - static inline void compute_selector_factors4(const hist4 &h, float &iz00, float &iz10, float &iz11) - { - uint32_t weight_accum = 0; - for (uint32_t sel = 0; sel < 4; sel++) - weight_accum += g_weight_vals4[sel] * h.m_hist[sel]; - - float z00 = (float)((weight_accum >> 16) & 0xFF); - float z10 = (float)((weight_accum >> 8) & 0xFF); - float z11 = (float)(weight_accum & 0xFF); - float z01 = z10; - - float det = z00 * z11 - z01 * z10; - if (fabs(det) < 1e-8f) - det = 0.0f; - else - det = (3.0f / 255.0f) / det; - - iz00 = z11 * det; - iz10 = -z10 * det; - iz11 = z00 * det; - } - - static inline void compute_selector_factors3(const hist3 &h, float &iz00, float &iz10, float &iz11) - { - uint32_t weight_accum = 0; - for (uint32_t sel = 0; sel < 3; sel++) - weight_accum += g_weight_vals3[sel] * h.m_hist[sel]; - - float z00 = (float)((weight_accum >> 16) & 0xFF); - float z10 = (float)((weight_accum >> 8) & 0xFF); - float z11 = (float)(weight_accum & 0xFF); - float z01 = z10; + }; - float det = z00 * z11 - z01 * z10; - if (fabs(det) < 1e-8f) - det = 0.0f; - else - det = (2.0f / 255.0f) / det; - - iz00 = z11 * det; - iz10 = -z10 * det; - iz11 = z00 * det; - } - - static bool g_initialized; - - void init(bc1_approx_mode mode) - { - g_bc1_approx_mode = mode; - - uint8_t bc1_expand5[32]; - for (int i = 0; i < 32; i++) - bc1_expand5[i] = static_cast((i << 3) | (i >> 2)); - prepare_bc1_single_color_table(g_bc1_match5_equals_1, bc1_expand5, 32, mode); - prepare_bc1_single_color_table_half(g_bc1_match5_half, bc1_expand5, 32, mode); - - uint8_t bc1_expand6[64]; - for (int i = 0; i < 64; i++) - bc1_expand6[i] = static_cast((i << 2) | (i >> 4)); - prepare_bc1_single_color_table(g_bc1_match6_equals_1, bc1_expand6, 64, mode); - prepare_bc1_single_color_table_half(g_bc1_match6_half, 
bc1_expand6, 64, mode); - - for (uint32_t i = 0; i < NUM_UNIQUE_TOTAL_ORDERINGS4; i++) - { - hist4 h; - h.m_hist[0] = (uint8_t)g_unique_total_orders4[i][0]; - h.m_hist[1] = (uint8_t)g_unique_total_orders4[i][1]; - h.m_hist[2] = (uint8_t)g_unique_total_orders4[i][2]; - h.m_hist[3] = (uint8_t)g_unique_total_orders4[i][3]; - - if (!h.any_16()) - { - const uint32_t index = h.m_hist[0] | (h.m_hist[1] << 4) | (h.m_hist[2] << 8); - assert(index < 4096); - g_total_ordering4_hash[index] = (uint16_t)i; - } - - compute_selector_factors4(h, g_selector_factors4[i][0], g_selector_factors4[i][1], g_selector_factors4[i][2]); - } - - for (uint32_t i = 0; i < NUM_UNIQUE_TOTAL_ORDERINGS3; i++) - { - hist3 h; - h.m_hist[0] = (uint8_t)g_unique_total_orders3[i][0]; - h.m_hist[1] = (uint8_t)g_unique_total_orders3[i][1]; - h.m_hist[2] = (uint8_t)g_unique_total_orders3[i][2]; - - if (!h.any_16()) - { - const uint32_t index = h.m_hist[0] | (h.m_hist[1] << 4); - assert(index < 256); - g_total_ordering3_hash[index] = (uint16_t)i; - } - - compute_selector_factors3(h, g_selector_factors3[i][0], g_selector_factors3[i][1], g_selector_factors3[i][2]); - } - - g_initialized = true; - } - - void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb, bool allow_3color) - { - bc1_block* pDst_block = static_cast(pDst); - - uint32_t mask = 0xAA; - int max16 = -1, min16 = 0; - - if (allow_3color) - { - const uint32_t err4 = g_bc1_match5_equals_1[fr].m_e + g_bc1_match6_equals_1[fg].m_e + g_bc1_match5_equals_1[fb].m_e; - const uint32_t err3 = g_bc1_match5_half[fr].m_e + g_bc1_match6_half[fg].m_e + g_bc1_match5_half[fb].m_e; - - if (err3 < err4) - { - max16 = (g_bc1_match5_half[fr].m_hi << 11) | (g_bc1_match6_half[fg].m_hi << 5) | g_bc1_match5_half[fb].m_hi; - min16 = (g_bc1_match5_half[fr].m_lo << 11) | (g_bc1_match6_half[fg].m_lo << 5) | g_bc1_match5_half[fb].m_lo; - - if (max16 > min16) - std::swap(max16, min16); - } - } - - if (max16 == -1) - { - max16 = 
(g_bc1_match5_equals_1[fr].m_hi << 11) | (g_bc1_match6_equals_1[fg].m_hi << 5) | g_bc1_match5_equals_1[fb].m_hi; - min16 = (g_bc1_match5_equals_1[fr].m_lo << 11) | (g_bc1_match6_equals_1[fg].m_lo << 5) | g_bc1_match5_equals_1[fb].m_lo; - - if (min16 == max16) - { - // Always forbid 3 color blocks - // This is to guarantee that BC3 blocks never use punchthrough alpha (3 color) mode, which isn't supported on some (all?) GPU's. - mask = 0; - - // Make l > h - if (min16 > 0) - min16--; - else - { - // l = h = 0 - assert(min16 == max16 && max16 == 0); - - max16 = 1; - min16 = 0; - mask = 0x55; - } - - assert(max16 > min16); - } - - if (max16 < min16) - { - std::swap(max16, min16); - mask ^= 0x55; - } - } - - pDst_block->set_low_color(static_cast(max16)); - pDst_block->set_high_color(static_cast(min16)); - pDst_block->m_selectors[0] = static_cast(mask); - pDst_block->m_selectors[1] = static_cast(mask); - pDst_block->m_selectors[2] = static_cast(mask); - pDst_block->m_selectors[3] = static_cast(mask); - } - - static const float g_midpoint5[32] = { .015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f, .370588f, .403922f, .435294f, .466667f, .5f, .533333f, .564706f, .596078f, .629412f, .662745f, .694118f, .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f }; - static const float g_midpoint6[64] = { .007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f, .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f, .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f, .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f, .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f, .882353f, .898039f, .913725f, 
.929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f }; - - struct vec3F { float c[3]; }; - - static inline void compute_least_squares_endpoints4_rgb( - vec3F* pXl, vec3F* pXh, - int total_r, int total_g, int total_b, - float iz00, float iz10, float iz11, - uint32_t s, const uint32_t r_sum[17], const uint32_t g_sum[17], const uint32_t b_sum[17]) - { - const float iz01 = iz10; - - const uint32_t f1 = g_unique_total_orders4[s][0]; - const uint32_t f2 = g_unique_total_orders4[s][0] + g_unique_total_orders4[s][1]; - const uint32_t f3 = g_unique_total_orders4[s][0] + g_unique_total_orders4[s][1] + g_unique_total_orders4[s][2]; - uint32_t uq00_r = (r_sum[f2] - r_sum[f1]) + (r_sum[f3] - r_sum[f2]) * 2 + (r_sum[16] - r_sum[f3]) * 3; - uint32_t uq00_g = (g_sum[f2] - g_sum[f1]) + (g_sum[f3] - g_sum[f2]) * 2 + (g_sum[16] - g_sum[f3]) * 3; - uint32_t uq00_b = (b_sum[f2] - b_sum[f1]) + (b_sum[f3] - b_sum[f2]) * 2 + (b_sum[16] - b_sum[f3]) * 3; - - float q10_r = (float)(total_r * 3 - uq00_r); - float q10_g = (float)(total_g * 3 - uq00_g); - float q10_b = (float)(total_b * 3 - uq00_b); - - pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; - pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; - - pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; - pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; - - pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; - pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; - } - - static inline bool compute_least_squares_endpoints4_rgb(const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh, int total_r, int total_g, int total_b) - { - uint32_t uq00_r = 0, uq00_g = 0, uq00_b = 0; - uint32_t weight_accum = 0; - for (uint32_t i = 0; i < 16; i++) - { - const uint8_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; - const uint8_t sel = pSelectors[i]; - - weight_accum += g_weight_vals4[sel]; - uq00_r += sel * r; - uq00_g += sel * g; - uq00_b += sel * b; - } - - int q10_r = total_r * 3 - uq00_r; - int q10_g = total_g * 3 - uq00_g; 
- int q10_b = total_b * 3 - uq00_b; - - float z00 = (float)((weight_accum >> 16) & 0xFF); - float z10 = (float)((weight_accum >> 8) & 0xFF); - float z11 = (float)(weight_accum & 0xFF); - float z01 = z10; - - float det = z00 * z11 - z01 * z10; - if (fabs(det) < 1e-8f) - return false; - - det = (3.0f / 255.0f) / det; - - float iz00, iz01, iz10, iz11; - iz00 = z11 * det; - iz01 = -z01 * det; - iz10 = -z10 * det; - iz11 = z00 * det; - - pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; - pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; - - pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; - pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; - - pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; - pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; - - return true; - } - - static inline void compute_least_squares_endpoints3_rgb( - vec3F* pXl, vec3F* pXh, - int total_r, int total_g, int total_b, - float iz00, float iz10, float iz11, - uint32_t s, const uint32_t r_sum[17], const uint32_t g_sum[17], const uint32_t b_sum[17]) - { - const float iz01 = iz10; - - // Compensates for BC1 3-color ordering, which is selector 0, 2, 1 - const uint32_t f1 = g_unique_total_orders3[s][0]; - const uint32_t f2 = g_unique_total_orders3[s][0] + g_unique_total_orders3[s][2]; - uint32_t uq00_r = (r_sum[16] - r_sum[f2]) * 2 + (r_sum[f2] - r_sum[f1]); - uint32_t uq00_g = (g_sum[16] - g_sum[f2]) * 2 + (g_sum[f2] - g_sum[f1]); - uint32_t uq00_b = (b_sum[16] - b_sum[f2]) * 2 + (b_sum[f2] - b_sum[f1]); - - float q10_r = (float)(total_r * 2 - uq00_r); - float q10_g = (float)(total_g * 2 - uq00_g); - float q10_b = (float)(total_b * 2 - uq00_b); - - pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; - pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; - - pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; - pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; - - pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; - pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; - } - - static inline bool 
compute_least_squares_endpoints3_rgb(bool use_black, const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh) - { - int uq00_r = 0, uq00_g = 0, uq00_b = 0; - uint32_t weight_accum = 0; - int total_r = 0, total_g = 0, total_b = 0; - for (uint32_t i = 0; i < 16; i++) - { - const uint8_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; - if (use_black) - { - if ((r | g | b) < 4) - continue; - } - - const uint8_t sel = pSelectors[i]; - assert(sel <= 3); - if (sel == 3) - continue; - - weight_accum += g_weight_vals3[sel]; - - static const uint8_t s_tran[3] = { 0, 2, 1 }; - const uint8_t tsel = s_tran[sel]; - uq00_r += tsel * r; - uq00_g += tsel * g; - uq00_b += tsel * b; - - total_r += r; - total_g += g; - total_b += b; - } - - int q10_r = total_r * 2 - uq00_r; - int q10_g = total_g * 2 - uq00_g; - int q10_b = total_b * 2 - uq00_b; - - float z00 = (float)((weight_accum >> 16) & 0xFF); - float z10 = (float)((weight_accum >> 8) & 0xFF); - float z11 = (float)(weight_accum & 0xFF); - float z01 = z10; - - float det = z00 * z11 - z01 * z10; - if (fabs(det) < 1e-8f) - return false; - - det = (2.0f / 255.0f) / det; - - float iz00, iz01, iz10, iz11; - iz00 = z11 * det; - iz01 = -z01 * det; - iz10 = -z10 * det; - iz11 = z00 * det; - - pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r; - pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r; - - pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g; - pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g; - - pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b; - pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b; - - return true; - } - - static inline void bc1_get_block_colors4(uint32_t block_r[4], uint32_t block_g[4], uint32_t block_b[4], uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb) - { - block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); - block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | 
(hb >> 2); - - if (g_bc1_approx_mode == bc1_approx_mode::cBC1Ideal) - { - block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; - block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; - } - else if (g_bc1_approx_mode == bc1_approx_mode::cBC1IdealRound4) - { - block_r[1] = (block_r[0] * 2 + block_r[3] + 1) / 3; block_g[1] = (block_g[0] * 2 + block_g[3] + 1) / 3; block_b[1] = (block_b[0] * 2 + block_b[3] + 1) / 3; - block_r[2] = (block_r[3] * 2 + block_r[0] + 1) / 3; block_g[2] = (block_g[3] * 2 + block_g[0] + 1) / 3; block_b[2] = (block_b[3] * 2 + block_b[0] + 1) / 3; - } - else if (g_bc1_approx_mode == bc1_approx_mode::cBC1AMD) - { - block_r[1] = interp_5_6_amd(block_r[0], block_r[3]); block_g[1] = interp_5_6_amd(block_g[0], block_g[3]); block_b[1] = interp_5_6_amd(block_b[0], block_b[3]); - block_r[2] = interp_5_6_amd(block_r[3], block_r[0]); block_g[2] = interp_5_6_amd(block_g[3], block_g[0]); block_b[2] = interp_5_6_amd(block_b[3], block_b[0]); - } - else - { - block_r[1] = interp_5_nv(lr, hr); block_g[1] = interp_6_nv(block_g[0], block_g[3]); block_b[1] = interp_5_nv(lb, hb); - block_r[2] = interp_5_nv(hr, lr); block_g[2] = interp_6_nv(block_g[3], block_g[0]); block_b[2] = interp_5_nv(hb, lb); - } - } - - static inline void bc1_get_block_colors3(uint32_t block_r[3], uint32_t block_g[3], uint32_t block_b[3], uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb) - { - block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); - block_r[1] = (hr << 3) | (hr >> 2); block_g[1] = (hg << 2) | (hg >> 4); block_b[1] = (hb << 3) | (hb >> 2); - - if ((g_bc1_approx_mode == bc1_approx_mode::cBC1Ideal) || (g_bc1_approx_mode == bc1_approx_mode::cBC1IdealRound4)) - { - block_r[2] = (block_r[0] + block_r[1]) / 2; block_g[2] = 
(block_g[0] + block_g[1]) / 2; block_b[2] = (block_b[0] + block_b[1]) / 2; - } - else if (g_bc1_approx_mode == bc1_approx_mode::cBC1AMD) - { - block_r[2] = interp_half_5_6_amd(block_r[0], block_r[1]); block_g[2] = interp_half_5_6_amd(block_g[0], block_g[1]); block_b[2] = interp_half_5_6_amd(block_b[0], block_b[1]); - } - else - { - block_r[2] = interp_half_5_nv(lr, hr); block_g[2] = interp_half_6_nv(block_g[0], block_g[1]); block_b[2] = interp_half_5_nv(lb, hb); - } - } - - static inline void bc1_find_sels4_noerr(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) - { - uint32_t block_r[4], block_g[4], block_b[4]; - bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); - - int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; - - int dots[4]; - for (uint32_t i = 0; i < 4; i++) - dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; - - int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; - - ar *= 2; ag *= 2; ab *= 2; - - static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; - - for (uint32_t i = 0; i < 16; i += 4) - { - const int d0 = pSrc_pixels[i+0].r * ar + pSrc_pixels[i+0].g * ag + pSrc_pixels[i+0].b * ab; - const int d1 = pSrc_pixels[i+1].r * ar + pSrc_pixels[i+1].g * ag + pSrc_pixels[i+1].b * ab; - const int d2 = pSrc_pixels[i+2].r * ar + pSrc_pixels[i+2].g * ag + pSrc_pixels[i+2].b * ab; - const int d3 = pSrc_pixels[i+3].r * ar + pSrc_pixels[i+3].g * ag + pSrc_pixels[i+3].b * ab; - - sels[i+0] = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; - sels[i+1] = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; - sels[i+2] = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; - sels[i+3] = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; - } - } - - static inline uint32_t bc1_find_sels4_fasterr(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t 
sels[16], uint32_t cur_err) - { - uint32_t block_r[4], block_g[4], block_b[4]; - bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); - - int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; - - int dots[4]; - for (uint32_t i = 0; i < 4; i++) - dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; - - int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; - - ar *= 2; ag *= 2; ab *= 2; - - static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; - - uint32_t total_err = 0; - - for (uint32_t i = 0; i < 16; i += 4) - { - const int d0 = pSrc_pixels[i+0].r * ar + pSrc_pixels[i+0].g * ag + pSrc_pixels[i+0].b * ab; - const int d1 = pSrc_pixels[i+1].r * ar + pSrc_pixels[i+1].g * ag + pSrc_pixels[i+1].b * ab; - const int d2 = pSrc_pixels[i+2].r * ar + pSrc_pixels[i+2].g * ag + pSrc_pixels[i+2].b * ab; - const int d3 = pSrc_pixels[i+3].r * ar + pSrc_pixels[i+3].g * ag + pSrc_pixels[i+3].b * ab; - - uint8_t sel0 = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; - uint8_t sel1 = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; - uint8_t sel2 = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; - uint8_t sel3 = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; - - sels[i+0] = sel0; - sels[i+1] = sel1; - sels[i+2] = sel2; - sels[i+3] = sel3; - - total_err += squarei(pSrc_pixels[i+0].r - block_r[sel0]) + squarei(pSrc_pixels[i+0].g - block_g[sel0]) + squarei(pSrc_pixels[i+0].b - block_b[sel0]); - total_err += squarei(pSrc_pixels[i+1].r - block_r[sel1]) + squarei(pSrc_pixels[i+1].g - block_g[sel1]) + squarei(pSrc_pixels[i+1].b - block_b[sel1]); - total_err += squarei(pSrc_pixels[i+2].r - block_r[sel2]) + squarei(pSrc_pixels[i+2].g - block_g[sel2]) + squarei(pSrc_pixels[i+2].b - block_b[sel2]); - total_err += squarei(pSrc_pixels[i+3].r - block_r[sel3]) + squarei(pSrc_pixels[i+3].g - block_g[sel3]) + squarei(pSrc_pixels[i+3].b - block_b[sel3]); - - if (total_err >= cur_err) - break; - } - - return 
total_err; - } - - static inline uint32_t bc1_find_sels4_check2_err(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) - { - uint32_t block_r[4], block_g[4], block_b[4]; - bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); - - int dr = block_r[3] - block_r[0], dg = block_g[3] - block_g[0], db = block_b[3] - block_b[0]; - - const float f = 4.0f / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f); - - uint32_t total_err = 0; - - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r; - const int g = pSrc_pixels[i].g; - const int b = pSrc_pixels[i].b; - - int sel = (int)((float)((r - (int)block_r[0]) * dr + (g - (int)block_g[0]) * dg + (b - (int)block_b[0]) * db) * f + .5f); - sel = clampi(sel, 1, 3); - - uint32_t err0 = squarei((int)block_r[sel - 1] - (int)r) + squarei((int)block_g[sel - 1] - (int)g) + squarei((int)block_b[sel - 1] - (int)b); - uint32_t err1 = squarei((int)block_r[sel] - (int)r) + squarei((int)block_g[sel] - (int)g) + squarei((int)block_b[sel] - (int)b); - - int best_sel = sel; - uint32_t best_err = err1; - if (err0 == err1) - { - // Prefer non-interpolation - if ((best_sel - 1) == 0) - best_sel = 0; - } - else if (err0 < best_err) - { - best_sel = sel - 1; - best_err = err0; - } - - total_err += best_err; - - if (total_err >= cur_err) - break; - - sels[i] = (uint8_t)best_sel; - } - return total_err; - } - - static inline uint32_t bc1_find_sels4_fullerr(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) - { - uint32_t block_r[4], block_g[4], block_b[4]; - bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); - - uint32_t total_err = 0; - - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r; - const int g = pSrc_pixels[i].g; - const int b = pSrc_pixels[i].b; - - uint32_t best_err = 
squarei((int)block_r[0] - (int)r) + squarei((int)block_g[0] - (int)g) + squarei((int)block_b[0] - (int)b); - uint8_t best_sel = 0; - - for (uint32_t j = 1; (j < 4) && best_err; j++) - { - uint32_t err = squarei((int)block_r[j] - (int)r) + squarei((int)block_g[j] - (int)g) + squarei((int)block_b[j] - (int)b); - if ( (err < best_err) || ((err == best_err) && (j == 3)) ) - { - best_err = err; - best_sel = (uint8_t)j; - } - } - - total_err += best_err; - - if (total_err >= cur_err) - break; - - sels[i] = (uint8_t)best_sel; - } - return total_err; - } - - static inline uint32_t bc1_find_sels4(uint32_t flags, const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) - { - uint32_t err; - - if (flags & cEncodeBC1UseFasterMSEEval) - err = bc1_find_sels4_fasterr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err); - else if (flags & cEncodeBC1UseFullMSEEval) - err = bc1_find_sels4_fullerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err); - else - err = bc1_find_sels4_check2_err(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err); - - return err; - } - - static inline uint32_t bc1_find_sels3_fullerr(bool use_black, const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16], uint32_t cur_err) - { - uint32_t block_r[3], block_g[3], block_b[3]; - bc1_get_block_colors3(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb); - - uint32_t total_err = 0; - - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r; - const int g = pSrc_pixels[i].g; - const int b = pSrc_pixels[i].b; - - uint32_t best_err = squarei((int)block_r[0] - (int)r) + squarei((int)block_g[0] - (int)g) + squarei((int)block_b[0] - (int)b); - uint32_t best_sel = 0; - - uint32_t err1 = squarei((int)block_r[1] - (int)r) + squarei((int)block_g[1] - (int)g) + squarei((int)block_b[1] - (int)b); - if (err1 < best_err) - { - best_err = err1; - best_sel = 1; - } 
- - uint32_t err2 = squarei((int)block_r[2] - (int)r) + squarei((int)block_g[2] - (int)g) + squarei((int)block_b[2] - (int)b); - if (err2 < best_err) - { - best_err = err2; - best_sel = 2; - } - - if (use_black) - { - uint32_t err3 = squarei(r) + squarei(g) + squarei(b); - if (err3 < best_err) - { - best_err = err3; - best_sel = 3; - } - } - - total_err += best_err; - if (total_err >= cur_err) - return total_err; - - sels[i] = (uint8_t)best_sel; - } - - return total_err; - } - - static inline void precise_round_565(const vec3F &xl, const vec3F &xh, - int &trial_lr, int &trial_lg, int &trial_lb, - int &trial_hr, int &trial_hg, int &trial_hb) - { - trial_lr = (int)(xl.c[0] * 31.0f); - trial_lg = (int)(xl.c[1] * 63.0f); - trial_lb = (int)(xl.c[2] * 31.0f); - - trial_hr = (int)(xh.c[0] * 31.0f); - trial_hg = (int)(xh.c[1] * 63.0f); - trial_hb = (int)(xh.c[2] * 31.0f); - - if ((uint32_t)(trial_lr | trial_lb | trial_hr | trial_hb) > 31U) - { - trial_lr = ((uint32_t)trial_lr > 31U) ? (~trial_lr >> 31) & 31 : trial_lr; - trial_hr = ((uint32_t)trial_hr > 31U) ? (~trial_hr >> 31) & 31 : trial_hr; - - trial_lb = ((uint32_t)trial_lb > 31U) ? (~trial_lb >> 31) & 31 : trial_lb; - trial_hb = ((uint32_t)trial_hb > 31U) ? (~trial_hb >> 31) & 31 : trial_hb; - } - - if ((uint32_t)(trial_lg | trial_hg) > 63U) - { - trial_lg = ((uint32_t)trial_lg > 63U) ? (~trial_lg >> 31) & 63 : trial_lg; - trial_hg = ((uint32_t)trial_hg > 63U) ? 
(~trial_hg >> 31) & 63 : trial_hg; - } - - trial_lr = (trial_lr + (xl.c[0] > g_midpoint5[trial_lr])) & 31; - trial_lg = (trial_lg + (xl.c[1] > g_midpoint6[trial_lg])) & 63; - trial_lb = (trial_lb + (xl.c[2] > g_midpoint5[trial_lb])) & 31; - - trial_hr = (trial_hr + (xh.c[0] > g_midpoint5[trial_hr])) & 31; - trial_hg = (trial_hg + (xh.c[1] > g_midpoint6[trial_hg])) & 63; - trial_hb = (trial_hb + (xh.c[2] > g_midpoint5[trial_hb])) & 31; - } - - static inline void precise_round_565_noscale(vec3F xl, vec3F xh, - int &trial_lr, int &trial_lg, int &trial_lb, - int &trial_hr, int &trial_hg, int &trial_hb) - { - xl.c[0] *= 1.0f/255.0f; - xl.c[1] *= 1.0f/255.0f; - xl.c[2] *= 1.0f/255.0f; - - xh.c[0] *= 1.0f/255.0f; - xh.c[1] *= 1.0f/255.0f; - xh.c[2] *= 1.0f/255.0f; - - precise_round_565(xl, xh, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb); - } - - static inline void bc1_encode4(bc1_block *pDst_block, int lr, int lg, int lb, int hr, int hg, int hb, const uint8_t sels[16]) - { - uint32_t lc16 = bc1_block::pack_unscaled_color(lr, lg, lb); - uint32_t hc16 = bc1_block::pack_unscaled_color(hr, hg, hb); - - // Always forbid 3 color blocks - if (lc16 == hc16) - { - uint8_t mask = 0; - - // Make l > h - if (hc16 > 0) - hc16--; - else - { - // lc16 = hc16 = 0 - assert(lc16 == hc16 && hc16 == 0); - - hc16 = 0; - lc16 = 1; - mask = 0x55; // select hc16 - } - - assert(lc16 > hc16); - pDst_block->set_low_color(static_cast(lc16)); - pDst_block->set_high_color(static_cast(hc16)); - - pDst_block->m_selectors[0] = mask; - pDst_block->m_selectors[1] = mask; - pDst_block->m_selectors[2] = mask; - pDst_block->m_selectors[3] = mask; - } - else - { - uint8_t invert_mask = 0; - if (lc16 < hc16) - { - std::swap(lc16, hc16); - invert_mask = 0x55; - } - - assert(lc16 > hc16); - pDst_block->set_low_color((uint16_t)lc16); - pDst_block->set_high_color((uint16_t)hc16); - - uint32_t packed_sels = 0; - static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; - for (uint32_t i = 0; i < 16; i++) 
- packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); - - pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; - pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; - pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; - pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask; - } - } - - static inline void bc1_encode3(bc1_block *pDst_block, int lr, int lg, int lb, int hr, int hg, int hb, const uint8_t sels[16]) - { - uint32_t lc16 = bc1_block::pack_unscaled_color(lr, lg, lb); - uint32_t hc16 = bc1_block::pack_unscaled_color(hr, hg, hb); - - bool invert_flag = false; - if (lc16 > hc16) - { - std::swap(lc16, hc16); - invert_flag = true; - } - - assert(lc16 <= hc16); - - pDst_block->set_low_color((uint16_t)lc16); - pDst_block->set_high_color((uint16_t)hc16); - - uint32_t packed_sels = 0; - - if (invert_flag) - { - static const uint8_t s_sel_trans_inv[4] = { 1, 0, 2, 3 }; - - for (uint32_t i = 0; i < 16; i++) - packed_sels |= ((uint32_t)s_sel_trans_inv[sels[i]] << (i * 2)); - } - else - { - for (uint32_t i = 0; i < 16; i++) - packed_sels |= ((uint32_t)sels[i] << (i * 2)); - } - - pDst_block->m_selectors[0] = (uint8_t)packed_sels; - pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8); - pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16); - pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24); - } - - struct bc1_encode_results - { - int lr, lg, lb; - int hr, hg, hb; - uint8_t sels[16]; - bool m_3color; - }; - - static bool try_3color_block_useblack(const color32* pSrc_pixels, uint32_t flags, uint32_t &cur_err, bc1_encode_results &results) - { - int total_r = 0, total_g = 0, total_b = 0; - int max_r = 0, max_g = 0, max_b = 0; - int min_r = 255, min_g = 255, min_b = 255; - int total_pixels = 0; - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - if ((r | g | b) < 4) - continue; - - max_r = std::max(max_r, r); 
max_g = std::max(max_g, g); max_b = std::max(max_b, b); - min_r = std::min(min_r, r); min_g = std::min(min_g, g); min_b = std::min(min_b, b); - total_r += r; total_g += g; total_b += b; - - total_pixels++; - } - - if (!total_pixels) - return false; - - int half_total_pixels = total_pixels >> 1; - int avg_r = (total_r + half_total_pixels) / total_pixels; - int avg_g = (total_g + half_total_pixels) / total_pixels; - int avg_b = (total_b + half_total_pixels) / total_pixels; - - uint32_t low_c = 0, high_c = 0; - - int icov[6] = { 0, 0, 0, 0, 0, 0 }; - for (uint32_t i = 0; i < 16; i++) - { - int r = (int)pSrc_pixels[i].r; - int g = (int)pSrc_pixels[i].g; - int b = (int)pSrc_pixels[i].b; - - if ((r | g | b) < 4) - continue; - - r -= avg_r; - g -= avg_g; - b -= avg_b; - - icov[0] += r * r; - icov[1] += r * g; - icov[2] += r * b; - icov[3] += g * g; - icov[4] += g * b; - icov[5] += b * b; - } - - float cov[6]; - for (uint32_t i = 0; i < 6; i++) - cov[i] = (float)(icov[i]) * (1.0f / 255.0f); - - float xr = (float)(max_r - min_r); - float xg = (float)(max_g - min_g); - float xb = (float)(max_b - min_b); - - if (icov[2] < 0) - xr = -xr; - - if (icov[4] < 0) - xg = -xg; - - for (uint32_t power_iter = 0; power_iter < 4; power_iter++) - { - float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; - float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; - float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; - xr = r; xg = g; xb = b; - } - - float k = maximum(fabsf(xr), fabsf(xg), fabsf(xb)); - int saxis_r = 306, saxis_g = 601, saxis_b = 117; - if (k >= 2) - { - float m = 1024.0f / k; - saxis_r = (int)(xr * m); - saxis_g = (int)(xg * m); - saxis_b = (int)(xb * m); - } - - int low_dot = INT_MAX, high_dot = INT_MIN; - for (uint32_t i = 0; i < 16; i++) - { - int r = (int)pSrc_pixels[i].r, g = (int)pSrc_pixels[i].g, b = (int)pSrc_pixels[i].b; - - if ((r | g | b) < 4) - continue; - - int dot = r * saxis_r + g * saxis_g + b * saxis_b; - if (dot < low_dot) - { - low_dot = dot; - low_c = i; - } - if 
(dot > high_dot) - { - high_dot = dot; - high_c = i; - } - } - - int lr = to_5(pSrc_pixels[low_c].r); - int lg = to_6(pSrc_pixels[low_c].g); - int lb = to_5(pSrc_pixels[low_c].b); - - int hr = to_5(pSrc_pixels[high_c].r); - int hg = to_6(pSrc_pixels[high_c].g); - int hb = to_5(pSrc_pixels[high_c].b); - - uint8_t trial_sels[16]; - uint32_t trial_err = bc1_find_sels3_fullerr(true, pSrc_pixels, lr, lg, lb, hr, hg, hb, trial_sels, UINT32_MAX); - - if (trial_err) - { - const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1; - for (uint32_t trials = 0; trials < total_ls_passes; trials++) - { - vec3F xl, xh; - int lr2, lg2, lb2, hr2, hg2, hb2; - if (!compute_least_squares_endpoints3_rgb(true, pSrc_pixels, trial_sels, &xl, &xh)) - { - lr2 = g_bc1_match5_half[avg_r].m_hi; - lg2 = g_bc1_match6_half[avg_g].m_hi; - lb2 = g_bc1_match5_half[avg_b].m_hi; - - hr2 = g_bc1_match5_half[avg_r].m_lo; - hg2 = g_bc1_match6_half[avg_g].m_lo; - hb2 = g_bc1_match5_half[avg_b].m_lo; - } - else - { - precise_round_565(xl, xh, hr2, hg2, hb2, lr2, lg2, lb2); - } - - if ((lr == lr2) && (lg == lg2) && (lb == lb2) && (hr == hr2) && (hg == hg2) && (hb == hb2)) - break; - - uint8_t trial_sels2[16]; - uint32_t trial_err2 = bc1_find_sels3_fullerr(true, pSrc_pixels, lr2, lg2, lb2, hr2, hg2, hb2, trial_sels2, trial_err); - - if (trial_err2 < trial_err) - { - trial_err = trial_err2; - lr = lr2; lg = lg2; lb = lb2; - hr = hr2; hg = hg2; hb = hb2; - memcpy(trial_sels, trial_sels2, sizeof(trial_sels)); - } - else - break; - } - } - - if (trial_err < cur_err) - { - results.m_3color = true; - results.lr = lr; - results.lg = lg; - results.lb = lb; - results.hr = hr; - results.hg = hg; - results.hb = hb; - memcpy(results.sels, trial_sels, 16); - - cur_err = trial_err; - - return true; - } - - return false; - } - - static bool try_3color_block(const color32* pSrc_pixels, uint32_t flags, uint32_t &cur_err, - int avg_r, int avg_g, int avg_b, int lr, int lg, int lb, int hr, int hg, int hb, 
int total_r, int total_g, int total_b, uint32_t total_orderings_to_try, - bc1_encode_results &results) - { - uint8_t trial_sels[16]; - uint32_t trial_err = bc1_find_sels3_fullerr(false, pSrc_pixels, lr, lg, lb, hr, hg, hb, trial_sels, UINT32_MAX); - - if (trial_err) - { - const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1; - for (uint32_t trials = 0; trials < total_ls_passes; trials++) - { - vec3F xl, xh; - int lr2, lg2, lb2, hr2, hg2, hb2; - if (!compute_least_squares_endpoints3_rgb(false, pSrc_pixels, trial_sels, &xl, &xh)) - { - lr2 = g_bc1_match5_half[avg_r].m_hi; - lg2 = g_bc1_match6_half[avg_g].m_hi; - lb2 = g_bc1_match5_half[avg_b].m_hi; - - hr2 = g_bc1_match5_half[avg_r].m_lo; - hg2 = g_bc1_match6_half[avg_g].m_lo; - hb2 = g_bc1_match5_half[avg_b].m_lo; - } - else - { - precise_round_565(xl, xh, hr2, hg2, hb2, lr2, lg2, lb2); - } - - if ((lr == lr2) && (lg == lg2) && (lb == lb2) && (hr == hr2) && (hg == hg2) && (hb == hb2)) - break; - - uint8_t trial_sels2[16]; - uint32_t trial_err2 = bc1_find_sels3_fullerr(false, pSrc_pixels, lr2, lg2, lb2, hr2, hg2, hb2, trial_sels2, trial_err); - - if (trial_err2 < trial_err) - { - trial_err = trial_err2; - lr = lr2; lg = lg2; lb = lb2; - hr = hr2; hg = hg2; hb = hb2; - memcpy(trial_sels, trial_sels2, sizeof(trial_sels)); - } - else - break; - } - } - - if ((trial_err) && (flags & cEncodeBC1UseLikelyTotalOrderings) && (total_orderings_to_try)) - { - hist3 h; - for (uint32_t i = 0; i < 16; i++) - { - assert(trial_sels[i] < 3); - h.m_hist[trial_sels[i]]++; - } - - const uint32_t orig_total_order_index = h.lookup_total_ordering_index(); - - int r0, g0, b0, r3, g3, b3; - r0 = (lr << 3) | (lr >> 2); g0 = (lg << 2) | (lg >> 4); b0 = (lb << 3) | (lb >> 2); - r3 = (hr << 3) | (hr >> 2); g3 = (hg << 2) | (hg >> 4); b3 = (hb << 3) | (hb >> 2); - - int ar = r3 - r0, ag = g3 - g0, ab = b3 - b0; - - int dots[16]; - for (uint32_t i = 0; i < 16; i++) - { - int r = pSrc_pixels[i].r; - int g = 
pSrc_pixels[i].g; - int b = pSrc_pixels[i].b; - int d = 0x1000000 + (r * ar + g * ag + b * ab); - assert(d >= 0); - dots[i] = (d << 4) + i; - } - - std::sort(dots, dots + 16); - - uint32_t r_sum[17], g_sum[17], b_sum[17]; - uint32_t r = 0, g = 0, b = 0; - for (uint32_t i = 0; i < 16; i++) - { - const uint32_t p = dots[i] & 15; - - r_sum[i] = r; - g_sum[i] = g; - b_sum[i] = b; - - r += pSrc_pixels[p].r; - g += pSrc_pixels[p].g; - b += pSrc_pixels[p].b; - } - - r_sum[16] = total_r; - g_sum[16] = total_g; - b_sum[16] = total_b; - - const uint32_t q_total = (flags & cEncodeBC1Exhaustive) ? NUM_UNIQUE_TOTAL_ORDERINGS3 : std::min(total_orderings_to_try, MAX_TOTAL_ORDERINGS3); - for (uint32_t q = 0; q < q_total; q++) - { - const uint32_t s = (flags & cEncodeBC1Exhaustive) ? q : g_best_total_orderings3[orig_total_order_index][q]; - - int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; - - vec3F xl, xh; - - if ((s == TOTAL_ORDER_3_0_16) || (s == TOTAL_ORDER_3_1_16) || (s == TOTAL_ORDER_3_2_16)) - { - trial_lr = g_bc1_match5_half[avg_r].m_hi; - trial_lg = g_bc1_match6_half[avg_g].m_hi; - trial_lb = g_bc1_match5_half[avg_b].m_hi; - - trial_hr = g_bc1_match5_half[avg_r].m_lo; - trial_hg = g_bc1_match6_half[avg_g].m_lo; - trial_hb = g_bc1_match5_half[avg_b].m_lo; - } - else - { - compute_least_squares_endpoints3_rgb(&xl, &xh, total_r, total_g, total_b, - g_selector_factors3[s][0], g_selector_factors3[s][1], g_selector_factors3[s][2], s, r_sum, g_sum, b_sum); - - precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); - } - - uint8_t trial_sels2[16]; - uint32_t trial_err2 = bc1_find_sels3_fullerr(false, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels2, UINT32_MAX); - - if (trial_err2 < trial_err) - { - trial_err = trial_err2; - - lr = trial_lr; - lg = trial_lg; - lb = trial_lb; - - hr = trial_hr; - hg = trial_hg; - hb = trial_hb; - - memcpy(trial_sels, trial_sels2, sizeof(trial_sels)); - } - - } // s 
- } - - if (trial_err < cur_err) - { - results.m_3color = true; - results.lr = lr; - results.lg = lg; - results.lb = lb; - results.hr = hr; - results.hg = hg; - results.hb = hb; - memcpy(results.sels, trial_sels, 16); - - cur_err = trial_err; - - return true; - } - - return false; - } - - void encode_bc1(uint32_t level, void* pDst, const uint8_t* pPixels, bool allow_3color, bool allow_transparent_texels_for_black) - { - uint32_t flags = 0, total_orderings4 = 1, total_orderings3 = 1; - - static_assert(MAX_TOTAL_ORDERINGS3 >= 32, "MAX_TOTAL_ORDERINGS3 >= 32"); - static_assert(MAX_TOTAL_ORDERINGS4 >= 32, "MAX_TOTAL_ORDERINGS4 >= 32"); - - switch (level) - { - case 0: - // Faster/higher quality than stb_dxt default. - flags = cEncodeBC1BoundingBoxInt; - break; - case 1: - // Faster/higher quality than stb_dxt default. A bit higher average quality vs. mode 0. - flags = cEncodeBC1Use2DLS; - break; - case 2: - // On average mode 2 is a little weaker than modes 0/1, but it's stronger on outliers (very tough textures). - // Slightly stronger than stb_dxt. - flags = 0; - break; - case 3: - // Slightly stronger than stb_dxt HIGHQUAL. - flags = cEncodeBC1TwoLeastSquaresPasses; - break; - case 4: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1Use6PowerIters; - break; - default: - case 5: - // stb_dxt HIGHQUAL + permit 3 color (if it's enabled). - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - break; - case 6: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - break; - case 7: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 4; - break; - case 8: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 8; - break; - case 9: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 11; - total_orderings3 = 3; - break; - case 10: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 20; - total_orderings3 = 8; - break; - case 11: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 28; - total_orderings3 = 16; - break; - case 12: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 32; - total_orderings3 = 32; - break; - case 13: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (20 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 32; - total_orderings3 = 32; - break; - case 14: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (32 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 32; - total_orderings3 = 32; - break; - case 15: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (32 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = ((((32 + MAX_TOTAL_ORDERINGS4) / 2) + 32) / 2); - total_orderings3 = 32; - break; - case 16: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = (32 + MAX_TOTAL_ORDERINGS4) / 2; - total_orderings3 = 32; - break; - case 17: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = MAX_TOTAL_ORDERINGS4; - total_orderings3 = 32; - break; - case 18: - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | cEncodeBC1Iterative | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = MAX_TOTAL_ORDERINGS4; - total_orderings3 = 32; - break; - case 19: - // This hidden mode is *extremely* slow and abuses the encoder. It's just for testing/training. - flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters | cEncodeBC1Exhaustive | cEncodeBC1Iterative | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts; - flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? 
cEncodeBC1Use3ColorBlocksForBlackPixels : 0); - total_orderings4 = 32; - total_orderings3 = 32; - break; - } - - encode_bc1(pDst, pPixels, flags, total_orderings4, total_orderings3); - } - - static inline void encode_bc1_pick_initial(const color32 *pSrc_pixels, uint32_t flags, bool grayscale_flag, - int min_r, int min_g, int min_b, int max_r, int max_g, int max_b, - int avg_r, int avg_g, int avg_b, int total_r, int total_g, int total_b, - int &lr, int &lg, int &lb, int &hr, int &hg, int &hb) - { - if (grayscale_flag) - { - const int fr = pSrc_pixels[0].r; - - // Grayscale blocks are a common enough case to specialize. - if ((max_r - min_r) < 2) - { - lr = lb = hr = hb = to_5(fr); - lg = hg = to_6(fr); - } - else - { - lr = lb = to_5(min_r); - lg = to_6(min_r); - - hr = hb = to_5(max_r); - hg = to_6(max_r); - } - } - else if (flags & cEncodeBC1Use2DLS) - { - // 2D Least Squares approach from Humus's example, with added inset and optimal rounding. - int big_chan = 0, min_chan_val = min_r, max_chan_val = max_r; - if ((max_g - min_g) > (max_chan_val - min_chan_val)) - { - big_chan = 1; min_chan_val = min_g; max_chan_val = max_g; - } - if ((max_b - min_b) > (max_chan_val - min_chan_val)) - { - big_chan = 2; min_chan_val = min_b; max_chan_val = max_b; - } - int sum_xy_r = 0, sum_xy_g = 0, sum_xy_b = 0; - vec3F l, h; - if (big_chan == 0) - { - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - sum_xy_r += r * r; sum_xy_g += r * g; sum_xy_b += r * b; - } - - int sum_x = total_r; - int sum_x2 = sum_xy_r; - - float div = (float)(16 * sum_x2 - sum_x * sum_x); - float b_y = 0.0f, b_z = 0.0f; - if (fabs(div) > 1e-8f) - { - div = 1.0f / div; - b_y = (16 * sum_xy_g - sum_x * total_g) * div; - b_z = (16 * sum_xy_b - sum_x * total_b) * div; - } - - float a_y = (total_g - b_y * sum_x) / 16.0f; - float a_z = (total_b - b_z * sum_x) / 16.0f; - - l.c[1] = a_y + b_y * min_chan_val; - l.c[2] = a_z + b_z * min_chan_val; - 
- h.c[1] = a_y + b_y * max_chan_val; - h.c[2] = a_z + b_z * max_chan_val; - - float dg = (h.c[1] - l.c[1]); - float db = (h.c[2] - l.c[2]); - - h.c[1] = l.c[1] + dg * (15.0f/16.0f); - h.c[2] = l.c[2] + db * (15.0f/16.0f); - - l.c[1] = l.c[1] + dg * (1.0f/16.0f); - l.c[2] = l.c[2] + db * (1.0f/16.0f); - - float d = (float)(max_chan_val - min_chan_val); - float fmin_chan_val = min_chan_val + d * (1.0f/16.0f); - float fmax_chan_val = min_chan_val + d * (15.0f/16.0f); - - l.c[0] = fmin_chan_val; - h.c[0] = fmax_chan_val; - } - else if (big_chan == 1) - { - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - sum_xy_r += g * r; sum_xy_g += g * g; sum_xy_b += g * b; - } - - int sum_x = total_g; - int sum_x2 = sum_xy_g; - - float div = (float)(16 * sum_x2 - sum_x * sum_x); - float b_x = 0.0f, b_z = 0.0f; - if (fabs(div) > 1e-8f) - { - div = 1.0f / div; - b_x = (16 * sum_xy_r - sum_x * total_r) * div; - b_z = (16 * sum_xy_b - sum_x * total_b) * div; - } - - float a_x = (total_r - b_x * sum_x) / 16.0f; - float a_z = (total_b - b_z * sum_x) / 16.0f; - - l.c[0] = a_x + b_x * min_chan_val; - l.c[2] = a_z + b_z * min_chan_val; - - h.c[0] = a_x + b_x * max_chan_val; - h.c[2] = a_z + b_z * max_chan_val; - - float dr = (h.c[0] - l.c[0]); - float db = (h.c[2] - l.c[2]); - - h.c[0] = l.c[0] + dr * (15.0f/16.0f); - h.c[2] = l.c[2] + db * (15.0f/16.0f); - - l.c[0] = l.c[0] + dr * (1.0f/16.0f); - l.c[2] = l.c[2] + db * (1.0f/16.0f); - - float d = (float)(max_chan_val - min_chan_val); - float fmin_chan_val = min_chan_val + d * (1.0f/16.0f); - float fmax_chan_val = min_chan_val + d * (15.0f/16.0f); - - l.c[1] = fmin_chan_val; - h.c[1] = fmax_chan_val; - } - else - { - for (uint32_t i = 0; i < 16; i++) - { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - sum_xy_r += b * r; sum_xy_g += b * g; sum_xy_b += b * b; - } - - int sum_x = total_b; - int sum_x2 = sum_xy_b; - - float div = (float)(16 * 
sum_x2 - sum_x * sum_x); - float b_x = 0.0f, b_y = 0.0f; - if (fabs(div) > 1e-8f) - { - div = 1.0f / div; - b_x = (16 * sum_xy_r - sum_x * total_r) * div; - b_y = (16 * sum_xy_g - sum_x * total_g) * div; - } - - float a_x = (total_r - b_x * sum_x) / 16.0f; - float a_y = (total_g - b_y * sum_x) / 16.0f; - - l.c[0] = a_x + b_x * min_chan_val; - l.c[1] = a_y + b_y * min_chan_val; - - h.c[0] = a_x + b_x * max_chan_val; - h.c[1] = a_y + b_y * max_chan_val; - - float dr = (h.c[0] - l.c[0]); - float dg = (h.c[1] - l.c[1]); - - h.c[0] = l.c[0] + dr * (15.0f/16.0f); - h.c[1] = l.c[1] + dg * (15.0f/16.0f); - - l.c[0] = l.c[0] + dr * (1.0f/16.0f); - l.c[1] = l.c[1] + dg * (1.0f/16.0f); - - float d = (float)(max_chan_val - min_chan_val); - float fmin_chan_val = min_chan_val + d * (1.0f/16.0f); - float fmax_chan_val = min_chan_val + d * (15.0f/16.0f); - - l.c[2] = fmin_chan_val; - h.c[2] = fmax_chan_val; - } - - precise_round_565_noscale(l, h, lr, lg, lb, hr, hg, hb); - } - else if (flags & cEncodeBC1BoundingBox) - { - // Algorithm from icbc.h compress_dxt1_fast() - vec3F l, h; - l.c[0] = min_r * (1.0f/255.0f); - l.c[1] = min_g * (1.0f/255.0f); - l.c[2] = min_b * (1.0f/255.0f); - - h.c[0] = max_r * (1.0f/255.0f); - h.c[1] = max_g * (1.0f/255.0f); - h.c[2] = max_b * (1.0f/255.0f); - - const float bias = 8.0f / 255.0f; - float inset_r = (h.c[0] - l.c[0] - bias) * (1.0f/16.0f); - float inset_g = (h.c[1] - l.c[1] - bias) * (1.0f/16.0f); - float inset_b = (h.c[2] - l.c[2] - bias) * (1.0f/16.0f); - - l.c[0] = clampf(l.c[0] + inset_r, 0.0f, 1.0f); - l.c[1] = clampf(l.c[1] + inset_g, 0.0f, 1.0f); - l.c[2] = clampf(l.c[2] + inset_b, 0.0f, 1.0f); - - h.c[0] = clampf(h.c[0] - inset_r, 0.0f, 1.0f); - h.c[1] = clampf(h.c[1] - inset_g, 0.0f, 1.0f); - h.c[2] = clampf(h.c[2] - inset_b, 0.0f, 1.0f); - - int icov_xz = 0, icov_yz = 0; - for (uint32_t i = 0; i < 16; i++) - { - int r = (int)pSrc_pixels[i].r - avg_r; - int g = (int)pSrc_pixels[i].g - avg_g; - int b = (int)pSrc_pixels[i].b - avg_b; - 
icov_xz += r * b; - icov_yz += g * b; - } - - if (icov_xz < 0) - std::swap(l.c[0], h.c[0]); - - if (icov_yz < 0) - std::swap(l.c[1], h.c[1]); - - precise_round_565(l, h, lr, lg, lb, hr, hg, hb); - } - else if (flags & cEncodeBC1BoundingBoxInt) - { - // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer. - int inset_r = (max_r - min_r - 8) >> 4; - int inset_g = (max_g - min_g - 8) >> 4; - int inset_b = (max_b - min_b - 8) >> 4; - - min_r += inset_r; - min_g += inset_g; - min_b += inset_b; - if ((uint32_t)(min_r | min_g | min_b) > 255U) - { - min_r = clampi(min_r, 0, 255); - min_g = clampi(min_g, 0, 255); - min_b = clampi(min_b, 0, 255); - } - - max_r -= inset_r; - max_g -= inset_g; - max_b -= inset_b; - if ((uint32_t)(max_r | max_g | max_b) > 255U) - { - max_r = clampi(max_r, 0, 255); - max_g = clampi(max_g, 0, 255); - max_b = clampi(max_b, 0, 255); - } - - int icov_xz = 0, icov_yz = 0; - for (uint32_t i = 0; i < 16; i++) - { - int r = (int)pSrc_pixels[i].r - avg_r; - int g = (int)pSrc_pixels[i].g - avg_g; - int b = (int)pSrc_pixels[i].b - avg_b; - icov_xz += r * b; - icov_yz += g * b; - } - - int x0 = min_r; - int y0 = min_g; - int x1 = max_r; - int y1 = max_g; - - if (icov_xz < 0) - std::swap(x0, x1); - - if (icov_yz < 0) - std::swap(y0, y1); - - lr = to_5(x0); - lg = to_6(y0); - lb = to_5(min_b); - - hr = to_5(x1); - hg = to_6(y1); - hb = to_5(max_b); - } - else - { - // Select 2 colors along the principle axis. (There must be a faster/simpler way.) 
- uint32_t low_c = 0, high_c = 0; - - int icov[6] = { 0, 0, 0, 0, 0, 0 }; - for (uint32_t i = 0; i < 16; i++) - { - int r = (int)pSrc_pixels[i].r - avg_r; - int g = (int)pSrc_pixels[i].g - avg_g; - int b = (int)pSrc_pixels[i].b - avg_b; - icov[0] += r * r; - icov[1] += r * g; - icov[2] += r * b; - icov[3] += g * g; - icov[4] += g * b; - icov[5] += b * b; - } - - int saxis_r = 306, saxis_g = 601, saxis_b = 117; - - float xr = (float)(max_r - min_r); - float xg = (float)(max_g - min_g); - float xb = (float)(max_b - min_b); - - if (icov[2] < 0) - xr = -xr; - - if (icov[4] < 0) - xg = -xg; - - float cov[6]; - for (uint32_t i = 0; i < 6; i++) - cov[i] = (float)(icov[i]) * (1.0f / 255.0f); - - const uint32_t total_power_iters = (flags & cEncodeBC1Use6PowerIters) ? 6 : 4; - for (uint32_t power_iter = 0; power_iter < total_power_iters; power_iter++) - { - float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; - float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; - float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; - xr = r; xg = g; xb = b; - } - - float k = maximum(fabsf(xr), fabsf(xg), fabsf(xb)); - if (k >= 2) - { - float m = 2048.0f / k; - saxis_r = (int)(xr * m); - saxis_g = (int)(xg * m); - saxis_b = (int)(xb * m); - } - - int low_dot = INT_MAX, high_dot = INT_MIN; - - saxis_r = (int)((uint32_t)saxis_r << 4U); - saxis_g = (int)((uint32_t)saxis_g << 4U); - saxis_b = (int)((uint32_t)saxis_b << 4U); - - for (uint32_t i = 0; i < 16; i += 4) - { - int dot0 = ((pSrc_pixels[i].r * saxis_r + pSrc_pixels[i].g * saxis_g + pSrc_pixels[i].b * saxis_b) & ~0xF) + i; - int dot1 = ((pSrc_pixels[i + 1].r * saxis_r + pSrc_pixels[i + 1].g * saxis_g + pSrc_pixels[i + 1].b * saxis_b) & ~0xF) + i + 1; - int dot2 = ((pSrc_pixels[i + 2].r * saxis_r + pSrc_pixels[i + 2].g * saxis_g + pSrc_pixels[i + 2].b * saxis_b) & ~0xF) + i + 2; - int dot3 = ((pSrc_pixels[i + 3].r * saxis_r + pSrc_pixels[i + 3].g * saxis_g + pSrc_pixels[i + 3].b * saxis_b) & ~0xF) + i + 3; - - int min_d01 = std::min(dot0, 
dot1); - int max_d01 = std::max(dot0, dot1); - - int min_d23 = std::min(dot2, dot3); - int max_d23 = std::max(dot2, dot3); - - int min_d = std::min(min_d01, min_d23); - int max_d = std::max(max_d01, max_d23); - - low_dot = std::min(low_dot, min_d); - high_dot = std::max(high_dot, max_d); - } - low_c = low_dot & 15; - high_c = high_dot & 15; - - lr = to_5(pSrc_pixels[low_c].r); - lg = to_6(pSrc_pixels[low_c].g); - lb = to_5(pSrc_pixels[low_c].b); - - hr = to_5(pSrc_pixels[high_c].r); - hg = to_6(pSrc_pixels[high_c].g); - hb = to_5(pSrc_pixels[high_c].b); - } - } - - static const int8_t s_adjacent_voxels[16][4] = - { - { 1,0,0, 3 }, // 0 - { 0,1,0, 4 }, // 1 - { 0,0,1, 5 }, // 2 - { -1,0,0, 0 }, // 3 - { 0,-1,0, 1 }, // 4 - { 0,0,-1, 2 }, // 5 - { 1,1,0, 9 }, // 6 - { 1,0,1, 10 }, // 7 - { 0,1,1, 11 }, // 8 - { -1,-1,0, 6 }, // 9 - { -1,0,-1, 7 }, // 10 - { 0,-1,-1, 8 }, // 11 - { -1,1,0, 13 }, // 12 - { 1,-1,0, 12 }, // 13 - { 0,-1,1, 15 }, // 14 - { 0,1,-1, 14 }, // 15 - }; - - // From icbc's high quality mode. 
- static inline void encode_bc1_endpoint_search(const color32 *pSrc_pixels, bool any_black_pixels, - uint32_t flags, bc1_encode_results &results, uint32_t cur_err) - { - int &lr = results.lr, &lg = results.lg, &lb = results.lb, &hr = results.hr, &hg = results.hg, &hb = results.hb; - uint8_t *sels = results.sels; - - int prev_improvement_index = 0, forbidden_direction = -1; - - const int endpoint_search_rounds = (flags & cEncodeBC1EndpointSearchRoundsMask) >> cEncodeBC1EndpointSearchRoundsShift; - for (int i = 0; i < endpoint_search_rounds; i++) - { - assert(s_adjacent_voxels[ s_adjacent_voxels[i & 15][3] ][3] == (i & 15)); - - if (forbidden_direction == (i & 31)) - continue; - - const int8_t delta[3] = { s_adjacent_voxels[i & 15][0], s_adjacent_voxels[i & 15][1], s_adjacent_voxels[i & 15][2] }; - - int trial_lr = lr, trial_lg = lg, trial_lb = lb, trial_hr = hr, trial_hg = hg, trial_hb = hb; - - if ((i >> 4) & 1) - { - trial_lr = clampi(trial_lr + delta[0], 0, 31); - trial_lg = clampi(trial_lg + delta[1], 0, 63); - trial_lb = clampi(trial_lb + delta[2], 0, 31); - } - else - { - trial_hr = clampi(trial_hr + delta[0], 0, 31); - trial_hg = clampi(trial_hg + delta[1], 0, 63); - trial_hb = clampi(trial_hb + delta[2], 0, 31); - } - - uint8_t trial_sels[16]; - - uint32_t trial_err; - if (results.m_3color) - { - trial_err = bc1_find_sels3_fullerr( - ((any_black_pixels) && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)), - pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err); - } - else - { - trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err); - } - - if (trial_err < cur_err) - { - cur_err = trial_err; - - forbidden_direction = s_adjacent_voxels[i & 15][3] | (i & 16); - - lr = trial_lr; lg = trial_lg; lb = trial_lb; hr = trial_hr; hg = trial_hg; hb = trial_hb; - - memcpy(sels, trial_sels, 16); - - prev_improvement_index = i; - } - - if (i - 
prev_improvement_index > 32) - break; - } - } - - void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try, uint32_t total_orderings_to_try3) - { - assert(g_initialized); - - const color32* pSrc_pixels = (const color32*)pPixels; - bc1_block* pDst_block = static_cast(pDst); - - int avg_r, avg_g, avg_b, min_r, min_g, min_b, max_r, max_g, max_b; - - const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; - - uint32_t j; - for (j = 15; j >= 1; --j) - if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) - break; - - if (j == 0) - { - encode_bc1_solid_block(pDst, fr, fg, fb, (flags & (cEncodeBC1Use3ColorBlocks | cEncodeBC1Use3ColorBlocksForBlackPixels)) != 0); - return; - } - - int total_r = fr, total_g = fg, total_b = fb; - - max_r = fr; max_g = fg; max_b = fb; - min_r = fr; min_g = fg; min_b = fb; - - uint32_t grayscale_flag = (fr == fg) && (fr == fb); - uint32_t any_black_pixels = (fr | fg | fb) < 4; - - for (uint32_t i = 1; i < 16; i++) - { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - - grayscale_flag &= ((r == g) && (r == b)); - any_black_pixels |= ((r | g | b) < 4); - - max_r = std::max(max_r, r); max_g = std::max(max_g, g); max_b = std::max(max_b, b); - min_r = std::min(min_r, r); min_g = std::min(min_g, g); min_b = std::min(min_b, b); - total_r += r; total_g += g; total_b += b; - } - - avg_r = (total_r + 8) >> 4; avg_g = (total_g + 8) >> 4; avg_b = (total_b + 8) >> 4; - - bc1_encode_results results; - results.m_3color = false; - - uint8_t *sels = results.sels; - int &lr = results.lr, &lg = results.lg, &lb = results.lb, &hr = results.hr, &hg = results.hg, &hb = results.hb; - int orig_lr = 0, orig_lg = 0, orig_lb = 0, orig_hr = 0, orig_hg = 0, orig_hb = 0; - - lr = 0; lg = 0; lb = 0; hr = 0; hg = 0; hb = 0; - - const bool needs_block_error = ((flags & (cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use3ColorBlocks | 
cEncodeBC1UseFullMSEEval | cEncodeBC1EndpointSearchRoundsMask)) != 0) || - (any_black_pixels && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)); - - uint32_t cur_err = UINT32_MAX; - - if (!needs_block_error) - { - assert((flags & cEncodeBC1TryAllInitialEndponts) == 0); - - encode_bc1_pick_initial(pSrc_pixels, flags, grayscale_flag != 0, - min_r, min_g, min_b, max_r, max_g, max_b, - avg_r, avg_g, avg_b, total_r, total_g, total_b, - lr, lg, lb, hr, hg, hb); - - orig_lr = lr; orig_lg = lg; orig_lb = lb; orig_hr = hr; orig_hg = hg; orig_hb = hb; - - bc1_find_sels4_noerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); - - const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1; - for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) - { - int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; - - vec3F xl, xh; - if (!compute_least_squares_endpoints4_rgb(pSrc_pixels, sels, &xl, &xh, total_r, total_g, total_b)) - { - // All selectors equal - treat it as a solid block which should always be equal or better. - trial_lr = g_bc1_match5_equals_1[avg_r].m_hi; - trial_lg = g_bc1_match6_equals_1[avg_g].m_hi; - trial_lb = g_bc1_match5_equals_1[avg_b].m_hi; - - trial_hr = g_bc1_match5_equals_1[avg_r].m_lo; - trial_hg = g_bc1_match6_equals_1[avg_g].m_lo; - trial_hb = g_bc1_match5_equals_1[avg_b].m_lo; - - // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. 
- } - else - { - precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); - } - - if ((lr == trial_lr) && (lg == trial_lg) && (lb == trial_lb) && (hr == trial_hr) && (hg == trial_hg) && (hb == trial_hb)) - break; - - bc1_find_sels4_noerr(pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, sels); - - lr = trial_lr; - lg = trial_lg; - lb = trial_lb; - hr = trial_hr; - hg = trial_hg; - hb = trial_hb; - - } // ls_pass - } - else - { - const uint32_t total_rounds = (flags & cEncodeBC1TryAllInitialEndponts) ? 2 : 1; - for (uint32_t round = 0; round < total_rounds; round++) - { - uint32_t modified_flags = flags; - if (round == 1) - { - modified_flags &= ~(cEncodeBC1Use2DLS | cEncodeBC1BoundingBox); - modified_flags |= cEncodeBC1BoundingBox; - } - - int round_lr, round_lg, round_lb, round_hr, round_hg, round_hb; - uint8_t round_sels[16]; - - encode_bc1_pick_initial(pSrc_pixels, modified_flags, grayscale_flag != 0, - min_r, min_g, min_b, max_r, max_g, max_b, - avg_r, avg_g, avg_b, total_r, total_g, total_b, - round_lr, round_lg, round_lb, round_hr, round_hg, round_hb); - - int orig_round_lr = round_lr, orig_round_lg = round_lg, orig_round_lb = round_lb, orig_round_hr = round_hr, orig_round_hg = round_hg, orig_round_hb = round_hb; - - uint32_t round_err = bc1_find_sels4(flags, pSrc_pixels, round_lr, round_lg, round_lb, round_hr, round_hg, round_hb, round_sels, UINT32_MAX); - - const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1; - for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) - { - int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; - - vec3F xl, xh; - if (!compute_least_squares_endpoints4_rgb(pSrc_pixels, round_sels, &xl, &xh, total_r, total_g, total_b)) - { - // All selectors equal - treat it as a solid block which should always be equal or better. 
- trial_lr = g_bc1_match5_equals_1[avg_r].m_hi; - trial_lg = g_bc1_match6_equals_1[avg_g].m_hi; - trial_lb = g_bc1_match5_equals_1[avg_b].m_hi; - - trial_hr = g_bc1_match5_equals_1[avg_r].m_lo; - trial_hg = g_bc1_match6_equals_1[avg_g].m_lo; - trial_hb = g_bc1_match5_equals_1[avg_b].m_lo; - - // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. - } - else - { - precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); - } - - if ((round_lr == trial_lr) && (round_lg == trial_lg) && (round_lb == trial_lb) && (round_hr == trial_hr) && (round_hg == trial_hg) && (round_hb == trial_hb)) - break; - - uint8_t trial_sels[16]; - uint32_t trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, round_err); - - if (trial_err < round_err) - { - round_lr = trial_lr; - round_lg = trial_lg; - round_lb = trial_lb; - - round_hr = trial_hr; - round_hg = trial_hg; - round_hb = trial_hb; - - round_err = trial_err; - memcpy(round_sels, trial_sels, 16); - } - else - break; - - } // ls_pass - - if (round_err <= cur_err) - { - cur_err = round_err; - - lr = round_lr; - lg = round_lg; - lb = round_lb; - hr = round_hr; - hg = round_hg; - hb = round_hb; - - orig_lr = orig_round_lr; - orig_lg = orig_round_lg; - orig_lb = orig_round_lb; - orig_hr = orig_round_hr; - orig_hg = orig_round_hg; - orig_hb = orig_round_hb; - - memcpy(sels, round_sels, 16); - } - - } // round - } - - if ((cur_err) && (flags & cEncodeBC1UseLikelyTotalOrderings)) - { - assert(needs_block_error); - - const uint32_t total_iters = (flags & cEncodeBC1Iterative) ? 
2 : 1; - for (uint32_t iter_index = 0; iter_index < total_iters; iter_index++) - { - const uint32_t orig_err = cur_err; - - hist4 h; - for (uint32_t i = 0; i < 16; i++) - { - assert(sels[i] < 4); - h.m_hist[sels[i]]++; - } - - const uint32_t orig_total_order_index = h.lookup_total_ordering_index(); - - int r0, g0, b0, r3, g3, b3; - r0 = (lr << 3) | (lr >> 2); g0 = (lg << 2) | (lg >> 4); b0 = (lb << 3) | (lb >> 2); - r3 = (hr << 3) | (hr >> 2); g3 = (hg << 2) | (hg >> 4); b3 = (hb << 3) | (hb >> 2); - - int ar = r3 - r0, ag = g3 - g0, ab = b3 - b0; - - int dots[16]; - for (uint32_t i = 0; i < 16; i++) - { - int r = pSrc_pixels[i].r; - int g = pSrc_pixels[i].g; - int b = pSrc_pixels[i].b; - int d = 0x1000000 + (r * ar + g * ag + b * ab); - assert(d >= 0); - dots[i] = (d << 4) + i; - } - - std::sort(dots, dots + 16); - - uint32_t r_sum[17], g_sum[17], b_sum[17]; - uint32_t r = 0, g = 0, b = 0; - for (uint32_t i = 0; i < 16; i++) - { - const uint32_t p = dots[i] & 15; - - r_sum[i] = r; - g_sum[i] = g; - b_sum[i] = b; - - r += pSrc_pixels[p].r; - g += pSrc_pixels[p].g; - b += pSrc_pixels[p].b; - } - - r_sum[16] = total_r; - g_sum[16] = total_g; - b_sum[16] = total_b; - - const uint32_t q_total = (flags & cEncodeBC1Exhaustive) ? NUM_UNIQUE_TOTAL_ORDERINGS4 : clampi(total_orderings_to_try, MIN_TOTAL_ORDERINGS, MAX_TOTAL_ORDERINGS4); - for (uint32_t q = 0; q < q_total; q++) - { - const uint32_t s = (flags & cEncodeBC1Exhaustive) ? 
q : g_best_total_orderings4[orig_total_order_index][q]; - - int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb; - - vec3F xl, xh; - - if ((s == TOTAL_ORDER_4_0_16) || (s == TOTAL_ORDER_4_1_16) || (s == TOTAL_ORDER_4_2_16) || (s == TOTAL_ORDER_4_3_16)) - { - trial_lr = g_bc1_match5_equals_1[avg_r].m_hi; - trial_lg = g_bc1_match6_equals_1[avg_g].m_hi; - trial_lb = g_bc1_match5_equals_1[avg_b].m_hi; - - trial_hr = g_bc1_match5_equals_1[avg_r].m_lo; - trial_hg = g_bc1_match6_equals_1[avg_g].m_lo; - trial_hb = g_bc1_match5_equals_1[avg_b].m_lo; - } - else - { - compute_least_squares_endpoints4_rgb(&xl, &xh, total_r, total_g, total_b, - g_selector_factors4[s][0], g_selector_factors4[s][1], g_selector_factors4[s][2], s, r_sum, g_sum, b_sum); - - precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb); - } - - uint8_t trial_sels[16]; - - uint32_t trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err); - - if (trial_err < cur_err) - { - cur_err = trial_err; - - lr = trial_lr; - lg = trial_lg; - lb = trial_lb; - - hr = trial_hr; - hg = trial_hg; - hb = trial_hb; - - memcpy(sels, trial_sels, 16); - } - - } // s - - if ((!cur_err) || (cur_err == orig_err)) - break; - - } // iter_index - } - - if ( ((flags & (cEncodeBC1Use3ColorBlocks | cEncodeBC1Use3ColorBlocksForBlackPixels)) != 0) && (cur_err) ) - { - if (flags & cEncodeBC1Use3ColorBlocks) - { - assert(needs_block_error); - try_3color_block(pSrc_pixels, flags, cur_err, avg_r, avg_g, avg_b, orig_lr, orig_lg, orig_lb, orig_hr, orig_hg, orig_hb, total_r, total_g, total_b, total_orderings_to_try3, results); - } - - if ((any_black_pixels) && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)) - { - assert(needs_block_error); - try_3color_block_useblack(pSrc_pixels, flags, cur_err, results); - } - } - - if ( (flags & cEncodeBC1EndpointSearchRoundsMask) && (cur_err) ) - { - assert(needs_block_error); - - 
encode_bc1_endpoint_search(pSrc_pixels, any_black_pixels != 0, flags, results, cur_err); - } - - if (results.m_3color) - bc1_encode3(pDst_block, results.lr, results.lg, results.lb, results.hr, results.hg, results.hb, results.sels); - else - bc1_encode4(pDst_block, results.lr, results.lg, results.lb, results.hr, results.hg, results.hb, results.sels); - } - - // BC3-5 - - struct bc4_block - { - enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 }; - uint8_t m_endpoints[2]; - - uint8_t m_selectors[cTotalSelectorBytes]; - - inline uint32_t get_low_alpha() const { return m_endpoints[0]; } - inline uint32_t get_high_alpha() const { return m_endpoints[1]; } - inline bool is_alpha6_block() const { return get_low_alpha() <= get_high_alpha(); } - - inline uint64_t get_selector_bits() const - { - return ((uint64_t)((uint32_t)m_selectors[0] | ((uint32_t)m_selectors[1] << 8U) | ((uint32_t)m_selectors[2] << 16U) | ((uint32_t)m_selectors[3] << 24U))) | - (((uint64_t)m_selectors[4]) << 32U) | - (((uint64_t)m_selectors[5]) << 40U); - } - - inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const - { - assert((x < 4U) && (y < 4U)); - return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits))& (cMaxSelectorValues - 1); - } - - static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h) - { - pDst[0] = static_cast(l); - pDst[1] = static_cast(h); - pDst[2] = static_cast((l * 4 + h) / 5); - pDst[3] = static_cast((l * 3 + h * 2) / 5); - pDst[4] = static_cast((l * 2 + h * 3) / 5); - pDst[5] = static_cast((l + h * 4) / 5); - pDst[6] = 0; - pDst[7] = 255; - return 6; - } - - static inline uint32_t get_block_values8(uint8_t* pDst, uint32_t l, uint32_t h) - { - pDst[0] = static_cast(l); - pDst[1] = static_cast(h); - pDst[2] = static_cast((l * 6 + h) / 7); - pDst[3] = static_cast((l * 5 + h * 2) / 7); - pDst[4] = static_cast((l * 4 + h * 3) / 7); - pDst[5] = static_cast((l * 3 + h * 4) / 7); - pDst[6] = static_cast((l * 
2 + h * 5) / 7); - pDst[7] = static_cast((l + h * 6) / 7); - return 8; - } - - static inline uint32_t get_block_values(uint8_t* pDst, uint32_t l, uint32_t h) - { - if (l > h) - return get_block_values8(pDst, l, h); - else - return get_block_values6(pDst, l, h); - } - }; - - void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride) - { - assert(g_initialized); - - uint32_t min0_v, max0_v, min1_v, max1_v, min2_v, max2_v, min3_v, max3_v; - - { - min0_v = max0_v = pPixels[0 * stride]; - min1_v = max1_v = pPixels[1 * stride]; - min2_v = max2_v = pPixels[2 * stride]; - min3_v = max3_v = pPixels[3 * stride]; - } - - { - uint32_t v0 = pPixels[4 * stride]; min0_v = std::min(min0_v, v0); max0_v = std::max(max0_v, v0); - uint32_t v1 = pPixels[5 * stride]; min1_v = std::min(min1_v, v1); max1_v = std::max(max1_v, v1); - uint32_t v2 = pPixels[6 * stride]; min2_v = std::min(min2_v, v2); max2_v = std::max(max2_v, v2); - uint32_t v3 = pPixels[7 * stride]; min3_v = std::min(min3_v, v3); max3_v = std::max(max3_v, v3); - } - - { - uint32_t v0 = pPixels[8 * stride]; min0_v = std::min(min0_v, v0); max0_v = std::max(max0_v, v0); - uint32_t v1 = pPixels[9 * stride]; min1_v = std::min(min1_v, v1); max1_v = std::max(max1_v, v1); - uint32_t v2 = pPixels[10 * stride]; min2_v = std::min(min2_v, v2); max2_v = std::max(max2_v, v2); - uint32_t v3 = pPixels[11 * stride]; min3_v = std::min(min3_v, v3); max3_v = std::max(max3_v, v3); - } - - { - uint32_t v0 = pPixels[12 * stride]; min0_v = std::min(min0_v, v0); max0_v = std::max(max0_v, v0); - uint32_t v1 = pPixels[13 * stride]; min1_v = std::min(min1_v, v1); max1_v = std::max(max1_v, v1); - uint32_t v2 = pPixels[14 * stride]; min2_v = std::min(min2_v, v2); max2_v = std::max(max2_v, v2); - uint32_t v3 = pPixels[15 * stride]; min3_v = std::min(min3_v, v3); max3_v = std::max(max3_v, v3); - } - - const uint32_t min_v = minimum(min0_v, min1_v, min2_v, min3_v); - const uint32_t max_v = maximum(max0_v, max1_v, max2_v, max3_v); - - uint8_t* 
pDst_bytes = static_cast(pDst); - pDst_bytes[0] = (uint8_t)max_v; - pDst_bytes[1] = (uint8_t)min_v; - - if (max_v == min_v) - { - memset(pDst_bytes + 2, 0, 6); - return; - } - - const uint32_t delta = max_v - min_v; - - // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors. - const int t0 = delta * 13; - const int t1 = delta * 11; - const int t2 = delta * 9; - const int t3 = delta * 7; - const int t4 = delta * 5; - const int t5 = delta * 3; - const int t6 = delta * 1; - - // BC4 floors in its divisions, which we compensate for with the 4 bias. - // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one). - const int bias = 4 - min_v * 14; - - static const uint32_t s_tran0[8] = { 1U , 7U , 6U , 5U , 4U , 3U , 2U , 0U }; - static const uint32_t s_tran1[8] = { 1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U }; - static const uint32_t s_tran2[8] = { 1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U }; - static const uint32_t s_tran3[8] = { 1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U }; - - uint64_t a0, a1, a2, a3; - { - const int v0 = pPixels[0 * stride] * 14 + bias; - const int v1 = pPixels[1 * stride] * 14 + bias; - const int v2 = pPixels[2 * stride] * 14 + bias; - const int v3 = pPixels[3 * stride] * 14 + bias; - a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]; - a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]; - a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]; - a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]; - } - - { - const int v0 = pPixels[4 * stride] * 14 + bias; - const int v1 = pPixels[5 * stride] * 
14 + bias; - const int v2 = pPixels[6 * stride] * 14 + bias; - const int v3 = pPixels[7 * stride] * 14 + bias; - a0 |= (uint64_t)(s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U); - a1 |= (uint64_t)(s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U); - a2 |= (uint64_t)(s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U); - a3 |= (uint64_t)(s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U); - } - - { - const int v0 = pPixels[8 * stride] * 14 + bias; - const int v1 = pPixels[9 * stride] * 14 + bias; - const int v2 = pPixels[10 * stride] * 14 + bias; - const int v3 = pPixels[11 * stride] * 14 + bias; - a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U); - a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U); - a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U); - a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U); - } - - { - const int v0 = pPixels[12 * stride] * 14 + bias; - const int v1 = pPixels[13 * stride] * 14 + bias; - const int v2 = pPixels[14 * stride] * 14 + bias; - const int v3 = pPixels[15 * stride] * 14 + bias; - a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U); - a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U); - a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U); - a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= 
t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U); - } - - const uint64_t f = a0 | a1 | a2 | a3; - - pDst_bytes[2] = (uint8_t)f; - pDst_bytes[3] = (uint8_t)(f >> 8U); - pDst_bytes[4] = (uint8_t)(f >> 16U); - pDst_bytes[5] = (uint8_t)(f >> 24U); - pDst_bytes[6] = (uint8_t)(f >> 32U); - pDst_bytes[7] = (uint8_t)(f >> 40U); - } - - void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try) - { - assert(g_initialized); - - // 3-color blocks are not allowed with BC3 (on most GPU's). - flags &= ~(cEncodeBC1Use3ColorBlocksForBlackPixels | cEncodeBC1Use3ColorBlocks); - - encode_bc4(pDst, pPixels + 3, 4); - encode_bc1(static_cast(pDst) + 8, pPixels, flags, total_orderings_to_try); - } - - void encode_bc3(uint32_t level, void* pDst, const uint8_t* pPixels) - { - assert(g_initialized); - - encode_bc4(pDst, pPixels + 3, 4); - encode_bc1(level, static_cast(pDst) + 8, pPixels, false, false); - } - - void encode_bc5(void* pDst, const uint8_t* pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride) - { - assert(g_initialized); - - encode_bc4(pDst, pPixels + chan0, stride); - encode_bc4(static_cast(pDst) + 8, pPixels + chan1, stride); - } - - // Returns true if the block uses 3 color punchthrough alpha mode. 
- bool unpack_bc1(const void* pBlock_bits, void* pPixels, bool set_alpha, bc1_approx_mode mode) - { - color32* pDst_pixels = static_cast(pPixels); - - static_assert(sizeof(bc1_block) == 8, "sizeof(bc1_block) == 8"); - static_assert(sizeof(bc4_block) == 8, "sizeof(bc4_block) == 8"); - - const bc1_block* pBlock = static_cast(pBlock_bits); - - const uint32_t l = pBlock->get_low_color(); - const uint32_t h = pBlock->get_high_color(); - - color32 c[4]; - - const int cr0 = (l >> 11) & 31; - const int cg0 = (l >> 5) & 63; - const int cb0 = l & 31; - const int r0 = (cr0 << 3) | (cr0 >> 2); - const int g0 = (cg0 << 2) | (cg0 >> 4); - const int b0 = (cb0 << 3) | (cb0 >> 2); - - const int cr1 = (h >> 11) & 31; - const int cg1 = (h >> 5) & 63; - const int cb1 = h & 31; - const int r1 = (cr1 << 3) | (cr1 >> 2); - const int g1 = (cg1 << 2) | (cg1 >> 4); - const int b1 = (cb1 << 3) | (cb1 >> 2); - - bool used_punchthrough = false; - - if (l > h) - { - c[0].set_noclamp_rgba(r0, g0, b0, 255); - c[1].set_noclamp_rgba(r1, g1, b1, 255); - switch (mode) - { - case bc1_approx_mode::cBC1Ideal: - c[2].set_noclamp_rgba((r0 * 2 + r1) / 3, (g0 * 2 + g1) / 3, (b0 * 2 + b1) / 3, 255); - c[3].set_noclamp_rgba((r1 * 2 + r0) / 3, (g1 * 2 + g0) / 3, (b1 * 2 + b0) / 3, 255); - break; - case bc1_approx_mode::cBC1IdealRound4: - c[2].set_noclamp_rgba((r0 * 2 + r1 + 1) / 3, (g0 * 2 + g1 + 1) / 3, (b0 * 2 + b1 + 1) / 3, 255); - c[3].set_noclamp_rgba((r1 * 2 + r0 + 1) / 3, (g1 * 2 + g0 + 1) / 3, (b1 * 2 + b0 + 1) / 3, 255); - break; - case bc1_approx_mode::cBC1NVidia: - c[2].set_noclamp_rgba(interp_5_nv(cr0, cr1), interp_6_nv(g0, g1), interp_5_nv(cb0, cb1), 255); - c[3].set_noclamp_rgba(interp_5_nv(cr1, cr0), interp_6_nv(g1, g0), interp_5_nv(cb1, cb0), 255); - break; - case bc1_approx_mode::cBC1AMD: - c[2].set_noclamp_rgba(interp_5_6_amd(r0, r1), interp_5_6_amd(g0, g1), interp_5_6_amd(b0, b1), 255); - c[3].set_noclamp_rgba(interp_5_6_amd(r1, r0), interp_5_6_amd(g1, g0), interp_5_6_amd(b1, b0), 255); - 
break; - } - } - else - { - c[0].set_noclamp_rgba(r0, g0, b0, 255); - c[1].set_noclamp_rgba(r1, g1, b1, 255); - switch (mode) - { - case bc1_approx_mode::cBC1Ideal: - case bc1_approx_mode::cBC1IdealRound4: - c[2].set_noclamp_rgba((r0 + r1) / 2, (g0 + g1) / 2, (b0 + b1) / 2, 255); - break; - case bc1_approx_mode::cBC1NVidia: - c[2].set_noclamp_rgba(interp_half_5_nv(cr0, cr1), interp_half_6_nv(g0, g1), interp_half_5_nv(cb0, cb1), 255); - break; - case bc1_approx_mode::cBC1AMD: - c[2].set_noclamp_rgba(interp_half_5_6_amd(r0, r1), interp_half_5_6_amd(g0, g1), interp_half_5_6_amd(b0, b1), 255); - break; - } - - c[3].set_noclamp_rgba(0, 0, 0, 0); - used_punchthrough = true; - } - - if (set_alpha) - { - for (uint32_t y = 0; y < 4; y++, pDst_pixels += 4) - { - pDst_pixels[0] = c[pBlock->get_selector(0, y)]; - pDst_pixels[1] = c[pBlock->get_selector(1, y)]; - pDst_pixels[2] = c[pBlock->get_selector(2, y)]; - pDst_pixels[3] = c[pBlock->get_selector(3, y)]; - } - } - else - { - for (uint32_t y = 0; y < 4; y++, pDst_pixels += 4) - { - pDst_pixels[0].set_rgb(c[pBlock->get_selector(0, y)]); - pDst_pixels[1].set_rgb(c[pBlock->get_selector(1, y)]); - pDst_pixels[2].set_rgb(c[pBlock->get_selector(2, y)]); - pDst_pixels[3].set_rgb(c[pBlock->get_selector(3, y)]); - } - } - - return used_punchthrough; - } - - void unpack_bc4(const void* pBlock_bits, uint8_t* pPixels, uint32_t stride) - { - static_assert(sizeof(bc4_block) == 8, "sizeof(bc4_block) == 8"); - - const bc4_block* pBlock = static_cast(pBlock_bits); - - uint8_t sel_values[8]; - bc4_block::get_block_values(sel_values, pBlock->get_low_alpha(), pBlock->get_high_alpha()); - - const uint64_t selector_bits = pBlock->get_selector_bits(); - - for (uint32_t y = 0; y < 4; y++, pPixels += (stride * 4U)) - { - pPixels[0] = sel_values[pBlock->get_selector(0, y, selector_bits)]; - pPixels[stride * 1] = sel_values[pBlock->get_selector(1, y, selector_bits)]; - pPixels[stride * 2] = sel_values[pBlock->get_selector(2, y, selector_bits)]; - 
pPixels[stride * 3] = sel_values[pBlock->get_selector(3, y, selector_bits)]; - } - } - - // Returns false if the block uses 3-color punchthrough alpha mode, which isn't supported on some GPU's for BC3. - bool unpack_bc3(const void* pBlock_bits, void* pPixels, bc1_approx_mode mode) - { - color32* pDst_pixels = static_cast(pPixels); - - bool success = true; - - if (unpack_bc1((const uint8_t*)pBlock_bits + sizeof(bc4_block), pDst_pixels, true, mode)) - success = false; - - unpack_bc4(pBlock_bits, &pDst_pixels[0].a, sizeof(color32)); - - return success; - } - - // writes RG - void unpack_bc5(const void* pBlock_bits, void* pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride) - { - unpack_bc4(pBlock_bits, (uint8_t *)pPixels + chan0, stride); - unpack_bc4((const uint8_t*)pBlock_bits + sizeof(bc4_block), (uint8_t *)pPixels + chan1, stride); - } - -} // namespace rgbcx +} +#endif // #ifndef RGBCX_INCLUDE_H +#ifdef RGBCX_IMPLEMENTATION #endif //#ifdef RGBCX_IMPLEMENTATION /* diff --git a/libkram/bc7enc/rgbcx_table4_small.h b/libkram/bc7enc/rgbcx_table4_small.h new file mode 100644 index 00000000..30ccc709 --- /dev/null +++ b/libkram/bc7enc/rgbcx_table4_small.h @@ -0,0 +1,969 @@ +{ 202, 120, 13, 318, 15, 23, 403, 450, 5, 51, 260, 128, 77, 21, 33, 494, 515, 523, 4, 141, 269, 1, 2, 700, 137, 49, 48, 102, 7, 64, 753, 82 }, +{ 13,141,23,217,115,51,77,2,64,21,0,4,5,317,137,269,202,33,318,7,291,352,9,10,3,180,32,6,365,102,341,349 }, +{ 29,58,262,1,52,74,6,171,5,287,151,334,27,500,75,26,331,223,53,635,220,19,50,45,46,17,14,396,163,409,324,70 }, +{ 40,51,33,453,14,23,62,56,12,196,730,475,153,99,403,775,117,130,585,34,4,17,162,11,139,57,102,38,108,47,123,440 }, +{ 33,23,51,13,102,64,202,128,12,40,15,196,153,10,1,2,77,99,141,0,515,5,117,3,120,403,700,165,22,14,269,453 }, +{ 13,23,51,4,77,141,202,33,115,64,32,128,0,11,177,40,15,102,2,217,7,137,269,21,90,59,515,1,180,403,22,6 }, +{ 
26,235,19,47,648,624,78,145,27,112,122,64,444,6,630,453,25,42,65,130,711,85,390,113,416,108,665,29,730,138,644,95 }, +{ 64,141,352,751,217,247,237,437,177,269,86,954,947,875,32,318,95,77,304,92,597,180,232,291,128,864,349,588,372,202,312,1 }, +{ 642,898,180,638,901,341,82,197,10,951,15,515,165,762,700,253,811,753,752,365,143,479,244,569,8,110,351,873,55,31,499,116 }, +{ 221,23,51,125,438,254,13,21,39,49,308,656,0,115,530,159,158,401,30,166,912,386,165,688,518,9,105,627,424,22,421,33 }, +{ 143,31,1,44,197,8,180,125,116,55,13,498,23,341,638,242,93,15,2,141,0,901,752,115,36,206,165,479,338,365,515,762 }, +{ 12,23,51,13,14,15,37,99,515,38,700,117,2,196,134,153,753,64,54,33,128,120,21,0,328,5,139,82,453,719,457,1 }, +{ 13,15,23,515,961,700,457,753,51,115,4,165,197,2,38,569,1,474,0,37,99,719,5,12,629,14,11,3,33,77,64,10 }, +{ 15,515,700,753,1,0,2,4,3,23,134,12,961,5,10,197,11,33,82,120,457,51,165,7,6,341,217,21,77,9,40,180 }, +{ 13,51,23,457,719,961,730,401,165,453,0,117,386,15,134,1,758,153,12,54,515,99,11,2,700,5,753,4,308,33,6,899 }, +{ 134,898,82,117,13,33,77,102,23,260,341,351,120,901,197,153,961,111,196,110,180,457,854,10,450,8,165,40,4,115,0,365 }, +{ 60,18,126,167,35,16,191,71,24,92,121,271,68,107,212,146,118,150,199,7,21,1,9,575,727,5,566,48,0,132,108,273 }, +{ 62,136,129,123,128,41,162,17,249,211,214,789,618,710,38,678,248,507,57,64,152,269,119,3,177,183,597,106,4,179,216,90 }, +{ 403,523,51,475,494,453,817,899,202,23,450,13,421,120,102,730,33,128,4,1,805,5,7,153,757,260,318,196,77,457,326,65 }, +{ 4,59,3,62,12,33,56,193,27,21,102,17,40,77,76,84,32,0,6,123,119,177,128,11,18,611,605,25,13,51,73,210 }, +{ 43,20,319,422,414,945,0,7,819,61,5,376,325,173,804,904,470,693,97,707,14,49,22,104,147,107,95,32,426,1,330,577 }, +{ 13,23,51,2,0,115,4,141,217,33,10,77,1,15,64,180,3,515,7,6,22,102,11,5,40,9,165,700,202,197,317,341 }, +{ 28,49,0,105,1,24,65,159,35,55,95,239,16,2,109,7,9,14,170,320,347,168,424,158,10,301,124,5,67,21,64,36 }, +{ 
15,515,700,753,0,1,13,2,117,4,12,10,5,165,457,3,9,134,11,7,6,51,77,64,961,82,33,197,14,341,120,141 }, +{ 7,71,14,149,97,18,60,16,150,92,398,189,140,124,24,273,35,2,69,302,154,68,0,336,517,43,66,28,118,251,230,1 }, +{ 4,102,33,77,40,59,11,624,210,12,128,342,5,503,91,139,64,32,25,494,202,678,416,0,403,275,21,450,196,318,523,177 }, +{ 25,19,42,6,122,813,256,235,85,26,436,53,297,573,680,390,445,63,27,416,80,233,65,73,389,283,45,605,194,17,250,343 }, +{ 402,102,202,128,33,300,403,23,12,77,40,21,342,117,483,99,25,494,6,4,63,32,84,569,139,757,475,318,19,26,196,134 }, +{ 158,9,0,109,39,49,65,22,35,168,55,24,68,124,159,16,185,344,333,154,254,272,175,289,1,577,95,28,105,810,30,169 }, +{ 197,180,115,237,498,165,2,5,287,546,400,3,61,34,509,13,297,80,341,52,45,186,58,881,23,873,468,176,64,17,311,250 }, +{ 120,968,373,260,704,110,450,202,137,318,77,95,269,326,217,717,661,652,851,349,93,1,518,98,827,291,21,177,82,33,848,719 }, +{ 44,116,144,268,434,489,367,384,98,127,918,93,948,31,206,940,855,0,203,137,9,22,617,141,332,105,393,492,959,282,299,131 }, +{ 13,77,23,33,51,0,64,141,102,4,2,115,1,6,202,15,10,128,269,7,177,180,3,40,22,11,515,217,117,318,700,137 }, +{ 15,515,700,753,4,11,141,40,165,23,64,180,13,202,32,3,51,125,5,197,21,128,0,93,77,1,120,82,269,117,110,59 }, +{ 176,231,585,62,34,14,412,161,56,236,527,57,17,3,51,202,4,23,369,283,128,13,472,440,84,361,136,457,381,130,719,53 }, +{ 9,0,180,217,237,101,141,352,88,100,230,64,175,317,115,498,68,39,30,1,702,83,213,36,365,208,752,13,252,321,952,546 }, +{ 28,9,22,1,49,0,109,39,83,95,86,30,13,105,128,55,141,168,158,67,31,159,208,12,96,5,185,2,160,64,137,23 }, +{ 72,4,38,12,51,89,477,11,57,76,401,308,23,474,99,148,413,179,59,13,431,152,54,569,17,3,205,629,197,421,405,15 }, +{ 457,13,23,961,15,51,515,700,165,12,753,629,11,1,719,117,0,3,2,37,569,197,40,328,33,5,153,134,99,64,38,196 }, +{ 254,100,310,9,30,1,39,625,166,265,190,0,272,557,131,731,31,98,578,688,404,93,101,88,49,21,127,264,44,36,252,478 }, +{ 
51,23,12,13,15,128,99,120,10,202,515,153,64,82,700,33,165,2,5,117,403,1,141,0,3,196,37,453,753,197,260,93 }, +{ 38,99,542,139,453,117,196,23,457,13,328,111,37,134,961,11,12,51,40,775,587,401,474,54,153,477,41,629,33,475,14,277 }, +{ 6,85,25,233,343,91,26,63,138,29,19,65,283,4,81,235,42,122,605,64,648,256,174,370,74,389,718,59,45,194,445,416 }, +{ 49,5,97,20,197,21,18,193,0,64,408,729,173,350,43,422,165,7,14,104,61,32,509,713,523,102,120,95,125,397,35,232 }, +{ 144,116,268,434,384,489,367,206,93,855,940,44,98,332,617,127,959,911,137,282,203,31,22,219,141,9,131,276,417,0,1,120 }, +{ 17,106,64,62,32,255,136,292,476,162,129,241,123,141,41,237,720,214,209,352,519,211,186,148,752,247,507,90,21,77,197,119 }, +{ 2,29,52,50,5,58,14,6,27,1,366,357,45,53,17,19,171,151,26,181,133,38,218,764,287,583,61,113,3,487,600,281 }, +{ 130,59,196,412,381,730,711,236,77,210,202,402,453,99,401,108,361,803,291,283,153,4,57,51,128,183,14,719,503,117,23,11 }, +{ 13,23,51,141,77,4,33,64,115,0,217,10,180,202,2,102,11,9,15,165,40,21,128,352,22,7,197,3,317,515,269,1 }, +{ 23,13,202,51,120,15,21,5,141,1,128,269,137,515,64,102,125,48,98,33,260,523,318,93,700,165,450,77,2,12,403,82 }, +{ 1,2,14,46,29,67,38,52,5,171,58,24,103,69,96,70,83,181,54,75,163,223,16,45,112,309,155,0,186,35,18,108 }, +{ 15,515,700,753,13,0,1,2,153,5,23,10,117,3,9,7,134,165,12,6,341,33,4,14,77,457,115,21,719,180,217,82 }, +{ 197,165,509,13,391,180,308,115,23,546,5,498,2,29,3,401,901,61,34,80,14,457,250,569,237,873,38,297,45,15,468,386 }, +{ 19,73,27,250,200,714,444,472,26,53,34,17,813,322,283,390,128,297,78,123,432,14,436,136,106,690,57,122,389,80,503,3 }, +{ 3,17,21,45,62,32,38,12,155,14,2,328,5,99,401,536,828,13,227,488,106,51,719,119,540,76,165,221,115,629,209,41 }, +{ 115,341,873,197,365,13,901,180,569,752,317,1,10,498,143,634,261,0,509,15,943,237,44,31,116,601,165,127,282,23,141,64 }, +{ 453,51,23,403,33,421,475,102,15,153,196,515,13,700,117,523,12,40,753,21,4,134,0,494,670,899,22,801,730,10,11,401 }, +{ 
23,13,51,33,12,117,153,134,453,196,15,99,515,40,14,700,128,102,11,753,77,64,403,202,0,401,475,37,65,2,3,38 }, +{ 2,7,5,14,70,1,29,61,52,45,6,112,66,16,21,32,592,46,38,135,87,58,186,315,290,128,113,0,64,48,227,23 }, +{ 33,23,102,51,128,13,64,202,141,1,77,10,153,40,196,117,2,3,0,5,15,269,403,12,137,134,318,165,120,6,453,99 }, +{ 16,92,7,20,43,35,126,71,60,14,107,18,68,97,0,121,279,149,24,246,191,48,118,575,55,140,362,783,230,150,375,566 }, +{ 13,23,4,33,77,64,51,102,141,128,32,10,0,202,40,115,59,22,90,11,177,21,291,6,7,318,180,117,137,2,95,165 }, +{ 507,162,129,41,4,211,62,38,123,59,57,248,183,130,99,11,3,361,202,17,402,556,266,305,803,210,128,184,152,136,313,117 }, +{ 643,123,193,650,802,18,25,389,718,256,65,289,84,91,619,511,415,90,235,63,57,510,324,216,862,102,6,183,108,397,217,736 }, +{ 13,23,15,1,515,51,0,2,700,5,753,165,141,115,12,3,4,180,21,197,457,7,6,10,120,9,33,202,77,32,8,11 }, +{ 23,51,13,453,64,403,12,21,5,202,128,475,165,141,523,95,125,115,3,1,4,730,120,32,2,494,180,719,457,197,450,401 }, +{ 204,74,135,66,6,174,192,7,138,172,85,353,348,580,280,97,95,500,29,64,426,32,87,889,65,81,25,2,52,43,568,673 }, +{ 35,0,68,69,24,9,1,16,65,103,149,133,18,114,28,50,83,2,189,7,46,14,101,336,175,124,251,55,71,218,38,238 }, +{ 16,101,0,118,9,18,24,68,35,154,71,124,60,212,191,520,55,806,694,167,28,39,364,375,1,346,252,65,604,302,22,21 }, +{ 0,9,16,35,1,24,68,18,65,21,103,67,13,149,28,189,71,23,101,238,114,7,335,133,486,141,22,212,48,50,30,118 }, +{ 13,202,23,77,33,51,128,5,21,141,115,32,102,64,4,0,318,269,10,15,291,2,494,177,11,217,3,515,22,137,6,700 }, +{ 16,92,60,35,7,18,24,68,150,149,14,71,0,375,97,126,118,107,230,191,246,273,140,55,175,653,9,575,2,28,566,517 }, +{ 76,90,21,179,316,148,205,32,464,288,184,257,245,1,89,2,460,57,152,45,38,358,645,5,12,449,350,48,37,17,4,14 }, +{ 19,27,26,813,80,297,17,495,436,53,73,200,4,378,250,59,106,25,45,128,361,42,113,469,122,390,77,40,736,6,11,136 }, +{ 
6,26,235,138,19,145,112,70,331,262,25,42,52,624,27,453,122,47,500,78,648,85,29,2,630,632,409,113,50,226,108,75 }, +{ 7,16,14,24,92,35,18,2,46,9,60,140,0,87,50,5,54,13,12,38,171,23,126,21,58,64,1,70,128,71,220,163 }, +{ 90,205,257,184,32,179,460,5,245,45,2,288,769,524,57,21,152,229,17,1,497,4,292,59,619,452,432,76,476,11,266,14 }, +{ 15,515,700,753,4,5,11,141,13,1,33,3,0,128,202,23,180,21,2,64,269,32,117,134,120,40,102,318,153,17,137,352 }, +{ 47,130,711,108,453,412,730,196,390,283,78,27,51,183,381,236,128,200,719,14,153,472,503,34,59,250,3,4,57,803,123,432 }, +{ 12,277,51,474,111,153,23,99,13,37,961,94,629,542,569,431,79,139,38,134,117,453,33,188,196,40,115,15,11,157,401,515 }, +{ 17,495,469,106,26,378,80,27,161,483,19,742,527,436,383,862,73,136,53,814,297,6,119,84,62,56,25,3,209,611,4,128 }, +{ 81,681,636,91,0,750,370,104,718,138,18,693,173,784,29,397,348,74,192,673,174,65,6,207,64,280,306,52,671,32,355,319 }, +{ 15,515,700,753,33,77,4,102,115,117,40,13,1,153,134,11,5,217,23,196,2,21,3,317,32,365,0,341,291,59,12,51 }, +{ 0,9,28,35,68,1,65,67,101,39,69,175,16,238,13,22,96,124,18,24,251,30,55,12,23,2,50,141,114,5,154,103 }, +{ 23,33,77,13,117,40,11,102,64,4,51,403,153,453,10,0,196,134,128,65,12,291,86,99,95,59,15,141,202,180,137,719 }, +{ 214,90,289,6,874,64,25,65,235,42,751,249,256,312,194,85,746,875,174,32,525,288,519,835,247,348,233,544,217,524,437,352 }, +{ 1,22,2,0,36,67,28,5,49,95,12,50,168,83,105,55,7,9,14,194,103,23,114,21,584,46,10,13,38,69,208,159 }, +{ 269,141,13,202,33,180,318,77,291,137,102,352,128,23,349,51,31,217,372,317,125,197,44,21,11,5,901,1,18,0,4,494 }, +{ 435,144,274,88,203,418,30,1,190,410,96,778,100,530,521,326,466,795,686,166,960,321,382,264,367,822,131,31,692,9,213,93 }, +{ 76,72,90,21,37,179,12,205,32,428,148,38,308,405,4,413,57,184,749,245,316,221,54,645,288,1,152,155,464,257,2,14 }, +{ 77,33,64,102,13,141,23,2,40,1,51,10,0,115,6,180,202,128,4,3,177,269,15,7,22,165,291,14,217,318,137,11 }, +{ 
397,81,4,32,65,788,693,804,681,11,249,21,91,64,690,494,3,0,422,56,348,725,194,123,23,59,523,319,61,510,95,90 }, +{ 60,126,16,7,92,121,314,246,35,107,150,132,14,146,24,18,199,298,232,71,359,140,672,97,392,649,5,423,95,21,22,388 }, +{ 15,515,141,217,115,700,13,23,120,317,753,180,33,260,110,137,341,51,1,365,4,77,64,202,0,40,36,352,197,269,10,21 }, +{ 111,134,117,474,23,13,961,12,569,431,37,15,51,115,515,700,277,99,753,38,197,405,457,4,72,94,629,45,11,89,54,148 }, +{ 23,13,51,5,1,15,2,21,12,202,141,0,515,165,120,32,4,64,700,3,115,197,269,125,753,7,9,128,6,180,453,403 }, +{ 13,141,4,23,5,2,115,217,202,51,180,137,269,352,77,1,317,3,21,318,0,15,9,64,10,197,11,341,33,515,752,7 }, +{ 165,125,197,13,391,21,23,558,48,380,97,120,298,33,14,426,66,115,32,386,900,180,6,98,357,237,326,509,51,278,221,457 }, +{ 120,82,15,260,515,1,351,77,450,700,13,21,141,23,753,202,217,93,110,33,51,854,5,128,326,102,137,180,817,48,269,352 }, +{ 23,13,15,51,515,700,961,753,0,457,1,2,4,115,10,453,569,5,33,165,11,719,14,40,64,197,3,21,474,629,38,401 }, +{ 264,166,39,30,9,100,435,254,93,921,190,363,1,625,411,382,897,656,203,478,404,812,438,110,473,88,18,691,156,141,274,272 }, +{ 9,0,252,100,166,39,101,265,364,68,88,329,520,18,419,676,118,167,404,604,16,1,21,30,212,158,553,49,382,274,48,13 }, +{ 15,515,700,753,4,11,141,5,3,13,202,1,180,21,2,165,269,23,40,64,0,318,12,32,128,51,77,117,523,197,120,457 }, +{ 24,1,2,69,35,16,67,18,14,50,0,46,68,9,38,7,133,71,83,149,28,108,189,218,65,114,238,29,75,54,5,96 }, +{ 90,289,214,64,874,13,77,712,66,751,4,23,51,192,32,0,202,194,312,177,33,65,234,104,875,288,59,5,835,416,102,95 }, +{ 0,9,49,127,98,31,301,28,371,159,1,395,512,737,158,761,916,623,16,44,242,39,170,18,293,105,24,272,101,22,23,385 }, +{ 17,62,136,214,123,129,32,292,119,209,710,106,141,162,128,64,45,4,77,249,11,618,211,3,207,130,519,183,38,177,21,269 }, +{ 5,107,581,356,279,32,441,362,493,660,13,298,0,534,49,147,21,22,132,121,97,423,7,590,259,683,14,786,126,508,60,246 }, +{ 
51,13,15,730,453,23,515,719,386,457,12,700,403,475,899,1,6,523,753,421,99,401,165,33,2,19,361,5,0,670,120,27 }, +{ 49,28,9,159,272,22,254,131,158,327,95,105,0,39,35,168,347,286,374,55,65,627,424,912,68,578,1,24,239,175,688,169 }, +{ 15,515,700,33,753,4,77,141,341,317,1,10,13,180,102,22,40,117,115,365,5,901,23,197,134,11,217,351,64,82,21,137 }, +{ 134,15,13,515,23,700,12,753,51,474,37,961,197,10,457,569,4,0,99,2,115,38,165,153,94,3,139,11,1,82,33,5 }, +{ 7,2,20,58,5,14,128,66,6,29,32,43,21,52,16,38,631,61,74,97,46,135,113,25,202,192,13,0,884,45,112,87 }, +{ 77,13,33,202,23,128,102,4,141,342,117,0,269,318,134,22,11,21,32,153,403,291,49,64,137,51,40,15,494,5,196,98 }, +{ 2,1,14,6,46,38,29,65,5,36,67,0,103,7,22,86,133,50,108,208,52,83,24,323,283,69,28,18,10,25,23,75 }, +{ 15,515,700,753,1,5,4,2,3,13,0,11,180,341,12,33,10,197,134,365,77,23,21,901,6,117,165,7,37,32,17,102 }, +{ 203,268,206,93,417,940,31,8,120,137,44,499,959,473,202,692,728,559,0,260,10,326,141,564,817,127,341,1,450,22,110,23 }, +{ 15,82,515,120,700,0,10,753,33,8,64,165,110,31,260,93,13,197,23,22,40,4,351,44,77,9,11,153,102,51,1,196 }, +{ 60,0,16,7,14,43,20,71,28,10,2,22,154,18,13,24,92,1,51,576,35,615,805,925,68,126,124,149,97,64,23,55 }, +{ 19,6,26,80,5,84,27,17,25,2,504,129,45,240,56,123,4,119,618,1,76,106,64,51,14,3,128,65,32,710,0,42 }, +{ 15,515,700,753,13,4,77,23,33,51,0,5,8,10,11,31,44,1,82,22,202,64,110,102,93,21,291,40,141,180,9,49 }, +{ 195,98,271,223,132,167,146,407,1,360,121,834,393,591,212,199,293,259,522,107,354,147,156,191,807,590,48,18,125,16,765,541 }, +{ 128,202,77,210,402,318,33,102,6,40,403,29,342,269,196,757,99,139,2,111,42,4,494,117,275,300,13,12,678,0,177,122 }, +{ 13,33,23,40,51,102,4,117,77,64,134,0,128,153,202,196,453,11,15,12,1,22,403,141,59,14,10,475,515,65,700,95 }, +{ 7,16,14,24,18,2,28,0,92,71,1,22,6,35,60,20,168,10,154,118,5,302,124,69,97,109,703,158,420,12,149,66 }, +{ 15,1,515,23,0,13,700,2,51,753,180,5,120,165,197,21,115,4,33,9,141,7,12,6,3,457,386,202,260,523,8,31 }, 
+{ 60,107,121,132,146,126,199,279,150,92,16,649,441,35,955,7,21,0,423,5,18,195,598,298,493,356,32,653,22,362,953,10 }, +{ 31,44,98,276,284,299,116,935,9,201,0,131,39,127,144,662,1,137,371,492,567,489,93,254,49,268,22,28,30,293,434,737 }, +{ 13,15,23,515,700,0,1,51,753,4,2,10,77,202,5,115,3,165,197,457,9,12,11,961,33,120,22,141,180,7,6,40 }, +{ 123,162,184,257,17,183,229,130,129,3,84,136,99,152,556,383,57,497,12,205,4,62,56,452,80,266,128,14,40,119,27,106 }, +{ 196,33,117,40,153,23,134,13,51,102,453,0,15,475,12,14,515,2,22,700,4,21,753,64,401,670,730,1,9,11,10,99 }, +{ 224,219,187,131,258,385,442,871,836,31,98,908,44,574,127,944,137,839,116,36,613,1,254,39,926,160,829,96,93,371,860,827 }, +{ 121,195,156,132,146,360,590,407,786,522,883,591,259,929,626,941,150,687,5,55,296,379,467,178,586,465,279,21,1,13,60,354 }, +{ 2,1,14,29,6,5,46,52,38,19,114,75,26,65,108,96,25,50,36,70,103,309,17,236,218,74,12,86,0,3,10,112 }, +{ 15,515,82,700,120,753,10,0,8,197,260,165,351,64,13,110,117,93,31,1,9,33,22,23,457,44,450,77,102,898,40,49 }, +{ 7,66,97,2,172,74,226,52,29,135,192,232,43,324,92,5,38,20,222,14,6,568,87,107,353,620,580,16,138,174,448,32 }, +{ 62,129,123,162,136,249,618,183,507,57,4,152,17,59,11,184,117,77,3,128,211,41,130,205,12,40,33,106,64,229,38,313 }, +{ 1,13,15,2,4,515,23,0,3,115,700,5,51,77,341,141,753,180,33,217,197,202,901,6,21,165,11,365,318,317,10,102 }, +{ 6,26,235,19,145,47,112,78,64,27,453,95,29,444,25,624,85,108,648,70,32,130,74,42,711,630,632,138,65,122,113,730 }, +{ 23,51,12,15,13,99,515,153,117,10,700,37,120,82,165,2,753,64,128,0,403,3,5,1,134,197,453,31,202,457,110,21 }, +{ 16,24,18,71,64,35,92,7,246,146,9,108,60,118,199,5,140,2,267,0,230,830,32,133,1,68,50,330,247,563,36,12 }, +{ 15,515,700,753,0,1,13,2,23,3,4,217,51,5,115,8,9,180,341,10,7,6,317,77,33,372,901,197,365,11,120,165 }, +{ 234,639,178,202,77,142,5,455,450,49,416,0,147,427,198,21,315,329,13,318,325,557,120,344,113,259,22,128,61,105,23,494 }, +{ 
1,31,36,44,141,180,55,2,64,22,98,116,13,352,0,115,10,127,5,164,253,498,237,165,341,197,4,86,15,170,125,23 }, +{ 15,120,13,141,23,260,217,515,1,77,51,110,180,700,317,82,269,137,115,202,21,753,64,5,351,291,0,450,352,93,36,326 }, +{ 26,6,112,396,19,145,25,122,648,287,42,74,624,222,416,45,138,66,644,151,113,651,29,573,64,280,445,27,525,85,70,58 }, +{ 156,360,5,146,121,21,271,522,354,132,49,13,18,195,16,340,60,591,446,586,727,0,107,407,167,48,1,463,199,566,32,23 }, +{ 5,61,49,147,178,612,660,120,21,182,23,427,259,683,33,4,77,70,13,3,376,98,64,0,481,344,48,595,291,263,141,51 }, +{ 89,79,468,179,358,205,94,405,115,498,72,180,365,431,37,111,341,734,188,317,482,217,11,4,245,152,413,216,12,474,490,752 }, +{ 24,16,35,68,18,71,7,92,0,108,9,14,118,101,336,175,375,302,28,124,154,55,149,60,398,1,65,2,140,273,345,230 }, +{ 51,730,421,801,453,386,23,523,13,475,719,401,670,365,899,403,115,457,758,165,33,494,450,6,423,805,629,56,569,514,958,388 }, +{ 113,45,6,311,29,2,151,614,145,491,112,80,5,27,61,74,315,66,209,631,19,25,58,17,73,26,1,243,70,64,611,287 }, +{ 4,339,188,471,11,59,79,12,377,94,99,33,77,102,51,111,37,152,13,961,474,542,40,342,3,23,128,403,202,177,184,57 }, +{ 15,4,515,11,700,33,82,40,0,120,753,10,8,110,13,93,23,165,77,260,64,31,22,51,44,102,351,1,125,9,197,21 }, +{ 16,24,18,0,35,68,28,71,124,118,60,7,9,55,14,92,109,101,419,175,22,252,154,375,149,302,158,346,2,49,1,126 }, +{ 17,45,227,21,106,3,2,243,209,5,48,32,221,62,207,50,29,186,290,270,263,52,14,496,400,119,46,255,54,430,38,721 }, +{ 340,354,586,658,156,195,698,668,1,296,9,18,883,363,447,379,303,98,411,13,31,163,51,5,371,48,919,846,121,21,360,70 }, +{ 277,153,111,12,23,51,474,99,38,37,139,117,41,457,79,453,542,13,11,33,134,157,629,188,961,14,196,401,102,569,15,94 }, +{ 0,18,16,159,49,24,9,105,35,68,7,28,22,1,60,344,55,101,109,2,14,158,13,23,71,118,455,286,272,424,5,327 }, +{ 0,105,9,49,16,18,158,28,518,24,101,320,1,68,170,301,272,127,7,286,35,890,109,39,159,98,21,344,31,55,371,23 }, +{ 
141,1,180,15,13,2,365,217,515,352,317,115,341,0,4,5,269,700,23,21,3,752,197,77,753,51,31,901,10,202,8,64 }, +{ 4,23,51,33,19,17,102,153,485,880,40,403,196,26,300,453,27,117,78,0,12,200,47,5,11,14,342,99,53,77,475,2 }, +{ 62,184,56,440,130,229,183,3,556,152,99,162,12,266,17,548,136,57,305,161,123,14,452,4,383,403,257,34,40,84,33,139 }, +{ 13,23,77,141,64,202,33,51,269,115,0,102,21,4,217,128,5,32,318,137,291,9,15,2,180,10,3,317,177,515,7,6 }, +{ 1,22,36,105,170,0,86,2,31,28,239,64,55,5,10,98,9,44,127,95,654,67,301,143,13,12,49,23,320,141,83,21 }, +{ 15,515,700,753,0,1,13,2,23,901,5,8,51,82,9,180,457,4,7,12,3,6,10,120,341,141,22,898,197,351,115,260 }, +{ 1,39,274,98,100,265,190,30,438,310,166,223,88,96,909,31,264,625,530,9,382,812,21,252,593,0,254,539,44,131,23,778 }, +{ 18,212,167,118,363,1,447,411,146,60,271,16,781,121,647,9,621,562,21,478,664,68,815,5,354,98,48,101,24,446,777,463 }, +{ 24,28,22,0,7,1,2,16,14,65,35,49,158,95,109,159,55,105,10,18,124,9,67,5,239,149,12,289,108,68,21,424 }, +{ 105,22,131,272,286,98,55,239,1,31,320,9,127,327,36,185,28,374,86,219,0,64,187,44,578,164,224,913,535,115,601,13 }, +{ 22,31,28,301,127,98,44,0,105,1,512,395,9,293,109,299,95,338,239,125,242,116,36,320,55,841,900,685,599,23,13,763 }, +{ 2,1,58,29,5,14,52,46,186,334,45,155,151,50,400,75,38,69,502,61,48,227,223,7,163,17,262,67,549,21,70,113 }, +{ 7,107,135,232,97,14,2,92,66,16,172,192,278,387,298,356,38,35,448,52,46,43,60,29,20,126,324,526,357,359,64,5 }, +{ 20,43,104,426,173,7,560,414,707,784,319,81,0,861,422,819,38,74,715,52,376,97,879,32,330,22,49,64,66,95,192,526 }, +{ 104,74,636,66,204,0,355,81,222,25,29,319,145,784,20,65,90,4,174,194,7,64,6,746,138,173,750,715,91,43,192,32 }, +{ 0,9,101,35,68,39,65,28,252,124,67,154,364,336,100,166,30,1,289,55,149,346,16,114,158,88,439,24,429,22,570,194 }, +{ 57,14,4,231,236,585,176,59,369,23,361,13,719,51,300,342,12,457,56,3,62,38,202,401,34,46,2,322,11,215,210,507 }, +{ 
1,2,15,3,141,0,515,5,33,700,13,64,77,180,6,128,753,10,4,269,102,202,11,7,134,197,352,120,117,318,12,291 }, +{ 5,1,21,202,13,32,48,23,0,61,259,22,494,120,70,49,51,18,137,128,465,12,178,115,2,453,403,141,58,3,90,450 }, +{ 141,205,4,72,59,79,245,11,352,94,152,76,247,216,21,188,452,217,497,12,89,37,111,339,588,77,64,875,864,115,358,464 }, +{ 15,515,700,753,0,1,2,13,5,4,23,3,8,341,365,51,115,10,120,457,6,141,77,197,31,7,165,9,202,450,961,260 }, +{ 5,2,50,14,58,38,171,46,29,1,45,186,17,52,155,218,48,281,61,487,54,36,67,21,328,334,151,227,760,114,400,133 }, +{ 457,120,70,125,318,64,23,48,795,291,202,761,751,415,77,846,269,758,21,237,96,260,391,165,87,1,128,5,221,13,137,763 }, +{ 13,23,51,33,4,40,117,102,453,64,153,196,0,77,15,11,12,475,1,65,134,10,515,22,21,14,700,59,403,141,2,753 }, +{ 229,152,57,266,452,381,432,12,313,184,99,471,17,4,62,339,157,3,129,59,128,11,369,37,77,38,40,123,5,497,188,257 }, +{ 49,28,109,22,159,9,272,95,105,131,55,35,254,168,39,327,169,0,1,286,175,374,347,158,420,67,36,194,312,424,627,346 }, +{ 5,2,61,29,45,58,80,311,1,17,209,227,52,243,106,869,454,151,592,496,48,334,14,155,6,186,46,171,75,21,255,667 }, +{ 244,44,110,141,260,30,269,352,839,131,574,228,373,276,1,406,219,717,217,137,253,224,120,93,36,31,567,116,661,187,341,88 }, +{ 12,99,79,139,11,453,196,51,277,474,111,23,542,37,94,188,33,13,401,775,40,961,313,102,4,339,153,485,629,134,300,431 }, +{ 16,35,9,0,68,24,149,69,67,18,1,114,65,230,71,7,103,133,50,167,212,118,101,191,140,64,399,28,124,283,55,565 }, +{ 88,30,274,435,131,613,190,100,93,829,166,1,187,795,530,127,382,957,960,160,31,137,466,264,39,800,406,254,28,473,521,219 }, +{ 167,16,18,118,212,24,60,71,101,68,191,9,375,411,363,35,0,1,589,199,302,21,447,55,146,126,92,271,647,121,562,48 }, +{ 64,141,86,177,77,128,147,597,304,95,269,102,275,4,352,49,120,5,372,194,465,13,588,237,947,216,202,180,612,751,107,534 }, +{ 18,65,90,403,523,289,240,214,194,102,701,475,202,217,283,862,389,51,33,0,494,421,453,817,84,64,847,899,352,13,23,437 }, +{ 
13,51,23,202,5,12,21,128,15,115,0,1,141,120,64,32,4,2,515,403,165,457,3,10,700,99,453,318,719,450,308,401 }, +{ 98,223,393,31,1,271,834,791,167,44,202,64,93,697,5,116,77,125,450,446,212,18,541,293,51,120,195,132,284,13,807,765 }, +{ 15,515,700,753,4,11,23,13,40,51,82,165,0,110,93,33,141,64,120,5,10,77,3,102,180,32,202,125,8,197,31,21 }, +{ 15,515,700,753,0,1,13,2,901,23,5,341,3,51,82,8,4,180,961,9,115,10,12,6,898,7,351,141,134,22,31,120 }, +{ 234,416,77,5,315,639,325,202,147,198,113,49,450,61,455,142,0,21,22,342,329,494,178,58,102,427,318,230,13,120,43,470 }, +{ 60,146,16,18,156,126,121,271,199,360,132,24,167,0,640,10,71,522,21,92,5,340,107,354,118,150,22,195,446,35,28,212 }, +{ 4,361,11,14,56,368,377,161,27,12,300,77,59,200,17,554,202,33,40,494,495,21,210,80,757,25,128,23,19,38,444,53 }, +{ 141,82,217,351,15,352,120,1,180,260,515,64,854,36,700,317,752,372,13,269,77,753,922,21,349,23,202,110,93,137,51,373 }, +{ 15,515,700,753,77,13,0,1,23,33,102,2,51,4,3,5,291,217,10,9,450,120,341,7,317,6,11,117,115,8,260,180 }, +{ 15,515,120,13,700,23,77,141,1,260,0,753,180,51,137,202,115,365,110,291,217,5,128,9,21,341,197,269,2,450,317,165 }, +{ 174,6,348,85,138,74,280,204,66,233,192,355,289,65,81,580,636,353,25,91,104,343,673,214,64,95,42,712,792,32,194,90 }, +{ 152,497,452,59,4,216,11,79,94,77,128,188,269,339,588,33,76,529,318,32,141,471,12,202,111,21,5,51,37,90,72,177 }, +{ 417,499,10,141,253,244,110,559,8,564,180,260,728,120,352,638,642,341,951,206,143,752,901,93,137,661,922,373,44,31,811,197 }, +{ 13,77,23,33,4,51,0,102,128,59,141,40,64,115,177,10,137,22,202,2,7,11,90,1,117,180,269,14,49,6,134,3 }, +{ 1,2,22,0,36,5,67,50,14,28,12,86,38,46,83,168,194,65,103,114,49,7,10,95,21,69,23,24,128,51,55,13 }, +{ 17,106,119,207,255,306,742,378,84,62,136,45,3,5,240,80,61,56,209,383,311,790,655,32,2,440,76,151,58,29,179,263 }, +{ 3,128,1,141,2,202,33,5,64,15,0,515,102,13,269,10,700,180,134,51,120,6,77,318,23,137,17,117,753,197,82,153 }, +{ 
514,38,377,328,11,57,41,248,880,266,556,4,152,361,471,757,485,403,305,102,3,211,313,99,457,130,12,14,157,40,23,54 }, +{ 68,0,167,101,9,118,264,520,16,18,21,478,562,1,124,212,100,936,664,777,191,88,806,154,48,24,759,604,35,252,265,65 }, +{ 230,689,699,213,466,352,217,831,30,443,418,144,854,201,840,855,1,251,203,317,530,957,96,93,822,539,36,752,351,137,83,800 }, +{ 33,77,102,117,15,82,13,134,23,64,0,515,120,153,51,4,40,128,700,260,202,141,196,22,753,11,351,10,1,326,95,269 }, +{ 11,40,33,51,117,13,542,328,14,134,38,153,23,12,485,231,102,54,775,37,3,377,111,139,211,4,457,403,369,475,99,719 }, +{ 33,64,77,128,141,2,1,202,102,13,23,117,0,15,3,153,51,134,10,40,6,5,515,269,137,180,318,165,700,7,196,753 }, +{ 15,515,700,753,4,1,5,11,13,21,33,180,93,141,64,2,23,77,82,3,0,102,32,40,352,341,10,197,98,110,117,901 }, +{ 1,2,14,67,50,46,38,24,103,83,0,5,36,28,29,133,114,96,65,52,18,75,54,108,22,7,238,58,160,9,361,69 }, +{ 258,201,276,137,160,860,116,261,295,843,567,144,131,44,187,268,943,219,284,31,202,935,141,98,662,203,127,96,36,93,224,1 }, +{ 7,2,14,16,46,87,75,52,92,278,29,38,140,70,1,5,35,294,24,262,135,69,171,172,58,409,112,60,50,66,97,12 }, +{ 13,23,0,2,51,1,33,4,115,10,15,141,77,3,5,180,217,515,9,7,64,11,700,6,102,40,197,22,317,753,165,202 }, +{ 74,145,6,66,25,204,42,29,222,337,138,26,7,525,192,174,746,287,544,135,415,2,609,632,112,64,87,0,85,45,712,396 }, +{ 77,33,102,15,217,13,23,141,202,515,51,700,291,4,269,753,317,180,21,64,318,115,128,0,275,2,352,196,3,5,137,11 }, +{ 187,219,258,871,44,442,160,574,137,224,908,116,839,131,36,926,276,201,93,228,202,860,31,613,144,531,406,1,902,30,190,318 }, +{ 1,372,141,5,21,77,225,744,96,30,23,349,13,291,269,284,69,442,459,144,303,839,217,622,160,330,260,48,120,410,189,352 }, +{ 66,222,2,74,29,87,135,6,7,145,52,25,294,337,226,172,138,331,42,70,97,112,26,1,632,192,43,5,415,609,461,353 }, +{ 45,17,106,209,5,2,21,29,48,207,3,186,243,155,255,263,454,119,400,496,270,14,290,62,425,1,171,32,659,52,38,56 }, +{ 
93,88,141,120,30,213,260,373,100,717,459,82,110,1,166,450,180,321,217,372,36,269,131,225,22,352,326,466,473,187,244,410 }, +{ 266,57,152,381,313,471,12,229,99,369,339,62,157,3,4,37,77,38,188,17,11,162,40,184,129,59,475,775,128,452,403,453 }, +{ 217,352,317,141,752,15,180,515,372,365,700,341,753,349,77,21,291,1,115,244,64,120,13,98,269,82,5,498,864,351,23,144 }, +{ 14,514,369,102,403,377,51,719,880,153,23,13,457,11,485,4,401,12,328,453,33,40,117,57,629,38,730,236,134,670,361,961 }, +{ 107,7,172,14,92,135,2,359,60,314,46,16,126,278,232,150,279,32,38,392,298,5,35,97,24,192,259,288,330,52,356,312 }, +{ 0,4,25,13,59,90,65,23,26,19,18,12,5,216,91,51,389,33,77,11,22,85,27,81,21,177,746,45,42,194,37,123 }, +{ 5,49,315,202,416,77,455,639,450,21,197,137,350,13,408,0,329,318,494,344,61,402,64,509,347,120,113,48,95,713,308,401 }, +{ 130,47,381,390,59,90,200,214,289,6,65,472,29,64,874,648,50,751,624,26,52,32,4,194,875,714,85,249,247,33,881,19 }, +{ 51,23,453,13,719,12,457,165,37,730,99,4,386,197,401,17,11,2,3,15,5,961,475,6,515,64,54,700,32,115,0,403 }, +{ 15,515,1,13,700,2,23,0,753,5,3,180,51,4,165,12,141,21,197,457,7,115,6,9,352,10,120,202,8,341,11,77 }, +{ 0,9,1,67,35,28,68,16,24,65,18,69,50,114,103,12,22,13,5,101,2,96,23,83,149,21,39,55,7,175,433,124 }, +{ 28,105,22,0,1,320,170,9,49,301,109,95,127,31,98,55,65,35,2,24,168,159,36,713,16,740,13,338,21,44,512,23 }, +{ 13,77,4,51,23,33,102,202,128,59,40,0,64,141,117,403,115,11,15,318,153,269,22,515,475,134,10,494,177,1,90,210 }, +{ 13,23,0,51,77,33,2,141,4,10,1,64,115,102,3,6,22,15,217,11,180,7,40,515,165,202,177,9,269,128,700,5 }, +{ 456,116,492,8,949,268,867,391,203,51,499,13,719,386,31,791,457,918,125,10,23,93,479,685,417,0,22,338,506,551,870,730 }, +{ 17,237,45,180,106,62,32,64,115,41,136,498,255,21,197,129,241,13,3,227,23,352,165,752,350,365,449,155,4,546,476,38 }, +{ 1,15,180,515,0,2,341,700,901,352,4,141,13,3,752,5,753,217,317,115,365,23,197,21,51,165,31,6,269,202,77,7 }, +{ 
205,141,216,269,497,4,588,76,59,152,128,452,79,77,875,11,72,94,188,217,352,12,247,37,90,64,32,1,474,23,947,372 }, +{ 64,247,217,237,317,180,752,115,349,141,498,13,437,304,23,372,352,164,579,291,33,864,177,197,0,490,72,10,482,77,269,51 }, +{ 2,1,0,13,15,141,3,77,5,515,64,33,23,180,6,700,4,117,217,7,10,11,102,165,753,197,115,134,40,352,12,269 }, +{ 11,40,38,328,33,542,12,313,41,339,23,157,377,117,369,51,471,99,775,485,13,305,457,57,14,475,37,248,4,54,188,719 }, +{ 33,77,102,40,13,23,0,51,4,128,64,202,117,141,22,196,153,10,134,15,59,269,1,137,65,11,403,318,453,86,515,177 }, +{ 472,80,34,250,495,161,17,14,469,176,128,4,389,106,283,436,216,527,3,297,483,177,53,56,231,194,119,84,719,57,255,59 }, +{ 317,352,180,141,217,752,115,341,365,244,1,269,202,901,253,15,21,498,372,4,137,515,13,2,700,318,5,197,23,143,753,349 }, +{ 9,39,101,18,265,100,333,520,252,16,0,329,593,1,553,364,68,167,310,30,121,254,118,158,363,166,60,604,272,24,286,404 }, +{ 15,515,1,180,700,901,0,2,753,341,752,4,3,13,115,365,317,5,23,197,141,217,165,352,6,22,36,9,137,51,7,10 }, +{ 131,39,9,829,166,613,578,827,1,30,716,254,100,98,31,224,0,406,228,310,616,219,44,846,127,190,938,96,265,371,856,438 }, +{ 17,64,62,106,141,751,136,292,32,129,352,41,38,476,86,128,214,237,5,177,123,209,217,45,269,954,162,710,180,3,90,4 }, +{ 25,42,235,65,650,736,605,6,630,85,123,343,233,256,26,122,63,389,141,249,416,444,368,194,19,108,138,174,90,0,544,511 }, +{ 184,229,152,57,266,432,497,452,17,381,619,257,313,12,4,205,59,3,99,471,157,128,5,129,339,369,77,11,32,45,202,2 }, +{ 137,202,160,860,141,30,93,567,36,276,295,261,131,39,9,964,201,843,1,98,800,318,116,22,943,187,10,219,206,44,269,535 }, +{ 0,493,125,64,49,9,279,10,35,18,93,55,293,31,14,13,194,165,325,48,22,132,21,107,98,389,44,581,342,259,174,137 }, +{ 15,515,700,753,4,33,13,77,23,5,51,32,102,40,93,11,349,141,21,8,82,202,64,31,110,10,117,0,1,44,3,318 }, +{ 
110,253,854,811,352,141,244,951,180,642,661,384,498,143,752,317,911,10,269,206,559,351,261,120,902,533,922,959,365,160,332,217 }, +{ 2,29,70,1,75,52,6,220,26,112,145,331,74,163,19,69,38,324,46,58,14,5,25,21,278,223,50,307,66,7,67,409 }, +{ 13,23,77,33,51,4,64,141,115,102,0,2,128,177,40,11,202,10,6,180,7,15,269,1,32,217,59,22,291,3,137,515 }, +{ 340,897,691,478,658,264,914,382,100,812,363,1,724,156,166,698,88,521,39,404,682,447,296,96,303,411,30,909,9,274,656,772 }, +{ 9,18,310,101,265,159,326,120,105,158,33,363,77,195,51,55,13,39,354,132,23,7,28,639,16,137,98,1,252,272,709,49 }, +{ 57,313,471,12,99,369,157,339,266,152,38,37,475,453,328,775,11,40,59,188,77,514,401,403,342,4,139,33,377,51,229,14 }, +{ 16,7,24,14,35,140,60,92,18,69,71,2,189,1,46,230,108,388,150,38,21,172,278,67,246,267,50,309,236,135,451,0 }, +{ 206,417,93,940,959,473,499,203,8,137,559,728,31,202,44,120,450,141,10,260,116,564,22,326,269,318,268,244,0,1,253,638 }, +{ 15,515,700,753,1,0,13,2,23,4,3,51,5,217,7,77,341,115,8,9,10,33,6,180,317,349,291,120,11,165,457,901 }, +{ 1,2,5,14,48,21,290,32,50,45,38,46,263,207,155,72,76,29,17,408,425,171,89,52,7,0,292,449,3,227,513,428 }, +{ 121,132,354,167,271,223,146,98,18,463,1,668,446,195,407,60,212,447,781,48,360,363,411,522,156,393,807,9,21,16,293,13 }, +{ 131,578,105,371,219,224,716,616,187,49,9,254,737,159,385,98,258,127,272,761,0,916,623,910,28,286,39,31,22,518,924,242 }, +{ 302,467,97,6,273,1,24,484,124,51,36,18,2,398,453,421,523,69,7,23,13,403,386,150,66,0,298,65,426,165,22,158 }, +{ 30,190,530,88,1,100,778,539,625,274,382,410,96,731,960,39,795,321,9,131,264,144,840,748,44,166,669,957,36,31,435,228 }, +{ 141,1,2,128,64,33,15,202,3,0,180,5,13,77,515,134,269,102,197,700,10,137,318,6,120,165,753,352,4,82,23,117 }, +{ 44,201,567,116,131,224,295,662,489,268,219,31,434,144,187,276,110,384,93,261,699,137,36,442,120,1,613,30,228,64,141,244 }, +{ 12,15,51,23,515,37,99,13,700,0,10,117,753,38,165,82,134,120,11,453,197,64,115,569,1,629,401,22,457,474,110,153 }, +{ 
7,135,2,92,172,14,66,140,38,52,97,46,29,74,16,324,278,226,6,87,1,571,262,5,357,232,35,380,69,314,24,330 }, +{ 125,386,23,963,949,60,51,391,165,221,13,197,118,21,719,193,541,421,517,150,393,7,401,453,308,5,791,551,326,558,48,173 }, +{ 6,85,42,25,138,222,174,235,280,256,525,289,26,214,64,746,90,32,544,65,204,19,66,337,355,95,348,415,74,29,5,312 }, +{ 1,14,5,50,2,67,24,0,46,69,48,21,58,103,16,12,18,38,54,96,83,7,502,45,36,181,35,9,430,28,10,155 }, +{ 811,351,642,180,951,752,110,638,253,10,82,352,197,341,365,564,499,854,873,55,9,417,282,901,244,22,559,143,206,141,28,898 }, +{ 23,13,51,15,12,453,403,165,4,515,115,719,475,457,700,523,2,21,0,99,202,197,14,5,386,753,128,401,37,308,33,117 }, +{ 120,13,23,77,141,1,15,93,217,82,260,51,137,202,110,515,21,180,165,5,128,102,64,351,291,700,269,352,326,203,177,0 }, +{ 1,5,0,22,12,2,36,21,10,23,86,13,28,51,9,128,48,14,32,50,7,3,96,137,54,4,202,49,37,65,208,323 }, +{ 219,98,23,127,301,51,258,308,170,910,13,165,22,105,293,616,125,242,276,401,201,395,964,115,55,284,31,374,327,206,512,900 }, +{ 64,180,80,165,5,237,2,250,34,58,297,61,197,17,22,29,186,498,231,445,247,3,752,311,95,32,483,153,27,45,115,469 }, +{ 13,77,23,33,0,2,1,64,141,51,102,10,15,3,115,40,180,6,515,128,7,22,269,202,4,217,700,5,177,117,14,165 }, +{ 15,120,51,515,13,450,23,700,202,153,196,753,260,64,128,141,730,4,326,386,21,523,33,318,5,457,95,32,403,1,77,269 }, +{ 2,1,5,29,32,45,207,263,14,425,58,72,76,21,7,408,48,46,52,186,17,292,38,6,61,89,476,50,155,720,119,3 }, +{ 15,515,700,753,4,13,11,5,1,23,33,21,3,141,32,2,40,180,117,64,269,202,102,197,0,165,120,51,341,352,153,12 }, +{ 76,5,214,129,2,123,45,710,17,249,618,460,179,32,1,257,205,519,90,207,245,184,162,61,769,209,292,106,6,29,14,128 }, +{ 1,15,23,13,120,141,51,515,202,21,700,165,0,180,137,2,5,77,128,93,753,260,269,197,326,33,110,352,82,102,318,48 }, +{ 7,2,135,14,29,87,66,52,97,172,70,112,5,58,46,337,92,16,20,43,1,38,232,155,74,294,6,461,409,151,262,32 }, +{ 
574,187,384,926,860,110,258,434,269,531,141,244,160,261,253,116,699,959,940,717,533,36,219,31,902,661,871,295,201,352,10,260 }, +{ 156,354,296,1,182,586,64,379,340,937,850,698,31,48,98,44,120,18,163,23,30,658,195,125,77,284,223,291,774,481,96,39 }, +{ 250,80,34,472,17,495,176,469,33,194,64,483,4,297,141,14,161,27,53,667,56,833,73,527,585,231,106,51,84,814,2,59 }, +{ 97,7,81,140,66,92,172,192,24,298,43,6,74,69,314,426,462,14,501,16,21,508,60,189,267,232,230,104,48,20,135,330 }, +{ 31,44,116,144,268,393,492,434,367,489,127,98,918,0,384,9,22,206,948,105,93,203,1,456,332,940,299,28,137,49,293,125 }, +{ 15,128,33,3,13,51,141,1,202,64,23,2,515,120,102,0,5,82,10,700,165,197,269,153,403,110,753,137,196,318,117,12 }, +{ 31,98,127,9,0,105,22,28,44,512,293,395,299,1,242,49,685,763,320,599,125,116,109,276,284,95,870,159,23,456,36,900 }, +{ 7,24,124,1,6,97,2,69,14,18,23,92,21,67,66,16,5,484,43,20,118,65,36,22,28,0,51,140,13,71,29,150 }, +{ 1,64,442,303,284,349,202,141,622,67,154,447,260,44,652,429,9,335,237,919,197,98,167,33,682,269,547,77,863,411,340,201 }, +{ 1,15,2,141,515,0,700,13,3,180,10,753,5,64,77,33,4,6,7,197,102,269,165,23,134,11,352,341,291,349,22,120 }, +{ 99,139,12,453,196,277,775,40,475,33,23,401,215,51,11,14,77,111,313,130,38,211,37,266,129,15,339,153,719,3,369,515 }, +{ 33,77,102,4,23,128,13,141,202,64,51,0,40,59,269,115,117,137,153,1,318,11,10,177,15,134,22,90,196,2,403,32 }, +{ 7,2,14,58,70,112,16,5,87,38,46,52,6,128,135,1,32,21,155,29,66,64,0,97,92,186,172,294,13,23,20,37 }, +{ 15,13,515,1,700,2,23,0,753,5,3,4,51,10,341,115,365,180,11,33,317,77,6,7,217,12,197,165,117,9,64,102 }, +{ 2,1,14,29,75,69,67,6,52,46,38,24,103,220,83,25,70,87,262,74,96,267,50,366,26,16,226,394,357,66,108,19 }, +{ 9,105,18,39,1,0,16,557,101,272,252,890,326,49,265,21,137,100,23,938,13,310,159,5,31,24,254,51,30,128,202,132 }, +{ 80,209,45,61,667,17,6,106,5,2,151,29,483,255,454,833,27,311,112,19,738,378,1,58,113,26,25,469,119,887,32,64 }, +{ 
13,23,51,15,5,1,515,0,21,2,12,141,700,165,202,115,753,32,180,4,3,197,10,120,457,9,269,128,64,341,7,33 }, +{ 99,12,453,277,139,157,369,474,339,51,38,23,37,196,188,401,775,111,11,313,328,475,153,266,4,471,79,40,33,629,102,14 }, +{ 7,92,16,232,97,140,126,14,60,107,66,35,298,387,314,104,246,462,441,150,0,38,24,2,172,357,230,330,5,633,22,289 }, +{ 13,77,23,202,318,141,33,4,51,269,102,177,115,403,137,2,40,494,90,11,342,128,31,117,21,32,7,12,64,134,14,10 }, +{ 13,2,0,23,141,1,77,3,180,33,6,64,15,10,115,51,4,5,217,197,7,165,515,102,22,11,700,269,40,352,177,14 }, +{ 15,515,700,753,4,11,1,93,13,5,180,110,82,21,120,23,2,33,10,141,3,165,197,102,901,0,32,341,117,40,153,12 }, +{ 15,515,700,753,1,13,0,2,23,4,77,51,3,5,341,291,7,33,6,115,10,9,8,217,11,177,120,180,102,165,197,365 }, +{ 20,43,198,325,173,904,104,234,66,147,77,319,416,422,97,426,5,0,7,450,861,202,712,725,2,32,639,376,38,324,945,315 }, +{ 105,0,9,28,49,301,170,1,127,159,22,16,31,98,512,623,24,109,158,395,35,68,371,65,713,55,2,242,293,21,44,18 }, +{ 213,88,689,466,230,30,321,435,699,352,217,201,795,831,144,854,1,443,96,539,530,840,418,251,855,190,93,100,669,31,957,662 }, +{ 130,453,47,196,4,57,14,59,236,711,51,153,730,77,412,381,23,202,108,128,361,13,283,117,11,719,200,46,34,78,210,2 }, +{ 1,2,5,14,0,50,36,22,38,46,65,67,12,86,114,28,103,29,208,7,10,128,21,83,218,23,96,54,194,6,133,51 }, +{ 6,26,74,19,165,453,14,730,1,125,197,50,29,51,138,357,13,2,108,391,70,719,46,457,47,500,386,262,112,23,235,52 }, +{ 9,10,376,20,43,0,49,18,30,120,2,33,325,104,501,470,77,788,725,102,523,39,858,5,904,414,174,55,137,37,342,13 }, +{ 15,515,700,753,0,1,13,23,51,77,120,202,341,82,5,4,9,260,2,137,141,128,115,351,901,8,180,10,197,21,450,33 }, +{ 105,131,272,578,9,49,371,219,159,616,286,320,224,187,716,98,28,22,0,623,127,258,910,737,385,31,239,347,254,109,424,95 }, +{ 457,51,13,23,961,12,719,99,453,15,4,515,165,401,629,3,700,11,17,14,2,37,753,41,57,569,38,45,0,33,5,32 }, +{ 
202,120,5,33,318,77,450,102,1,260,403,128,494,21,165,13,269,12,326,23,342,523,402,2,817,64,15,141,125,82,457,475 }, +{ 141,269,352,217,180,64,349,137,202,160,317,15,372,515,700,752,318,753,244,13,437,291,165,864,22,237,5,82,954,21,77,418 }, +{ 70,29,2,145,74,112,26,6,75,52,19,66,632,1,87,220,5,135,163,287,307,25,226,7,58,396,294,278,113,409,69,151 }, +{ 82,351,317,15,752,180,898,352,141,901,515,341,10,700,365,1,753,498,0,217,253,115,55,854,33,5,143,32,21,160,36,197 }, +{ 39,9,310,254,0,30,101,49,252,272,100,265,105,455,159,557,190,333,286,688,18,166,1,158,709,16,625,627,31,131,327,329 }, +{ 2,58,29,5,1,151,186,52,70,45,7,549,14,75,112,400,113,155,61,46,227,163,311,315,66,6,307,27,17,220,287,74 }, +{ 141,217,13,21,352,23,269,77,180,115,317,64,202,15,349,137,5,51,165,291,318,752,372,4,0,102,33,365,197,32,341,125 }, +{ 68,35,0,9,65,101,149,124,24,154,175,16,28,7,67,1,18,189,114,398,55,14,345,39,118,133,69,2,230,429,71,283 }, +{ 66,7,29,2,112,52,20,43,97,151,74,192,135,5,173,525,337,45,145,58,415,25,14,32,644,70,544,226,222,21,6,580 }, +{ 31,125,44,22,116,299,242,55,1,170,64,36,479,870,456,685,10,599,558,0,268,506,28,740,23,903,492,164,393,206,2,86 }, +{ 188,11,79,12,99,377,94,33,542,339,40,474,111,37,4,51,102,453,139,775,13,475,23,961,277,471,134,57,431,266,115,117 }, +{ 658,698,340,98,296,303,1,31,850,363,156,919,44,774,586,385,120,77,82,10,223,30,354,291,23,914,478,87,260,163,48,13 }, +{ 15,515,700,753,82,4,1,13,901,33,197,11,5,10,23,165,2,0,180,3,21,77,51,120,365,115,217,40,117,102,32,401 }, +{ 15,515,700,753,4,11,5,13,1,141,3,180,23,202,21,2,269,64,165,33,40,32,0,318,120,128,12,197,117,352,51,17 }, +{ 91,6,233,85,370,718,81,65,25,256,63,343,42,74,235,123,138,511,397,249,26,194,650,355,64,87,544,18,90,643,66,214 }, +{ 23,13,202,51,21,120,1,5,141,128,450,64,318,403,15,137,260,33,12,48,32,31,125,494,269,102,165,515,77,2,197,14 }, +{ 180,317,365,341,752,217,115,352,901,482,372,498,1,141,15,253,515,244,2,700,0,21,13,82,23,4,579,351,753,291,269,77 }, +{ 
13,115,197,341,9,352,468,237,64,498,23,165,22,509,901,546,482,180,28,569,317,51,365,873,391,95,86,217,49,837,752,706 }, +{ 13,23,51,1,141,5,165,202,21,120,64,125,180,15,2,33,197,115,128,32,260,269,12,82,4,515,137,7,318,93,0,700 }, +{ 214,289,90,174,874,6,138,280,65,81,64,85,355,751,194,233,312,348,835,91,0,32,343,636,249,29,875,288,519,104,247,74 }, +{ 15,515,700,753,4,5,11,13,1,33,23,21,2,3,102,32,141,77,180,117,31,64,0,40,134,196,120,352,12,44,197,6 }, +{ 33,15,13,515,117,23,700,217,134,753,0,51,153,77,141,2,4,64,196,1,3,180,10,115,5,102,6,11,22,202,165,7 }, +{ 15,515,700,753,33,4,77,102,1,40,13,117,11,115,134,5,21,153,23,217,3,32,2,317,120,196,180,141,51,12,59,260 }, +{ 15,515,700,753,13,0,1,23,2,217,51,3,4,5,8,317,115,9,341,10,202,180,6,365,7,82,457,22,120,901,33,291 }, +{ 7,2,135,20,97,14,66,52,337,673,192,29,43,355,353,5,16,294,107,376,147,226,331,560,64,470,222,104,415,32,4,324 }, +{ 195,132,142,167,146,77,363,271,121,354,202,120,647,178,786,212,687,0,101,878,16,522,60,5,450,411,35,55,98,639,259,318 }, +{ 202,77,20,0,318,66,104,128,102,269,177,43,33,7,216,291,494,5,2,342,74,173,97,112,450,22,337,10,234,52,64,678 }, +{ 107,362,612,356,359,97,414,43,259,20,392,7,298,147,819,683,465,173,729,660,319,14,5,779,581,595,246,35,501,92,0,230 }, +{ 6,165,14,453,13,51,19,23,386,457,74,391,308,2,26,401,47,758,603,108,719,366,1,29,309,730,324,197,133,70,115,867 }, +{ 179,72,205,180,247,245,4,490,352,59,317,152,79,498,94,217,148,76,752,864,11,216,141,405,89,452,197,111,497,188,37,21 }, +{ 107,7,298,314,14,359,32,392,232,279,172,97,60,581,387,126,121,0,534,493,356,92,441,95,13,21,35,147,22,5,16,362 }, +{ 156,271,354,586,360,132,591,195,121,18,340,1,5,13,21,48,668,446,23,463,296,658,60,55,407,698,146,70,626,51,163,24 }, +{ 13,23,51,4,0,12,457,15,11,453,2,515,5,1,99,10,115,165,700,475,401,403,3,961,40,14,37,753,719,32,64,569 }, +{ 48,125,21,165,13,221,23,763,423,508,197,5,98,92,193,16,441,386,64,314,293,457,391,140,49,60,102,693,683,51,35,867 }, +{ 
202,77,120,450,5,318,1,494,0,195,18,132,523,403,326,604,354,260,121,576,203,167,234,817,682,49,35,615,21,20,13,102 }, +{ 39,9,166,30,0,101,158,68,404,190,333,274,252,310,88,100,49,28,344,35,21,22,419,131,438,1,16,65,530,694,124,10 }, +{ 15,515,700,753,110,4,1,11,165,180,93,13,82,5,2,197,33,120,0,3,10,23,21,115,901,217,341,77,317,51,32,117 }, +{ 2,29,1,14,6,52,5,46,50,26,70,19,103,58,38,67,96,262,516,309,218,133,108,27,75,17,112,114,24,487,331,83 }, +{ 120,77,15,13,1,141,260,23,515,217,110,51,137,700,317,202,165,291,180,21,753,128,0,177,326,93,450,82,64,269,197,5 }, +{ 255,59,554,297,183,56,33,444,108,358,123,196,269,122,77,153,57,177,117,730,19,467,605,130,128,50,275,4,291,475,134,133 }, +{ 13,23,51,12,153,14,117,120,165,134,99,401,38,453,15,128,197,719,64,515,475,403,37,33,196,700,40,125,5,0,54,2 }, +{ 64,33,174,348,95,108,467,554,56,0,25,306,233,6,63,511,343,120,13,85,29,561,543,707,319,180,899,355,77,49,256,18 }, +{ 120,260,51,23,77,15,202,1,93,82,141,450,13,326,515,137,21,5,64,33,110,700,128,165,318,203,269,102,351,753,197,125 }, +{ 15,515,700,753,4,13,11,1,5,21,23,2,33,64,3,180,32,141,22,102,77,0,10,93,82,352,117,40,341,31,165,6 }, +{ 15,515,700,753,341,13,23,141,33,1,0,217,4,77,180,10,82,351,51,137,5,64,9,317,21,11,102,40,260,202,854,115 }, +{ 105,272,131,22,327,286,28,239,320,9,109,578,219,49,98,224,95,159,538,371,616,127,187,64,713,55,0,170,168,258,716,623 }, +{ 16,18,68,35,24,60,71,118,92,126,0,9,101,191,7,55,154,175,212,14,167,150,302,28,375,1,107,124,346,273,21,108 }, +{ 20,147,43,470,376,142,904,178,427,798,0,595,198,325,858,319,61,202,173,97,5,422,14,22,107,259,32,49,887,77,414,392 }, +{ 13,23,51,12,33,15,99,64,128,515,453,202,117,153,37,102,700,40,134,196,120,0,2,753,141,14,38,3,82,403,77,21 }, +{ 383,17,62,136,84,119,56,440,3,504,240,80,378,129,123,548,106,128,4,11,14,555,162,32,184,361,59,64,205,5,469,57 }, +{ 70,1,48,652,5,638,846,888,21,349,269,260,340,562,767,761,163,883,774,141,125,518,591,0,23,9,87,13,371,303,622,31 }, +{ 
66,135,6,97,74,278,69,7,14,324,267,172,2,140,462,1,357,38,808,550,92,841,189,29,16,25,298,87,75,204,24,335 }, +{ 51,23,33,13,102,40,12,128,64,77,10,202,0,196,117,4,14,99,134,453,65,153,11,475,139,403,22,141,86,2,21,15 }, +{ 88,100,264,166,274,435,772,1,382,921,96,478,30,438,639,909,897,521,190,466,960,410,9,144,530,418,31,329,265,691,778,93 }, +{ 62,440,136,56,84,3,504,548,555,383,4,17,129,128,507,361,123,59,119,162,14,57,152,328,161,11,202,495,184,27,80,215 }, +{ 911,617,332,959,206,141,253,244,282,384,110,120,10,260,352,143,951,811,269,373,160,417,93,531,728,203,434,940,137,55,36,717 }, +{ 120,15,260,141,77,1,515,82,700,351,33,23,450,13,110,326,64,217,269,753,203,137,102,5,165,21,51,291,93,177,373,128 }, +{ 15,515,700,753,0,1,2,23,13,51,5,9,82,901,180,8,3,4,120,6,7,141,93,12,197,341,10,33,115,730,64,125 }, +{ 7,104,97,107,356,232,66,560,298,289,14,707,38,568,359,64,20,0,65,324,22,214,92,32,192,5,387,43,712,90,172,95 }, +{ 6,1,2,66,67,14,74,24,108,29,69,83,458,7,25,38,135,103,36,150,451,114,52,594,75,65,380,18,267,602,19,278 }, +{ 13,23,51,12,115,21,202,5,457,15,4,1,64,719,0,403,2,3,453,165,99,141,401,128,32,515,10,37,523,197,120,700 }, +{ 57,59,4,11,412,381,77,53,421,291,250,368,99,14,27,369,803,283,23,108,403,19,339,210,0,401,12,444,236,40,361,736 }, +{ 15,515,700,1,0,753,2,13,23,5,51,180,3,115,6,7,457,4,9,8,12,82,197,165,141,901,120,719,33,64,21,22 }, +{ 64,95,180,247,929,146,90,126,197,32,237,60,288,165,316,92,5,13,77,7,217,955,522,22,16,314,132,4,317,10,312,86 }, +{ 15,1,120,13,23,515,0,51,700,180,141,2,5,202,21,260,753,165,137,33,77,110,197,128,326,7,450,4,102,9,269,12 }, +{ 14,2,16,46,1,7,24,69,75,35,38,50,29,220,52,140,267,67,18,54,70,309,5,60,92,189,171,87,71,163,58,0 }, +{ 31,98,127,44,9,299,0,276,293,284,116,49,935,599,105,22,456,201,28,1,39,125,242,137,371,144,131,492,159,272,51,395 }, +{ 6,27,151,53,573,445,297,113,26,73,436,19,491,250,396,315,45,112,145,58,614,881,25,34,611,200,17,80,70,5,138,631 }, +{ 
32,693,81,788,90,804,403,56,494,21,84,397,202,65,18,77,64,681,214,725,523,784,526,33,102,825,240,0,115,241,817,91 }, +{ 24,7,14,2,18,16,65,0,108,149,28,69,1,71,154,36,124,35,67,140,189,429,92,68,66,22,55,118,302,150,9,6 }, +{ 0,68,9,35,65,101,189,212,114,67,124,69,1,154,149,39,230,64,252,16,88,702,103,100,18,336,28,329,520,83,30,755 }, +{ 5,2,186,29,61,45,17,1,52,48,58,171,155,227,80,209,311,21,14,46,50,106,243,513,334,502,496,38,3,6,32,592 }, +{ 15,515,700,753,13,1,2,0,3,4,5,23,341,11,10,33,6,51,165,117,153,7,180,12,365,901,77,569,197,115,64,9 }, +{ 13,15,23,515,0,51,1,700,4,2,753,10,3,5,12,77,33,961,165,457,197,11,115,9,22,102,40,403,202,21,14,59 }, +{ 15,515,700,753,13,0,1,23,2,33,102,5,4,10,9,3,51,115,77,7,6,341,12,11,217,40,457,196,180,165,8,523 }, +{ 166,39,30,274,190,100,333,438,530,310,88,252,0,9,539,265,1,656,404,101,625,131,778,254,31,455,676,329,724,158,21,23 }, +{ 734,148,94,308,431,115,37,89,111,413,79,468,197,629,341,474,569,12,13,873,179,401,11,4,180,23,205,72,59,365,134,51 }, +{ 539,228,224,219,816,190,30,258,871,840,669,93,406,530,957,187,160,531,748,137,131,88,863,36,728,839,44,213,352,116,202,466 }, +{ 393,791,125,801,730,551,386,23,31,175,93,98,51,13,144,788,126,203,21,345,116,22,949,110,575,165,326,44,0,4,60,221 }, +{ 13,23,77,141,0,4,51,2,33,115,64,1,10,3,6,15,11,102,7,217,180,40,515,22,128,177,202,9,700,269,165,5 }, +{ 2,29,7,70,52,14,1,58,112,46,75,5,171,163,87,220,307,151,186,334,38,66,155,16,69,135,278,45,262,97,6,21 }, +{ 88,321,213,100,230,435,689,466,1,382,30,352,217,699,410,96,795,36,921,752,190,141,144,180,44,831,317,83,443,31,840,251 }, +{ 363,411,101,520,354,9,195,668,132,156,447,1,905,364,18,23,765,664,146,5,360,13,121,96,98,31,252,39,100,759,264,551 }, +{ 13,23,51,730,12,719,453,457,401,475,5,21,403,2,0,1,15,4,3,899,99,32,165,11,515,308,197,115,6,961,700,523 }, +{ 72,76,89,12,37,4,308,179,38,528,90,431,54,205,148,184,401,57,152,474,23,59,51,245,428,11,32,99,405,316,257,21 }, +{ 
376,20,43,147,470,173,97,595,107,319,414,142,819,5,729,178,858,7,427,32,426,104,14,0,392,362,259,61,230,77,560,246 }, +{ 202,141,269,494,318,137,51,128,403,4,217,96,77,5,64,177,291,180,15,352,102,10,33,349,2,317,0,341,120,515,21,453 }, +{ 77,202,33,128,102,318,494,269,13,0,117,23,342,291,403,15,134,51,153,141,177,515,82,137,196,700,203,64,22,351,753,4 }, +{ 253,110,951,352,499,811,10,854,180,638,244,559,642,752,564,8,141,143,417,341,901,260,206,197,922,661,93,15,498,373,165,911 }, +{ 141,13,23,180,4,217,5,1,269,317,21,0,2,202,115,51,352,77,3,197,64,341,318,15,291,9,137,93,32,165,515,33 }, +{ 9,0,18,252,16,101,68,39,24,118,35,109,158,329,28,167,60,364,333,265,49,100,22,419,553,55,1,677,71,7,212,159 }, +{ 28,109,9,39,0,158,49,22,168,35,55,175,1,65,67,185,194,159,289,95,272,114,30,105,86,584,36,169,254,2,83,24 }, +{ 15,515,13,700,1,753,2,23,0,3,4,5,33,341,11,51,6,10,197,115,901,180,77,40,102,12,365,165,141,217,7,317 }, +{ 173,693,104,422,5,18,61,32,102,0,20,13,784,560,33,66,397,526,49,207,29,25,510,707,65,6,11,344,21,263,81,77 }, +{ 23,13,386,51,308,801,719,221,401,949,21,730,165,421,102,115,125,33,341,670,468,117,770,1,120,6,197,14,403,97,67,958 }, +{ 0,49,105,16,28,24,159,9,158,320,1,68,35,239,170,18,109,7,55,65,2,95,301,124,347,14,21,154,22,127,286,31 }, +{ 2,5,1,207,45,29,32,58,76,61,6,263,292,655,72,14,17,476,7,119,52,306,70,64,21,90,186,214,106,38,3,790 }, +{ 21,6,125,49,13,64,715,66,115,95,197,33,22,32,204,165,56,278,0,408,241,120,4,808,681,350,263,85,81,571,135,509 }, +{ 612,427,325,107,202,5,376,49,64,392,403,470,21,147,31,788,494,14,362,465,858,98,20,804,518,43,845,318,125,97,725,534 }, +{ 32,21,76,72,2,1,14,5,241,449,89,38,350,221,155,48,50,292,37,46,45,90,270,54,17,179,214,12,148,430,476,413 }, +{ 24,0,28,16,7,124,35,154,14,149,65,18,9,68,55,108,175,71,2,1,22,109,92,67,484,336,118,69,302,398,570,420 }, +{ 1,5,14,2,48,50,38,67,46,21,0,54,45,270,281,12,24,32,155,96,513,103,290,83,61,58,36,17,37,72,69,181 }, +{ 
13,961,569,197,37,15,23,474,515,94,148,111,12,165,629,341,700,79,901,401,51,405,753,10,134,4,115,734,873,11,89,117 }, +{ 33,23,102,51,13,40,77,128,64,202,141,15,4,12,0,1,2,117,22,11,10,403,153,515,99,318,137,269,139,196,700,134 }, +{ 0,1,24,67,9,16,18,35,28,69,103,50,5,2,65,12,83,68,7,96,14,22,21,149,75,114,13,133,23,71,218,54 }, +{ 384,617,940,332,855,911,206,959,434,282,141,10,93,253,244,110,144,268,120,36,352,137,417,203,116,31,44,269,160,201,143,951 }, +{ 30,93,473,137,31,704,450,652,190,203,800,254,166,274,326,144,269,160,127,303,120,625,88,848,110,435,77,521,349,131,340,744 }, +{ 53,27,73,26,19,250,297,200,25,630,17,6,611,122,34,42,714,235,472,65,436,14,80,684,690,106,45,113,680,108,64,4 }, +{ 15,515,1,2,700,0,753,3,5,141,180,4,13,77,33,10,217,6,7,134,11,352,197,64,165,341,317,23,12,115,102,40 }, +{ 254,530,39,613,688,221,30,31,438,190,228,960,1,44,141,21,180,406,23,166,9,202,13,96,137,48,131,829,317,269,393,51 }, +{ 9,39,28,35,30,166,158,36,0,175,101,346,364,67,49,68,168,420,88,1,194,131,100,352,55,83,190,64,137,570,86,65 }, +{ 62,56,3,548,555,507,440,161,34,4,215,136,162,514,361,527,17,14,211,130,328,11,383,123,84,183,38,57,184,152,205,494 }, +{ 92,126,107,7,356,493,97,279,359,298,16,246,35,60,14,441,362,121,43,423,5,132,392,20,508,230,199,146,232,173,150,414 }, +{ 15,82,141,515,291,922,349,700,217,260,372,120,351,93,77,753,318,352,373,854,1,326,269,21,13,102,144,202,64,23,203,137 }, +{ 141,217,352,115,180,13,269,317,752,77,23,21,341,197,5,372,244,291,9,64,51,102,4,1,365,2,165,33,3,48,237,351 }, +{ 78,47,390,19,130,453,108,27,711,813,730,444,412,283,196,690,123,14,128,26,250,389,650,236,200,65,51,4,34,183,297,73 }, +{ 34,250,297,80,472,64,495,17,311,3,148,45,667,61,176,53,243,27,90,161,469,141,483,151,62,128,29,4,58,56,5,231 }, +{ 51,23,33,13,551,77,102,326,421,21,523,120,5,899,453,692,202,153,308,615,115,958,450,401,791,68,221,93,475,18,403,4 }, +{ 
98,223,393,363,411,1,478,834,664,156,284,691,447,791,914,293,354,724,697,9,807,541,759,51,18,421,48,264,948,586,195,848 }, +{ 7,14,107,232,16,92,2,60,46,5,359,121,24,526,220,620,135,1,172,21,126,314,132,77,18,75,32,278,12,23,52,38 }, +{ 32,76,2,1,21,72,241,14,5,48,292,89,476,45,720,270,179,90,17,214,148,38,50,29,129,155,350,46,290,227,123,464 }, +{ 15,515,700,753,13,23,33,77,51,4,102,0,32,202,1,11,128,82,117,141,40,5,110,8,3,90,137,21,10,318,403,165 }, +{ 66,6,69,2,1,74,14,135,278,267,380,24,29,97,67,38,103,75,7,388,324,25,52,150,87,83,189,357,335,108,204,172 }, +{ 152,4,339,59,79,471,188,11,77,94,128,33,529,377,12,111,102,202,452,402,216,99,13,542,51,40,474,37,64,291,23,961 }, +{ 15,515,700,753,1,0,196,13,33,2,77,5,23,102,3,10,9,7,217,4,6,153,117,177,14,457,115,12,40,730,11,134 }, +{ 17,209,45,106,207,5,255,119,62,2,61,3,263,742,306,655,425,378,32,56,29,136,84,80,311,58,186,240,243,383,14,21 }, +{ 120,260,450,15,1,23,817,13,515,523,326,5,700,51,82,31,202,64,21,753,318,93,32,269,98,33,351,77,102,125,457,165 }, +{ 116,492,268,93,23,206,203,0,551,918,13,51,8,22,417,940,120,10,499,31,949,791,125,523,165,473,341,730,421,959,401,391 }, +{ 15,515,700,753,165,13,0,1,197,23,4,82,120,2,180,12,260,719,8,3,386,117,5,523,901,11,341,51,10,9,141,351 }, +{ 14,24,69,7,2,66,108,1,67,6,36,398,18,267,150,97,29,38,83,149,65,74,28,0,189,71,388,16,273,124,46,22 }, +{ 330,96,523,335,367,662,141,839,1,922,372,615,244,717,269,443,418,352,403,692,217,854,752,180,36,64,498,576,349,201,98,284 }, +{ 184,90,257,205,245,229,57,152,769,17,524,5,32,497,45,432,619,2,452,266,4,106,1,21,179,59,76,3,460,292,381,128 }, +{ 7,14,16,2,46,5,70,107,87,13,58,307,92,32,38,23,202,0,172,24,18,21,60,128,77,35,20,10,9,4,171,112 }, +{ 7,66,140,16,14,92,97,69,267,172,189,24,380,2,35,60,298,451,230,135,314,74,150,71,38,357,6,330,67,423,21,443 }, +{ 121,167,354,132,18,446,147,101,212,146,407,16,55,35,647,191,20,271,199,68,60,259,463,107,9,126,363,7,195,43,14,411 }, +{ 
76,90,179,32,205,21,184,460,257,288,45,245,316,5,57,152,241,2,358,1,229,72,524,148,48,769,17,4,12,38,14,720 }, +{ 147,259,178,878,427,465,581,198,786,798,142,534,325,929,20,362,35,132,107,376,43,5,279,77,49,146,70,202,590,771,33,14 }, +{ 473,93,450,778,141,30,855,466,144,203,330,530,88,523,459,372,201,617,839,704,254,321,934,326,39,36,82,717,332,213,559,403 }, +{ 523,475,51,899,730,453,23,719,403,33,457,13,421,386,4,120,117,196,102,153,15,801,450,817,515,260,202,11,700,99,165,125 }, +{ 15,1,13,515,0,2,700,5,23,753,4,3,341,317,10,115,180,11,33,64,217,77,117,165,197,7,6,365,9,141,102,134 }, +{ 19,4,119,40,33,202,27,84,102,56,77,73,504,485,26,494,757,63,862,59,23,300,25,12,128,11,5,13,342,880,469,6 }, +{ 32,20,2,13,5,21,23,6,12,38,43,29,64,7,95,51,61,207,48,147,90,178,17,182,49,0,115,202,52,362,37,22 }, +{ 339,188,11,79,4,94,377,12,99,111,542,102,37,33,474,51,471,40,453,152,77,13,59,403,342,23,117,57,475,134,128,38 }, +{ 34,128,283,176,495,231,318,432,503,275,529,527,161,53,3,202,56,291,585,469,73,17,14,412,57,27,80,245,250,381,402,51 }, +{ 15,515,13,700,1,217,141,120,23,180,753,115,365,51,317,341,77,260,0,291,110,137,202,5,21,269,64,36,349,2,4,10 }, +{ 13,15,961,515,700,753,4,12,2,457,3,11,197,51,37,569,115,23,5,0,99,10,1,134,6,111,165,33,72,40,38,79 }, +{ 15,515,700,753,13,1,0,2,23,33,5,3,10,4,9,115,7,102,6,51,12,217,77,11,40,457,569,341,117,317,14,719 }, +{ 5,76,2,32,292,214,45,1,129,519,123,179,90,710,17,29,460,72,14,207,21,249,58,205,464,263,618,48,6,245,3,257 }, +{ 72,76,32,4,21,12,38,23,99,54,89,3,14,17,51,57,11,90,13,488,179,2,59,148,45,37,5,115,401,1,10,421 }, +{ 98,223,393,1,834,264,284,791,724,293,478,772,697,909,363,682,905,447,541,821,411,51,421,9,807,48,765,31,730,96,386,410 }, +{ 341,13,509,8,23,638,165,901,762,10,569,242,391,197,873,642,506,499,629,961,15,180,116,456,206,546,417,1,338,457,515,867 }, +{ 1,2,5,50,14,38,46,114,0,36,29,22,218,65,86,96,137,21,133,285,12,10,323,181,17,58,51,23,67,7,28,6 }, +{ 
481,878,202,13,5,23,182,32,269,21,1,318,77,142,557,494,141,33,640,137,70,291,2,51,260,415,929,403,120,58,4,259 }, +{ 15,515,700,753,1,4,13,0,2,5,341,3,11,180,134,12,10,317,197,365,33,21,23,165,117,6,77,7,217,37,32,498 }, +{ 25,119,19,6,26,42,27,17,4,790,45,814,2,469,483,84,122,1,0,33,32,128,76,80,611,113,73,56,5,240,202,77 }, +{ 14,2,7,1,24,0,65,6,16,69,67,22,124,28,108,5,18,36,86,10,38,46,66,398,289,168,12,83,21,23,610,13 }, +{ 51,23,128,13,15,202,12,120,33,64,141,82,10,515,0,403,700,3,1,99,117,269,153,165,753,5,318,197,102,260,2,137 }, +{ 16,35,24,0,9,18,7,1,68,69,50,71,103,65,67,189,133,23,28,13,60,537,149,335,75,21,64,5,114,2,12,14 }, +{ 754,803,133,576,880,543,2,1,657,50,14,38,46,5,29,67,218,36,58,171,52,96,24,103,775,0,114,83,181,54,65,45 }, +{ 21,32,5,3,2,17,14,72,76,1,12,23,38,51,4,54,10,0,89,13,99,137,45,36,421,115,543,11,22,128,221,48 }, +{ 434,384,268,144,855,940,617,206,332,116,93,911,959,282,203,137,141,489,44,120,10,110,244,36,98,31,269,253,367,417,160,9 }, +{ 15,2,1,0,13,515,5,700,3,23,180,217,141,10,753,4,117,6,77,33,64,7,11,197,352,317,341,134,165,115,12,9 }, +{ 2,113,6,25,1,0,29,4,7,833,5,45,32,61,128,19,77,151,74,145,64,42,14,210,655,106,59,177,27,17,21,738 }, +{ 116,268,918,203,551,31,8,692,206,791,403,499,417,93,940,421,0,23,22,120,13,523,44,51,299,473,959,1,10,475,202,125 }, +{ 107,126,132,612,362,279,20,146,259,493,199,121,590,43,660,147,35,376,939,60,941,534,683,5,0,953,16,7,49,649,595,470 }, +{ 15,515,700,753,13,1,0,23,2,33,77,4,3,51,5,102,115,10,9,341,6,7,11,342,217,12,120,180,40,317,141,8 }, +{ 53,27,17,161,469,378,73,527,19,136,383,250,495,56,862,26,62,84,80,106,200,4,34,14,440,297,3,128,585,5,129,123 }, +{ 17,45,209,106,5,207,243,454,119,255,2,263,186,290,29,3,21,62,425,61,84,32,58,56,48,408,655,136,306,14,742,227 }, +{ 4,152,59,452,128,79,216,11,339,471,529,188,94,77,202,12,291,33,318,377,99,51,23,5,402,349,32,474,102,13,205,111 }, +{ 15,515,700,753,1,0,2,13,3,5,23,4,180,51,115,9,6,12,7,8,197,33,10,961,901,77,141,752,110,22,120,341 }, 
+{ 951,752,638,811,351,642,180,253,10,341,197,901,110,873,8,244,15,352,165,898,143,515,564,762,499,55,365,700,82,753,141,854 }, +{ 6,262,197,350,74,26,115,509,841,583,165,38,21,13,47,50,235,19,33,324,453,4,308,196,138,99,64,903,675,1,223,130 }, +{ 125,165,391,23,386,221,21,13,558,457,51,867,197,115,401,758,77,97,308,791,7,180,48,120,963,451,743,89,603,134,403,450 }, +{ 1,14,2,5,16,46,7,38,58,24,50,0,69,48,35,67,54,18,12,75,21,45,513,155,430,37,270,9,61,163,223,32 }, +{ 23,13,51,0,12,15,4,1,115,2,515,453,10,457,5,3,202,21,165,700,403,11,37,64,77,401,9,197,753,59,475,99 }, +{ 129,84,17,56,27,495,19,548,80,123,162,378,3,504,161,469,618,73,40,53,4,26,205,184,106,183,62,6,257,128,862,12 }, +{ 28,9,22,49,109,1,67,0,39,55,168,158,83,36,35,86,420,194,185,159,95,105,69,208,272,103,50,114,2,254,169,30 }, +{ 242,391,8,456,116,13,23,492,341,165,867,51,499,457,479,638,338,509,719,10,1,642,417,762,401,93,206,268,901,569,22,197 }, +{ 211,162,248,130,57,4,41,556,507,266,183,152,305,361,11,129,62,229,38,471,514,313,157,300,377,3,440,128,123,328,339,59 }, +{ 7,92,97,16,298,140,60,126,14,35,279,314,232,246,43,230,508,173,71,107,423,24,150,779,20,189,66,18,607,21,0,653 }, +{ 15,515,700,753,1,0,2,13,23,5,3,180,51,901,6,4,7,12,9,115,8,457,165,82,120,197,10,64,141,341,22,117 }, +{ 0,18,403,25,523,74,6,24,42,91,22,102,13,51,49,193,475,681,95,85,730,64,899,397,273,750,247,673,32,805,757,288 }, +{ 56,0,18,65,33,554,84,343,64,6,90,561,22,19,899,108,27,63,289,475,240,467,370,32,233,214,24,123,95,287,28,194 }, +{ 31,98,127,9,0,44,293,105,395,299,49,242,28,22,599,116,1,284,276,125,456,685,763,159,272,623,23,935,393,144,201,137 }, +{ 1,5,2,14,38,46,50,48,21,7,58,45,270,61,155,171,0,290,69,32,29,54,67,16,24,666,663,17,37,75,502,52 }, +{ 23,51,13,453,457,12,719,4,15,99,401,2,961,3,11,730,475,515,0,1,165,115,629,700,14,17,403,40,5,33,37,64 }, +{ 968,967,966,965,964,963,962,961,960,959,958,957,956,955,954,953,952,951,950,949,948,947,946,945,944,943,942,941,940,939,938,937 }, +{ 
2,1,14,29,67,103,6,46,52,75,24,133,38,218,83,309,36,108,70,114,96,5,238,74,25,26,220,236,65,50,69,87 }, +{ 7,71,16,92,24,60,14,97,150,140,35,189,149,298,18,230,43,508,2,423,69,0,38,314,66,279,399,517,251,20,232,273 }, +{ 23,1,120,51,13,202,77,141,260,21,15,5,128,82,2,450,269,165,102,318,48,32,137,515,125,64,12,115,351,180,33,7 }, +{ 77,13,33,23,64,51,4,102,141,128,40,1,2,202,0,6,177,115,137,15,59,10,11,7,269,22,515,180,318,3,700,95 }, +{ 101,9,18,363,264,520,411,604,676,682,905,271,16,821,167,0,621,364,39,100,121,118,166,781,647,252,1,848,447,265,404,60 }, +{ 144,203,326,382,166,418,93,88,96,822,1,141,859,77,744,438,110,269,921,367,521,274,100,39,494,120,403,473,217,576,13,291 }, +{ 13,21,180,125,5,23,191,32,18,16,146,199,115,24,165,118,0,225,22,1,60,197,64,901,375,241,48,12,408,71,522,818 }, +{ 15,515,700,753,13,0,23,8,1,51,82,102,2,33,4,9,180,165,5,77,10,110,12,197,120,260,18,326,351,403,22,457 }, +{ 33,77,102,64,13,23,128,51,141,202,1,40,0,2,117,10,15,4,6,318,269,134,22,515,180,115,177,153,137,196,3,700 }, +{ 174,544,104,525,74,0,151,25,6,624,29,66,2,636,81,45,204,177,64,416,7,644,5,138,222,319,355,77,22,122,789,216 }, +{ 141,304,372,352,291,947,177,269,128,954,77,349,217,202,64,318,498,437,102,864,86,13,115,180,137,5,210,197,32,950,678,7 }, +{ 161,200,53,17,714,27,34,73,472,62,585,56,440,383,136,78,527,19,4,3,106,361,14,250,80,514,377,84,322,390,862,548 }, +{ 32,76,72,21,38,14,89,54,12,37,2,241,5,428,17,1,181,221,350,45,3,4,449,90,148,179,99,292,794,770,477,46 }, +{ 33,23,128,64,141,13,77,51,102,202,2,15,1,3,40,10,5,153,269,515,165,0,117,196,180,318,6,700,137,134,120,22 }, +{ 96,137,30,0,9,39,840,202,669,406,141,530,613,1,180,88,22,160,679,576,28,403,31,219,49,228,829,100,36,15,10,856 }, +{ 180,141,352,1,15,752,115,0,217,365,2,515,13,901,341,317,23,4,197,700,269,5,3,31,753,244,21,165,253,202,51,44 }, +{ 1,2,67,0,28,50,83,65,14,46,103,114,24,38,36,9,69,5,18,7,22,133,55,218,16,124,29,54,96,160,12,480 }, +{ 
180,115,352,317,365,217,752,901,141,15,341,1,515,253,700,0,753,873,2,197,31,137,165,244,4,120,160,44,98,5,202,3 }, +{ 5,32,347,49,13,21,95,713,23,1,77,33,60,64,107,4,126,928,296,850,0,241,197,102,652,195,180,534,165,153,379,10 }, +{ 341,180,365,901,317,115,15,752,515,700,217,873,753,82,0,110,197,141,951,165,1,564,13,351,253,12,10,3,2,4,308,244 }, +{ 17,45,21,3,106,5,155,38,227,32,2,209,62,54,12,243,14,181,552,587,46,540,207,794,37,48,430,119,255,221,770,29 }, +{ 16,24,35,18,7,0,50,1,9,14,75,69,2,5,12,21,60,13,67,71,23,48,10,108,223,181,189,103,46,64,92,51 }, +{ 127,13,98,165,308,23,286,293,258,51,219,395,197,115,301,401,31,391,22,105,457,170,239,276,55,338,629,116,180,479,509,569 }, +{ 539,213,748,840,957,669,30,466,88,217,144,251,863,190,137,93,230,228,679,352,317,203,617,321,258,530,160,219,96,831,816,689 }, +{ 5,48,1,21,2,14,0,36,12,38,32,54,430,181,50,270,72,99,281,45,17,10,46,22,37,218,67,3,290,76,23,51 }, +{ 13,23,0,4,33,51,2,115,141,1,77,217,180,10,9,317,3,102,11,5,15,197,7,202,22,165,40,64,515,6,341,31 }, +{ 13,15,117,515,23,12,37,134,165,700,38,54,457,753,51,64,153,197,14,10,33,82,961,0,99,89,115,719,141,3,4,1 }, +{ 5,21,2,3,1,32,14,12,48,17,0,10,51,23,38,22,4,72,13,54,36,45,137,76,99,114,86,37,11,64,540,430 }, +{ 202,128,77,318,291,33,269,102,275,141,494,342,40,678,0,177,20,210,402,7,4,5,137,6,13,450,403,32,49,120,23,22 }, +{ 1,2,24,14,67,46,69,50,38,103,16,18,75,35,83,29,52,96,5,108,0,7,54,71,149,394,236,309,70,133,220,58 }, +{ 15,515,1,700,0,2,753,13,23,5,180,3,51,4,165,457,12,197,115,6,7,21,9,141,8,901,33,82,120,77,10,110 }, +{ 0,28,65,14,67,2,124,24,1,9,7,69,55,154,36,16,46,114,175,35,83,22,429,18,109,149,68,189,108,336,251,133 }, +{ 56,162,403,3,129,775,99,161,17,40,527,33,880,4,14,128,475,12,548,23,102,202,361,117,34,184,383,200,183,196,64,53 }, +{ 151,2,29,58,112,45,186,113,5,70,52,1,311,6,315,66,61,7,74,27,631,17,80,87,287,243,209,227,14,491,19,869 }, +{ 
6,1,74,2,75,29,25,66,26,70,52,138,67,324,357,42,19,220,14,85,87,108,38,451,309,103,24,69,380,135,114,65 }, +{ 15,515,700,13,23,0,1,120,753,51,180,2,260,202,5,141,77,102,9,450,115,21,197,165,7,137,110,33,12,269,901,4 }, +{ 5,45,17,2,14,46,48,38,181,50,155,3,186,54,61,29,21,227,281,80,540,106,12,400,52,1,58,32,328,171,209,487 }, +{ 16,18,265,121,158,35,60,9,39,7,329,105,252,68,24,1,132,167,159,22,0,49,286,101,21,146,23,327,120,709,5,14 }, +{ 108,467,283,56,389,650,123,412,33,177,899,475,216,453,269,349,619,65,51,730,403,670,23,196,523,128,84,13,401,789,503,543 }, +{ 514,3,11,377,328,4,361,507,57,403,14,880,130,485,176,215,236,38,152,102,211,56,62,757,54,585,300,556,34,555,40,229 }, +{ 3,555,62,266,130,99,507,139,514,12,152,229,215,305,57,40,440,33,403,471,38,56,475,14,361,313,775,328,196,548,123,23 }, +{ 120,202,318,15,77,13,1,450,33,269,515,260,5,128,494,51,23,700,102,141,40,753,326,403,817,137,523,21,177,922,342,7 }, +{ 15,1,515,23,0,13,700,2,51,753,180,5,165,21,197,12,3,120,115,4,141,6,9,7,457,33,386,202,82,8,31,341 }, +{ 15,180,515,82,351,700,10,317,753,115,217,365,141,898,33,901,13,23,110,854,752,77,1,197,4,341,143,36,64,352,102,9 }, +{ 104,289,66,707,214,90,712,64,97,173,20,0,414,194,874,43,32,7,568,560,65,38,426,312,715,192,376,74,835,5,324,147 }, +{ 84,56,0,554,63,65,453,249,123,643,18,26,847,475,511,403,416,561,524,289,370,73,9,19,45,42,719,194,27,467,33,730 }, +{ 21,346,13,350,308,826,197,101,352,68,570,0,165,23,9,841,115,100,509,694,221,230,35,217,569,88,124,749,1,777,212,154 }, +{ 16,92,7,24,60,18,35,140,126,14,50,71,46,330,2,75,246,5,121,267,571,1,230,309,220,0,9,64,146,236,54,108 }, +{ 82,15,515,898,365,700,180,33,341,753,77,901,10,115,55,351,21,5,1,4,13,102,36,217,2,165,752,120,197,117,11,317 }, +{ 16,24,35,18,69,71,140,1,103,7,189,68,0,50,9,108,2,133,60,267,230,46,149,67,167,118,92,14,75,21,191,38 }, +{ 60,71,16,18,7,20,43,118,35,68,375,28,608,0,175,566,154,92,14,149,628,33,22,13,2,10,279,23,107,356,55,117 }, +{ 
187,258,871,295,201,434,219,224,489,384,268,110,261,839,44,699,93,116,36,131,141,228,144,160,940,567,244,406,137,574,98,253 }, +{ 66,7,97,172,192,712,232,324,204,74,43,448,387,426,568,20,526,107,104,135,356,729,173,0,22,5,32,95,2,64,500,560 }, +{ 15,515,700,753,1,4,0,341,13,3,134,2,5,33,11,77,12,10,23,197,365,901,7,40,217,32,21,6,51,180,961,37 }, +{ 0,28,24,9,35,65,16,124,68,55,109,154,7,39,22,149,158,14,175,1,49,252,18,71,2,168,289,419,108,420,67,101 }, +{ 7,16,14,92,2,46,140,24,220,35,38,60,75,1,50,18,87,54,5,126,29,52,278,262,314,107,71,21,172,135,330,394 }, +{ 7,92,16,14,172,126,2,60,140,35,135,314,278,46,24,38,232,107,330,66,5,18,150,246,230,97,52,1,121,563,279,21 }, +{ 6,26,235,53,297,436,27,19,25,73,113,445,90,214,65,42,64,289,250,611,624,32,45,648,614,17,85,491,34,122,200,416 }, +{ 352,141,1,217,854,752,351,180,244,36,110,661,82,258,816,160,295,219,567,224,230,269,922,144,260,268,93,201,137,116,489,202 }, +{ 16,60,35,18,126,107,68,191,92,121,7,14,598,20,493,279,167,446,118,0,28,43,463,55,24,212,375,566,9,150,575,21 }, +{ 15,1,515,2,4,13,0,700,3,5,23,753,341,77,51,115,33,11,180,10,197,141,6,165,7,901,102,40,9,202,217,12 }, +{ 23,51,13,202,21,5,1,120,15,137,128,125,32,2,12,141,33,165,64,515,403,318,700,48,180,7,6,450,115,523,475,260 }, +{ 131,716,224,371,219,187,737,616,385,254,9,98,105,924,31,258,836,39,127,578,49,916,44,761,272,137,944,159,0,242,442,22 }, +{ 15,515,700,1,753,2,5,0,4,13,3,180,11,141,197,10,341,217,33,134,165,6,77,7,317,12,352,64,365,32,102,40 }, +{ 66,74,7,173,174,29,192,2,222,20,226,43,353,52,712,6,0,138,500,204,97,145,64,104,426,673,355,90,25,5,65,87 }, +{ 5,259,786,534,590,493,279,49,13,581,465,21,929,35,941,132,147,32,23,612,362,626,107,121,178,0,146,61,48,939,10,18 }, +{ 2,14,16,7,278,69,135,140,46,24,267,35,92,38,1,189,29,52,309,60,66,75,71,172,74,357,18,87,67,6,230,5 }, +{ 165,13,308,197,391,23,401,15,51,457,180,509,115,569,3,629,961,719,34,758,317,734,14,29,46,2,17,901,38,453,5,217 }, +{ 
1,22,2,14,0,28,7,168,67,49,65,24,36,95,5,105,55,35,12,46,69,16,114,159,194,50,10,9,158,83,164,109 }, +{ 34,453,3,196,130,14,322,11,47,51,377,236,361,4,730,153,514,711,57,440,62,17,161,108,176,59,485,56,162,412,202,117 }, +{ 18,16,21,23,48,13,24,35,121,5,156,60,51,1,7,132,141,221,163,115,0,271,447,340,363,202,125,71,2,781,22,698 }, +{ 165,13,457,23,197,961,629,569,341,41,12,38,401,901,54,51,115,17,15,509,421,37,62,45,719,57,32,328,117,758,157,99 }, +{ 2,1,77,141,33,64,3,102,0,23,13,5,128,10,6,15,180,202,269,40,51,515,7,165,137,117,318,4,700,153,197,352 }, +{ 68,212,0,124,101,9,154,16,562,191,21,149,65,24,35,1,118,167,818,350,520,100,722,841,264,71,13,302,478,23,375,346 }, +{ 98,23,48,598,13,293,541,21,125,121,51,807,0,31,35,259,126,7,386,1,223,783,10,107,199,20,221,144,342,963,49,64 }, +{ 21,13,5,586,1,23,167,48,33,781,647,49,165,18,51,271,77,32,761,118,0,82,391,22,146,141,459,31,197,156,115,4 }, +{ 2,1,5,61,29,7,58,45,14,6,425,32,70,52,290,738,207,21,72,112,66,76,655,17,186,46,64,263,38,0,128,87 }, +{ 39,265,9,100,1,333,363,101,18,411,447,254,166,310,31,98,264,30,639,404,156,286,16,93,593,203,272,682,0,905,44,821 }, +{ 6,2,1,19,29,51,26,108,25,74,5,23,14,114,13,386,133,103,42,66,453,70,309,138,719,324,65,38,64,96,52,75 }, +{ 20,43,356,107,49,858,595,7,414,359,0,5,392,319,97,612,422,819,14,376,173,246,22,470,147,427,230,92,197,33,683,95 }, +{ 0,9,68,35,65,67,114,101,28,1,124,175,336,69,154,103,83,24,189,133,39,16,50,7,2,149,55,251,18,345,230,36 }, +{ 23,13,51,15,0,1,515,115,165,2,5,12,700,202,4,21,141,457,753,197,10,3,180,120,32,9,318,11,453,64,6,269 }, +{ 121,195,60,16,126,107,98,271,146,407,132,35,1,167,199,223,493,191,279,20,18,5,43,7,21,92,48,393,0,362,212,467 }, +{ 31,44,299,116,393,144,492,456,268,22,105,0,367,918,384,434,127,489,98,9,963,125,242,948,1,28,206,49,36,51,93,293 }, +{ 23,13,457,51,165,401,719,758,197,453,961,629,308,14,15,12,730,3,386,569,391,29,739,515,34,828,832,901,115,514,670,341 }, +{ 
105,36,131,22,180,115,341,127,169,1,9,31,64,98,44,365,317,141,272,143,160,55,219,86,197,776,239,187,0,535,13,752 }, +{ 1,6,2,14,66,25,29,5,108,67,65,114,19,38,26,52,74,7,24,18,69,86,36,388,64,51,17,83,23,46,42,75 }, +{ 51,386,23,453,719,13,730,6,457,670,758,19,401,165,2,475,47,26,899,14,108,17,1,5,197,29,894,754,236,74,27,285 }, +{ 252,18,9,101,121,16,132,0,419,167,364,60,604,35,265,363,146,271,39,158,68,109,28,329,848,24,647,907,682,159,212,55 }, +{ 283,503,128,432,26,193,63,269,789,529,102,122,389,275,678,6,25,318,445,4,342,27,573,605,177,862,643,291,216,57,235,59 }, +{ 2,1,29,75,69,52,14,6,46,74,87,7,220,226,278,38,135,66,267,70,16,262,25,24,380,324,357,140,67,394,97,222 }, +{ 97,298,69,7,66,140,189,24,16,267,172,423,60,150,14,314,92,71,81,501,43,35,74,6,517,232,149,607,83,330,18,2 }, +{ 475,421,403,899,51,805,523,958,453,817,23,615,401,801,120,326,202,670,494,730,450,386,115,629,260,576,77,365,569,0,165,13 }, +{ 7,20,14,128,77,97,112,202,2,177,16,415,269,318,275,66,107,43,141,414,135,38,307,10,58,0,6,291,32,5,4,40 }, +{ 24,14,7,0,2,1,22,28,16,65,168,124,35,67,108,109,18,49,10,149,69,158,5,95,289,12,55,6,36,71,46,21 }, +{ 26,80,27,73,122,25,19,17,6,42,684,209,445,573,667,106,45,690,4,611,255,680,297,495,65,59,128,119,483,113,64,53 }, +{ 107,259,362,376,465,20,470,147,595,534,612,683,660,43,5,49,581,0,858,35,427,246,97,786,178,356,14,21,142,878,7,279 }, +{ 131,30,228,190,856,406,224,88,219,530,863,613,778,274,944,816,187,39,100,160,258,31,44,93,1,321,539,36,871,137,435,531 }, +{ 113,6,311,25,45,491,80,611,27,26,209,667,17,73,122,42,684,396,19,85,106,5,614,4,2,255,151,29,1,64,648,61 }, +{ 15,515,700,753,0,1,23,51,120,2,13,82,5,260,9,4,341,77,180,115,141,10,7,12,450,8,202,901,197,351,165,93 }, +{ 219,127,98,258,395,421,924,293,242,201,697,105,276,51,308,23,453,272,401,944,512,137,13,31,284,567,386,365,116,131,964,125 }, +{ 15,180,352,141,515,752,217,82,1,317,854,700,351,753,115,341,110,13,260,120,21,36,33,898,23,10,5,365,4,160,901,137 }, +{ 
129,123,17,257,162,184,205,249,183,769,5,80,3,4,229,130,119,45,90,99,618,106,57,497,12,128,2,84,59,152,27,40 }, +{ 33,102,23,77,64,128,51,13,0,202,10,141,40,15,1,22,117,137,2,86,4,403,269,153,515,196,65,11,700,115,99,5 }, +{ 7,14,2,16,172,107,46,92,5,135,35,202,294,87,38,232,29,97,20,21,24,1,60,220,66,43,12,0,126,52,54,70 }, +{ 403,576,615,523,475,326,805,817,494,421,51,202,120,450,137,453,23,859,260,401,402,77,33,670,0,958,15,197,386,515,165,480 }, +{ 141,352,217,137,0,180,202,349,9,269,23,51,115,291,77,372,13,317,120,752,365,351,93,22,2,341,64,10,82,854,28,18 }, +{ 1,23,13,51,202,141,5,165,21,15,120,180,64,2,197,125,33,102,12,7,137,515,48,128,269,318,93,700,0,403,9,4 }, +{ 25,151,6,145,122,29,174,45,113,74,4,665,42,138,2,614,416,287,19,348,746,0,66,26,1,7,64,243,311,396,81,624 }, +{ 30,190,254,166,100,382,731,829,88,131,264,795,9,93,625,274,438,1,578,613,716,31,44,39,530,36,616,921,265,203,160,77 }, +{ 132,5,21,13,1,23,32,195,379,687,156,121,626,296,48,70,850,146,51,82,883,771,35,49,652,407,60,4,260,0,845,33 }, +{ 9,254,0,49,272,131,39,159,688,101,105,578,518,158,286,28,327,333,68,224,252,219,344,16,22,1,716,31,30,228,24,890 }, +{ 16,7,35,60,18,20,14,68,9,0,28,118,43,92,126,55,107,2,101,154,24,71,5,202,121,109,22,252,21,97,1,621 }, +{ 15,515,700,753,13,1,341,2,0,4,3,5,11,23,10,33,117,12,901,197,6,134,77,8,165,317,21,365,217,7,17,40 }, +{ 78,19,444,47,26,390,27,453,130,813,108,730,711,65,412,122,51,680,113,235,690,196,630,283,128,236,14,64,73,53,200,445 }, +{ 2,7,29,5,61,6,45,1,66,113,112,14,52,315,738,128,32,151,74,16,20,64,70,21,592,0,25,4,425,43,491,222 }, +{ 145,112,74,66,6,29,26,70,19,396,25,87,2,287,135,151,138,222,5,226,42,122,7,307,1,644,45,58,113,651,635,632 }, +{ 92,16,7,60,126,24,140,35,14,232,18,121,246,71,46,267,172,150,107,314,132,146,230,2,278,108,330,199,236,5,38,572 }, +{ 13,115,197,538,569,341,98,55,165,127,365,762,219,286,844,23,170,206,734,638,535,901,169,253,629,0,873,509,180,10,332,258 }, +{ 
58,151,74,53,287,27,29,396,6,70,2,73,5,52,112,26,651,1,297,113,17,75,19,45,334,445,145,34,315,549,436,331 }, +{ 214,289,90,874,104,751,64,65,312,835,204,249,750,194,74,81,875,32,519,288,348,0,174,247,636,715,138,192,784,6,524,280 }, +{ 9,39,28,35,30,0,166,49,1,175,439,158,64,346,36,101,67,364,86,88,274,100,168,55,23,10,420,22,190,141,505,180 }, +{ 341,901,15,515,700,753,1,365,10,0,569,180,2,197,115,31,165,3,5,4,44,22,317,13,9,951,23,253,116,143,762,93 }, +{ 120,202,77,450,260,15,128,318,102,515,494,13,817,700,269,5,403,51,1,33,23,753,82,326,141,342,291,137,21,523,351,32 }, +{ 13,115,241,64,180,32,125,197,165,4,118,22,21,23,16,247,237,28,225,191,95,141,167,5,0,341,288,35,459,18,177,24 }, +{ 16,24,35,14,1,2,7,69,18,46,60,50,267,140,71,189,108,38,75,92,0,5,9,230,67,21,309,335,54,236,394,220 }, +{ 15,515,700,753,898,180,901,341,197,638,10,165,33,1,115,4,77,365,317,13,102,217,117,0,5,2,253,3,82,569,21,752 }, +{ 193,523,18,84,56,730,233,65,4,817,90,33,643,403,91,511,453,240,59,11,214,51,719,196,153,475,32,123,64,847,102,561 }, +{ 112,29,151,2,74,6,66,7,222,145,287,45,5,624,52,25,113,416,58,122,19,70,186,204,4,87,644,549,337,884,32,0 }, +{ 13,0,23,2,1,15,33,3,77,515,141,5,4,217,10,51,64,180,700,115,6,117,11,7,753,40,102,165,197,22,317,153 }, +{ 28,0,1,67,65,9,2,114,83,69,103,50,36,22,55,24,46,14,124,109,35,7,16,38,133,160,389,323,18,12,154,5 }, +{ 121,132,18,167,271,146,101,363,621,9,411,647,16,354,520,60,212,932,1,806,55,0,195,446,68,35,31,364,777,252,407,118 }, +{ 26,6,85,396,122,624,25,19,42,445,64,648,573,416,174,680,665,214,45,348,90,65,194,145,113,881,138,289,112,436,297,544 }, +{ 16,146,18,92,24,199,60,71,121,126,35,108,156,953,271,674,132,7,32,640,360,246,649,118,21,95,5,517,14,9,1,314 }, +{ 51,13,23,453,475,730,719,15,457,403,64,115,33,95,4,523,3,12,21,6,899,102,5,128,401,202,11,141,308,515,22,125 }, +{ 151,396,6,53,27,113,58,26,73,112,74,287,45,29,297,19,145,70,138,445,315,436,34,2,17,573,5,61,549,491,1,80 }, +{ 
223,1,888,774,260,98,269,385,349,202,96,141,421,622,730,863,318,697,87,453,393,418,922,834,751,5,163,335,120,291,352,30 }, +{ 16,60,92,35,126,121,7,150,246,18,107,1,598,24,167,195,14,97,71,279,98,441,191,199,517,146,356,223,298,271,230,0 }, +{ 22,1,105,28,239,170,0,55,95,31,36,301,2,320,98,127,9,49,44,64,35,67,10,86,5,12,109,23,168,13,21,312 }, +{ 2,6,5,207,292,76,1,119,45,32,17,29,61,306,790,58,240,106,14,64,214,151,476,710,7,72,84,128,4,179,70,25 }, +{ 51,23,221,254,115,13,438,530,125,48,21,39,541,960,386,49,1,613,15,840,228,308,627,131,688,401,5,326,421,158,165,83 }, +{ 1,5,2,0,12,22,21,36,10,14,48,86,23,13,32,54,3,4,28,65,51,50,137,37,208,114,9,38,17,7,281,202 }, +{ 363,23,447,182,296,340,1,93,698,478,379,156,284,144,18,269,21,98,141,70,668,411,664,658,110,914,67,937,180,691,335,291 }, +{ 17,32,45,498,41,115,180,197,106,62,54,38,546,165,13,155,468,509,341,243,241,217,542,15,57,536,428,51,117,721,292,129 }, +{ 32,95,64,246,22,92,180,13,5,652,125,241,638,237,7,49,4,126,21,115,197,296,888,316,0,165,774,23,16,392,1,534 }, +{ 15,515,700,753,33,341,13,217,4,141,77,23,180,317,1,10,102,351,82,115,40,5,854,21,137,11,352,901,365,117,197,0 }, +{ 15,120,1,82,93,217,515,260,77,141,13,110,700,351,352,23,180,753,21,854,202,317,64,349,269,51,165,137,5,128,291,36 }, +{ 13,23,51,141,77,0,33,4,115,64,2,10,102,202,217,128,1,177,269,11,7,22,6,21,32,9,180,40,15,3,165,318 }, +{ 478,264,1,520,98,724,9,682,223,664,21,759,13,772,604,100,23,363,411,48,821,5,0,905,909,447,31,265,88,101,166,39 }, +{ 20,29,7,2,77,416,6,128,33,5,0,113,104,32,43,13,491,66,23,21,102,51,74,210,202,525,64,318,10,81,174,14 }, +{ 2,1,5,14,7,58,61,29,45,290,46,38,52,21,32,270,6,592,425,0,75,155,16,48,17,50,72,70,207,24,263,663 }, +{ 80,6,17,209,106,26,483,113,19,469,255,25,378,27,495,833,45,64,161,2,61,667,76,742,32,90,445,5,814,65,887,119 }, +{ 98,223,393,1,354,834,195,791,447,697,284,293,360,541,781,156,51,807,18,664,421,411,163,668,48,31,591,765,883,386,948,23 }, +{ 
679,141,816,36,93,406,876,144,228,137,1,180,669,21,332,251,5,269,116,187,96,351,202,752,317,64,203,831,574,466,855,345 }, +{ 15,515,700,13,1,753,2,0,23,341,3,5,4,10,51,11,33,165,6,7,115,197,12,64,180,153,217,77,9,569,901,317 }, +{ 13,23,202,51,5,21,403,15,120,64,1,450,128,141,12,523,33,165,494,125,2,515,269,7,48,102,318,95,260,180,453,197 }, +{ 16,18,24,60,71,92,146,246,199,35,140,7,9,118,121,108,167,230,126,132,0,640,156,14,68,133,267,360,649,271,64,55 }, +{ 269,141,678,177,202,77,128,318,33,947,40,120,291,349,102,137,64,352,210,864,461,498,13,342,196,23,275,450,954,0,205,111 }, +{ 16,24,92,18,71,60,35,7,108,191,167,246,140,14,126,21,1,68,150,118,149,388,399,9,273,0,121,796,230,48,212,517 }, +{ 2,14,1,29,46,75,52,70,69,171,38,7,58,163,16,5,24,220,67,112,223,54,50,409,155,35,267,186,151,334,394,140 }, +{ 9,252,100,265,166,39,88,404,329,0,1,520,382,812,101,593,264,274,604,676,30,118,68,553,18,664,363,23,639,865,21,411 }, +{ 16,18,35,24,0,60,158,7,22,68,14,49,109,159,55,9,28,71,2,10,5,105,1,118,329,13,344,23,92,20,21,126 }, +{ 15,13,515,700,23,0,753,1,51,2,4,10,77,5,3,197,115,165,961,202,9,457,180,12,141,22,33,120,6,11,318,31 }, +{ 160,93,251,137,317,1,180,36,120,217,345,752,617,352,332,10,96,531,498,318,365,202,141,269,816,341,901,679,143,35,83,968 }, +{ 6,25,42,128,19,59,122,4,85,26,611,27,269,233,45,0,343,91,318,80,11,177,283,73,33,614,2,77,64,138,445,216 }, +{ 95,64,74,7,32,81,51,204,0,20,237,65,56,38,91,23,207,180,347,343,29,6,511,52,49,10,25,18,554,370,14,312 }, +{ 202,120,326,260,450,817,494,318,137,403,128,77,523,553,859,5,704,1,15,23,13,576,7,16,615,51,682,291,515,0,21,234 }, +{ 20,43,107,356,362,126,595,92,359,7,422,319,493,16,858,5,392,246,414,683,60,0,35,945,441,21,259,819,49,97,279,173 }, +{ 25,42,6,77,33,102,0,122,4,690,29,483,210,27,21,19,2,300,18,648,680,119,117,59,1,10,342,12,26,153,91,684 }, +{ 31,44,299,116,125,242,456,599,22,393,0,144,492,28,268,1,9,963,301,105,367,36,127,170,384,434,206,98,918,10,13,93 }, +{ 
410,521,686,367,662,88,335,321,201,96,98,772,144,1,934,921,443,435,284,274,264,551,120,897,44,100,33,225,744,418,909,960 }, +{ 142,178,878,234,132,786,195,202,77,416,147,929,146,522,167,259,687,639,450,271,626,481,590,5,198,212,771,49,0,465,315,427 }, +{ 254,39,131,9,272,0,578,716,310,224,30,49,105,827,518,829,166,333,616,228,613,846,101,219,1,31,890,98,159,938,252,100 }, +{ 230,699,854,473,450,351,831,137,855,217,352,704,800,202,251,498,160,144,206,203,317,201,253,752,418,141,1,332,82,180,443,36 }, +{ 403,202,475,453,494,23,51,77,318,402,13,33,128,102,137,141,120,342,269,0,450,4,899,576,40,421,275,117,217,177,196,64 }, +{ 23,44,98,182,291,144,116,39,110,141,96,82,905,70,367,264,125,93,77,411,120,1,658,202,100,415,107,363,197,30,447,105 }, +{ 15,515,1,13,700,23,77,120,0,753,51,180,202,141,260,5,21,115,2,137,128,9,450,197,365,269,12,326,110,102,318,7 }, +{ 0,32,18,95,207,577,193,29,61,104,64,784,715,102,693,887,81,91,583,671,403,5,52,474,397,180,138,49,37,344,38,263 }, +{ 0,101,9,68,252,16,100,39,166,364,124,24,154,265,212,88,18,35,329,419,28,118,71,30,65,158,191,55,1,694,21,676 }, +{ 16,24,191,18,35,71,167,118,149,68,212,9,0,1,21,108,101,92,60,375,302,7,589,755,124,674,350,48,562,246,13,363 }, +{ 2,14,69,24,1,67,46,16,38,103,29,267,7,35,189,135,278,71,108,18,83,309,52,6,149,388,75,236,60,0,150,66 }, +{ 13,1,23,0,4,2,51,15,180,33,3,115,5,515,141,10,77,700,11,9,197,341,202,165,217,102,22,7,753,317,365,6 }, +{ 447,1,698,411,31,363,98,5,919,4,156,125,759,691,13,64,459,354,44,21,48,293,30,914,478,225,82,120,2,922,848,839 }, +{ 854,82,351,217,141,180,352,15,515,752,1,700,317,898,753,244,10,21,922,115,77,36,4,260,64,110,372,13,5,365,120,11 }, +{ 0,1,4,13,5,2,82,33,3,120,10,23,9,11,77,260,21,102,8,31,40,6,351,51,64,450,22,117,93,110,7,457 }, +{ 51,13,403,23,12,475,1,2,21,5,453,523,115,202,817,7,0,99,3,6,450,120,494,64,22,95,49,899,10,37,32,141 }, +{ 180,752,352,141,498,864,317,217,9,0,115,237,230,39,30,197,83,1,930,64,35,365,372,13,579,88,702,36,101,901,482,21 }, +{ 
16,24,0,7,22,18,28,35,14,158,71,2,109,60,1,168,49,154,124,68,10,55,92,118,159,9,5,747,95,105,65,6 }, +{ 15,515,700,753,0,23,1,13,341,51,120,77,141,4,137,33,260,82,202,9,180,5,351,2,291,10,11,901,21,115,40,128 }, +{ 24,68,35,149,18,16,0,7,9,14,189,108,69,65,67,1,71,2,118,28,140,101,114,336,230,124,175,133,46,55,251,154 }, +{ 13,308,197,115,125,9,165,237,391,23,509,569,546,28,49,629,22,338,317,254,749,180,468,159,903,386,217,352,558,39,36,734 }, +{ 64,90,32,217,77,4,141,216,172,312,128,13,86,33,597,147,352,95,5,115,875,22,59,11,102,14,182,437,97,177,874,707 }, +{ 1,22,36,0,105,28,2,67,95,49,55,5,239,12,86,9,83,170,312,64,31,21,23,10,164,50,114,159,208,13,7,320 }, +{ 9,18,16,0,159,105,101,252,49,310,24,68,35,39,265,272,7,1,60,28,455,890,329,557,118,286,55,137,327,167,5,13 }, +{ 129,123,214,249,618,17,5,257,205,184,460,76,2,162,769,245,90,106,128,45,119,1,183,4,3,12,179,64,6,229,99,209 }, +{ 51,453,719,457,23,13,730,899,475,386,4,15,11,12,670,196,515,523,961,401,153,3,700,99,753,117,403,32,120,165,57,0 }, +{ 173,66,192,204,20,74,104,636,7,43,289,426,825,712,560,214,81,750,65,97,707,0,90,414,64,348,32,500,22,861,95,6 }, +{ 13,23,1,51,5,21,141,120,202,15,165,2,515,180,12,125,0,64,82,700,197,269,32,48,260,128,115,93,9,137,33,753 }, +{ 200,34,322,78,472,390,27,714,19,14,136,161,453,176,236,444,59,3,62,128,108,57,283,862,73,53,47,17,412,813,4,56 }, +{ 33,347,66,204,426,498,56,172,97,95,5,681,546,22,10,0,135,180,4,241,19,174,6,353,263,21,7,370,42,197,27,808 }, +{ 131,224,219,187,385,371,258,442,254,737,31,98,836,127,924,944,44,871,908,716,39,827,201,574,116,137,36,1,276,242,578,616 }, +{ 1,15,2,180,4,141,13,515,0,5,3,115,700,901,341,23,217,352,753,51,197,77,317,33,365,752,165,21,6,7,269,93 }, +{ 370,91,718,74,81,510,397,66,636,240,355,84,138,511,18,278,6,681,701,289,90,5,214,582,64,104,0,643,192,65,750,32 }, +{ 15,515,700,753,13,120,1,0,165,2,197,23,260,180,4,82,51,386,8,5,12,10,3,141,351,341,326,9,450,7,64,6 }, +{ 
32,2,76,5,1,292,72,45,476,214,21,241,29,14,17,48,129,90,179,460,464,123,290,148,519,205,3,263,249,38,710,89 }, +{ 13,165,115,17,197,569,23,509,457,45,32,41,106,180,62,38,659,734,155,536,341,629,961,873,587,54,431,37,391,99,405,428 }, +{ 68,24,35,16,0,101,9,124,154,71,149,65,18,175,28,118,7,55,302,108,92,14,22,346,1,39,429,252,375,364,10,67 }, +{ 0,9,1,68,230,65,35,69,83,23,101,13,141,67,217,352,21,39,16,88,28,124,212,100,115,154,51,64,30,36,10,317 }, +{ 100,265,88,909,410,382,812,593,1,213,321,30,252,230,352,264,9,166,689,39,676,98,21,466,724,639,478,217,13,48,553,101 }, +{ 113,61,198,904,43,0,5,37,899,325,20,59,33,523,204,725,817,389,470,329,222,40,174,58,22,453,690,848,122,104,788,105 }, +{ 16,24,141,18,7,0,71,140,35,269,75,352,12,9,108,217,5,330,60,64,199,70,22,13,486,246,318,133,65,50,23,498 }, +{ 271,167,121,60,18,191,146,199,16,1,446,132,575,212,463,354,126,35,598,566,727,98,107,21,608,955,640,407,5,24,223,68 }, +{ 15,515,115,217,700,13,317,753,141,180,33,23,110,120,4,341,82,10,1,260,365,36,64,854,351,21,51,352,137,77,40,0 }, +{ 173,7,97,356,43,107,20,387,729,104,426,232,560,595,359,392,414,707,885,81,5,0,66,858,612,49,861,14,22,32,819,230 }, +{ 15,515,700,753,13,23,51,82,0,33,165,120,196,4,1,2,197,453,260,351,180,12,40,8,386,110,5,326,9,141,217,457 }, +{ 9,100,120,30,77,795,137,82,202,39,264,827,578,127,0,166,373,318,18,326,141,260,1,450,731,31,33,395,217,291,341,254 }, +{ 14,2,67,1,24,69,0,28,65,7,46,18,114,108,36,83,38,398,9,16,124,133,103,154,50,55,22,267,29,160,35,547 }, +{ 14,7,69,24,66,16,2,267,189,67,71,150,140,97,18,60,172,35,6,1,38,149,388,92,83,135,108,74,462,380,29,36 }, +{ 15,515,700,753,1,13,2,0,4,341,5,3,23,365,11,117,180,10,12,33,134,115,77,197,217,165,6,7,317,102,21,9 }, +{ 1,22,0,12,5,2,36,28,21,10,86,13,23,49,128,9,95,51,55,96,208,141,48,202,4,137,37,64,105,3,50,7 }, +{ 605,630,63,123,736,650,65,108,444,368,561,389,19,25,42,619,122,194,183,27,53,33,84,26,297,813,114,73,256,235,249,216 }, +{ 
127,39,9,0,31,371,98,254,1,385,395,44,30,836,187,131,100,116,284,578,299,166,28,21,737,16,276,272,23,49,137,935 }, +{ 2,61,6,29,45,151,1,655,7,207,32,5,112,425,17,76,833,4,14,64,58,106,119,25,113,128,72,52,70,21,292,790 }, +{ 2,5,17,14,3,29,23,27,13,401,46,6,51,58,1,453,45,53,34,52,133,19,236,26,181,114,99,366,151,108,218,38 }, +{ 31,22,170,1,301,44,127,98,36,28,55,105,239,0,338,116,512,299,293,125,86,10,242,395,2,13,9,64,841,23,95,685 }, +{ 1,21,2,14,5,32,48,50,38,270,46,76,290,72,45,54,17,0,155,221,263,207,37,281,430,3,89,12,181,408,36,67 }, +{ 17,106,119,378,84,240,62,80,383,136,306,3,56,790,742,5,207,504,64,440,32,128,45,2,123,209,14,4,61,57,297,667 }, +{ 13,15,1,515,23,2,0,700,4,115,3,51,5,10,753,180,33,341,217,11,165,317,365,197,6,77,40,64,22,9,7,117 }, +{ 772,335,96,744,1,367,662,686,652,897,303,264,521,31,225,410,141,520,260,116,64,44,321,98,144,88,919,966,340,269,349,284 }, +{ 1,0,5,28,36,2,12,22,83,67,65,50,24,14,9,96,21,218,18,114,48,281,54,10,7,160,181,103,37,23,133,99 }, +{ 25,6,145,42,138,81,174,348,525,544,26,74,85,280,287,648,746,91,66,0,29,396,204,64,636,90,122,194,355,104,65,233 }, +{ 16,24,0,18,28,158,7,35,49,22,68,159,55,1,14,109,105,2,9,71,65,154,124,95,424,344,60,239,118,577,21,10 }, +{ 352,854,699,230,93,689,137,144,217,160,251,36,669,202,351,120,617,855,752,203,332,82,450,180,141,748,831,30,258,201,1,816 }, +{ 15,13,515,700,1,2,0,753,23,5,4,3,51,33,10,115,11,317,217,77,180,341,117,165,6,134,197,153,64,9,102,7 }, +{ 104,20,43,173,66,319,0,77,202,7,198,5,97,580,355,74,2,204,174,52,712,234,426,155,102,192,32,4,500,337,226,904 }, +{ 13,23,15,51,1,515,0,2,5,700,141,4,753,165,115,12,3,21,457,10,180,269,32,6,197,202,9,7,120,11,77,33 }, +{ 15,515,700,753,0,1,13,2,23,115,4,317,8,3,5,51,9,341,10,217,22,365,33,457,6,180,77,901,197,120,18,7 }, +{ 15,515,700,753,1,0,2,3,4,13,5,141,23,16,82,217,457,10,365,180,9,317,51,21,269,898,64,202,11,12,318,341 }, +{ 2,5,1,14,50,38,29,17,114,46,133,3,45,21,58,171,181,36,218,12,6,52,0,48,137,65,361,23,155,4,285,51 }, 
+{ 0,68,9,65,101,124,35,212,16,149,154,100,24,1,114,336,67,589,252,39,71,189,69,562,18,13,30,398,118,88,265,264 }, +{ 1,2,14,22,0,7,67,65,28,36,24,46,168,5,86,69,38,16,49,12,289,10,194,50,83,114,95,6,18,23,55,158 }, +{ 13,23,15,51,515,0,700,4,1,753,2,10,115,961,457,12,33,11,3,5,197,9,165,77,102,403,453,40,64,22,37,59 }, +{ 15,515,700,753,0,1,2,13,23,5,51,901,8,9,180,3,7,82,4,120,12,10,719,341,6,31,141,457,197,22,115,93 }, +{ 100,252,88,101,0,265,9,724,48,1,21,352,213,676,410,382,321,230,30,329,593,909,39,812,553,217,23,689,520,264,166,419 }, +{ 2,5,1,58,171,14,46,50,29,52,45,38,186,155,67,54,151,281,334,61,48,96,17,181,103,400,502,227,21,223,12,69 }, +{ 23,120,13,1,202,141,51,21,165,128,260,15,5,269,137,64,33,180,82,318,93,197,77,326,515,125,110,700,450,2,32,48 }, +{ 341,197,10,901,13,15,8,638,569,515,479,23,180,873,700,165,143,642,0,961,753,951,1,115,509,499,116,12,498,242,82,206 }, +{ 15,1,23,13,515,21,120,51,2,141,202,700,5,180,165,0,753,197,12,7,33,260,352,137,269,4,82,128,48,9,110,6 }, +{ 2,29,50,58,1,6,5,52,14,262,17,46,27,53,151,34,171,74,324,26,38,309,45,113,19,96,287,396,223,67,73,583 }, +{ 13,23,141,51,4,202,0,115,77,2,33,217,5,317,180,64,10,269,3,9,15,21,1,128,102,137,318,11,352,515,22,31 }, +{ 1,67,0,24,50,5,14,18,16,69,2,9,103,35,83,12,96,28,54,7,58,223,21,46,281,48,65,181,22,38,36,108 }, +{ 13,23,141,51,77,64,202,115,33,102,128,4,0,269,10,21,217,32,180,318,9,137,2,11,22,291,7,177,16,31,165,197 }, +{ 317,115,180,365,873,498,217,341,13,752,482,197,569,352,1,901,36,23,457,468,165,346,546,143,509,134,579,876,868,2,332,21 }, +{ 184,257,205,229,152,17,57,497,266,432,452,524,5,619,381,32,4,90,2,12,313,128,45,59,245,106,3,471,129,769,339,214 }, +{ 13,23,1,0,15,2,4,515,51,3,10,33,5,700,115,180,753,77,11,365,341,217,9,6,197,7,102,165,317,40,22,64 }, +{ 626,70,771,687,379,846,767,761,518,878,82,481,31,786,49,591,178,163,407,44,87,13,845,125,590,371,195,120,98,557,937,351 }, +{ 
264,1,410,909,772,897,686,521,335,478,98,96,691,639,100,44,284,382,31,321,744,88,914,724,662,765,223,9,682,363,0,367 }, +{ 13,23,1,2,0,15,51,515,5,10,4,33,115,77,180,700,3,141,217,40,6,753,317,197,64,165,7,11,102,9,341,22 }, +{ 141,77,13,64,269,23,115,21,318,217,5,202,102,33,137,2,15,291,177,51,48,180,32,4,515,352,128,7,0,10,96,11 }, +{ 13,15,23,515,51,0,700,753,1,2,4,10,33,11,961,453,115,40,457,14,12,3,9,5,165,401,197,77,22,21,64,102 }, +{ 1,22,0,36,2,31,5,12,13,105,28,9,49,86,141,21,23,95,128,55,44,115,170,10,164,98,180,4,137,239,83,51 }, +{ 100,101,88,0,252,9,265,30,21,39,759,724,213,329,321,13,419,68,562,382,676,352,694,35,553,410,1,166,909,593,230,23 }, +{ 539,88,30,190,321,530,840,144,669,435,957,748,778,100,96,418,203,213,1,131,410,228,466,274,36,382,219,863,613,83,822,352 }, +{ 7,97,92,173,298,107,43,314,232,140,16,356,20,387,729,362,126,359,246,14,230,501,426,441,0,5,560,66,104,779,35,60 }, +{ 2,14,7,1,58,5,46,16,38,70,75,45,24,155,29,0,21,52,61,163,220,50,69,270,35,48,32,171,18,6,64,54 }, +{ 447,411,363,664,647,98,621,1,354,271,223,478,18,777,781,936,360,759,167,132,121,48,21,156,9,195,118,293,23,691,13,264 }, +{ 2,1,14,65,36,67,0,7,46,22,69,5,38,24,28,6,83,29,86,114,168,50,124,208,12,18,108,10,194,484,103,16 }, +{ 421,386,51,791,730,958,165,801,23,453,697,403,615,13,221,523,24,899,401,326,551,670,576,102,18,33,125,77,566,115,203,197 }, +{ 104,319,422,945,0,81,20,43,715,32,784,693,879,7,397,74,306,207,52,681,671,2,61,173,6,636,904,95,887,5,18,192 }, +{ 2,29,1,46,14,52,70,262,6,26,50,67,75,96,309,38,103,112,58,19,5,163,145,83,74,220,223,357,24,69,331,25 }, +{ 786,929,590,771,687,626,941,178,465,259,70,5,13,21,35,534,107,518,132,49,878,48,146,121,379,279,31,767,147,195,108,125 }, +{ 5,2,186,45,17,29,48,50,14,61,46,155,400,1,227,171,52,58,38,54,430,209,80,281,3,106,536,311,181,243,21,502 }, +{ 0,32,64,95,817,494,342,403,207,202,194,389,453,365,312,180,316,5,690,237,848,577,450,61,102,523,475,289,49,241,65,482 }, +{ 
23,13,51,15,202,515,1,0,2,12,115,4,700,5,165,753,77,457,21,141,128,10,3,64,403,32,197,318,9,11,33,117 }, +{ 119,2,6,76,5,17,45,292,306,240,32,1,19,84,64,61,4,209,710,80,26,0,106,27,214,25,128,129,29,179,3,113 }, +{ 15,515,700,1,753,0,13,23,180,120,51,2,5,33,165,197,9,450,7,260,115,523,4,12,202,141,82,77,21,102,8,6 }, +{ 6,138,74,280,222,85,66,226,25,42,87,204,64,337,29,135,95,174,235,26,145,65,19,32,792,294,112,52,256,2,5,22 }, +{ 39,9,0,101,333,158,49,252,310,254,272,68,16,18,159,286,344,455,30,109,627,327,24,105,419,100,364,22,35,1,329,709 }, +{ 21,5,32,14,2,1,38,72,76,54,17,3,48,221,270,0,45,46,12,181,37,89,36,50,540,290,430,10,4,741,99,23 }, +{ 120,137,202,269,141,260,318,450,922,494,77,291,82,15,5,351,128,1,515,326,64,854,700,352,342,21,753,678,349,32,523,90 }, +{ 16,24,68,35,71,18,149,118,191,167,9,0,212,124,65,246,7,67,140,189,399,101,133,60,1,108,267,114,69,92,695,154 }, +{ 28,0,9,67,1,22,109,36,55,65,194,114,39,83,49,69,2,35,103,50,158,208,86,420,168,289,505,24,7,185,5,323 }, +{ 147,325,198,427,142,178,202,798,5,376,20,318,259,43,120,450,77,234,534,904,470,465,878,725,329,14,315,0,260,858,70,61 }, +{ 28,0,65,9,109,1,55,67,35,22,24,39,289,7,175,14,114,2,158,124,420,194,68,16,336,36,49,69,168,570,154,505 }, +{ 141,180,13,115,1,23,4,269,2,202,0,317,217,51,15,5,21,352,77,318,3,752,197,10,165,365,137,341,9,515,33,64 }, +{ 15,515,700,0,753,1,2,13,23,9,51,5,4,901,33,7,8,3,12,93,180,120,197,6,82,341,10,141,22,260,457,115 }, +{ 6,74,66,1,25,75,324,380,278,26,138,85,135,500,87,42,220,841,97,350,29,19,70,226,38,21,52,606,235,889,2,14 }, +{ 13,23,51,12,4,15,453,0,457,1,403,165,115,3,11,2,64,5,401,10,515,37,202,33,40,32,99,475,197,700,308,17 }, +{ 788,180,5,83,693,319,314,4,32,21,17,11,817,3,510,498,33,12,24,104,814,120,64,117,306,804,523,450,288,160,102,43 }, +{ 15,1,515,13,2,700,23,0,4,753,3,341,5,51,33,11,10,6,77,7,115,102,180,165,141,9,197,217,901,40,12,64 }, +{ 
101,18,9,167,520,16,0,118,60,212,604,364,694,24,55,252,68,917,264,35,1,121,146,363,39,100,806,5,21,166,191,28 }, +{ 13,23,0,33,51,141,77,4,64,2,115,217,9,102,7,202,21,10,180,3,15,128,5,269,6,32,11,16,165,352,22,317 }, +{ 66,135,97,74,172,6,278,7,204,324,138,174,29,85,2,87,25,140,92,192,52,38,802,69,448,500,808,620,22,1,280,232 }, +{ 2,24,69,6,97,7,1,0,14,298,423,66,67,29,150,25,189,267,124,74,607,18,36,81,172,33,83,38,52,273,71,809 }, +{ 0,9,158,39,68,49,109,16,24,333,35,344,101,22,159,254,272,30,124,65,28,18,793,154,310,252,327,105,627,419,286,55 }, +{ 15,515,700,10,753,33,77,180,4,341,1,197,13,115,365,23,901,317,5,102,11,217,165,117,141,40,2,3,253,21,134,55 }, +{ 100,166,382,478,265,264,88,39,98,1,404,274,9,593,724,921,639,438,363,682,411,31,30,812,96,447,821,905,252,0,223,435 }, +{ 19,283,436,53,297,26,813,432,27,128,42,25,390,503,122,736,73,123,605,63,389,529,630,250,690,65,381,444,6,269,108,216 }, +{ 33,202,13,128,494,0,51,141,269,1,4,2,102,180,15,137,65,95,6,450,77,40,117,59,457,36,196,817,134,86,49,515 }, +{ 18,16,60,68,101,167,191,118,35,121,9,212,55,0,126,1,24,647,199,146,520,107,628,621,363,71,21,28,346,92,806,727 }, +{ 352,230,217,531,160,93,36,669,748,854,689,258,137,871,728,699,752,251,574,202,373,351,228,120,717,260,144,219,268,82,816,1 }, +{ 15,515,700,753,13,23,0,51,8,1,4,82,165,77,110,33,10,180,5,202,11,22,120,12,9,197,115,93,403,141,40,351 }, +{ 0,9,101,217,35,88,352,100,39,175,30,68,562,752,13,317,252,115,180,197,64,1,83,141,65,213,165,230,194,36,28,265 }, +{ 32,64,5,470,288,90,21,147,0,95,356,22,20,519,835,312,819,18,247,182,11,97,13,4,387,49,43,298,316,48,107,7 }, +{ 16,14,24,1,7,2,35,0,5,50,18,69,46,12,58,75,9,67,70,163,21,54,38,48,223,502,281,37,140,60,28,10 }, +{ 18,265,9,252,39,195,354,411,1,16,132,101,121,682,167,203,5,363,146,593,35,333,21,271,60,13,100,0,156,327,7,520 }, +{ 4,13,1,115,141,23,2,180,5,0,51,3,217,202,77,15,33,269,341,318,317,21,165,515,11,10,197,365,9,137,64,352 }, +{ 
7,24,16,14,71,35,18,92,140,189,108,149,68,60,69,150,2,230,97,66,0,458,67,1,65,251,38,314,388,267,36,46 }, +{ 1,22,31,36,0,2,44,5,141,105,180,170,12,64,13,98,86,55,23,21,28,164,115,127,10,125,128,4,9,239,352,197 }, +{ 66,7,97,2,192,20,52,43,135,74,560,107,104,0,750,147,414,29,580,173,324,376,226,194,77,174,204,38,356,64,16,470 }, +{ 0,101,9,68,35,124,24,65,39,16,252,100,154,166,28,364,149,694,30,88,55,346,1,419,71,439,265,289,22,21,175,158 }, +{ 39,9,100,30,127,0,737,856,31,836,827,254,98,931,166,88,93,1,44,190,131,228,120,395,625,385,863,264,219,373,110,28 }, +{ 5,17,2,3,21,45,14,155,48,32,38,1,328,181,186,46,23,51,12,61,227,29,106,54,99,133,62,832,13,37,514,543 }, +{ 131,613,30,224,228,716,274,100,827,406,219,856,39,190,31,88,1,166,9,44,829,863,931,93,0,187,625,924,127,98,137,254 }, +{ 352,217,64,141,752,269,180,864,437,372,954,115,498,177,77,349,317,318,579,291,947,197,247,0,23,717,237,304,128,457,776,678 }, +{ 141,217,180,317,352,115,15,341,1,23,13,365,515,752,2,64,0,5,498,700,372,165,51,237,753,77,244,197,137,4,21,253 }, +{ 5,2,17,14,1,45,3,38,21,29,181,58,46,48,50,133,114,171,61,155,32,6,186,281,361,12,36,54,4,13,52,514 }, +{ 6,2,119,25,790,4,45,483,655,113,1,29,76,26,32,19,887,17,128,0,292,833,59,61,106,64,77,814,14,151,84,42 }, +{ 13,23,2,0,51,4,1,115,141,3,5,180,33,217,77,9,202,11,7,15,10,6,317,64,21,197,515,165,102,128,22,269 }, +{ 13,23,51,202,21,5,1,15,141,165,120,2,115,12,32,0,515,128,318,64,125,700,4,403,197,453,180,457,3,7,10,6 }, +{ 9,39,31,30,0,127,1,44,100,131,98,187,385,276,88,442,219,908,254,116,49,166,935,28,201,36,141,827,137,299,284,21 }, +{ 22,49,28,109,9,185,105,95,1,131,159,272,36,67,86,254,39,55,35,0,505,31,83,169,208,327,286,98,168,535,312,708 }, +{ 5,1,2,21,0,12,48,22,10,14,36,3,32,17,23,54,86,38,4,51,13,37,137,50,65,281,114,45,28,99,58,202 }, +{ 141,269,352,82,217,351,180,854,372,922,752,1,15,260,317,318,515,202,64,700,120,349,954,753,77,35,67,717,898,137,365,115 }, +{ 
144,203,613,418,326,406,96,669,137,679,1,228,494,822,840,317,36,83,855,160,817,859,856,816,217,831,345,93,876,77,44,251 }, +{ 15,13,515,23,700,753,51,1,33,0,202,21,2,5,180,141,120,165,217,82,12,117,4,352,269,197,115,32,3,9,134,260 }, +{ 13,23,115,1,0,51,4,77,2,33,15,141,10,5,341,180,515,3,217,202,9,365,317,64,700,102,11,165,197,22,753,7 }, +{ 531,943,373,160,728,93,206,260,261,559,964,269,717,535,332,384,365,295,110,533,141,10,180,352,244,137,120,55,959,564,36,253 }, +{ 1,31,36,170,22,55,44,10,86,64,127,0,2,98,301,164,740,338,237,143,5,125,116,13,242,141,299,180,23,169,105,12 }, +{ 13,23,141,77,51,4,64,32,33,202,115,269,102,128,21,0,177,180,318,90,40,10,7,5,137,15,217,352,9,291,59,22 }, +{ 202,120,260,318,77,15,450,269,1,82,33,23,141,13,51,515,351,128,700,5,64,326,137,21,102,110,753,494,93,523,817,165 }, +{ 523,899,102,33,730,15,23,403,719,117,153,13,515,51,475,4,700,5,453,817,196,753,494,40,202,120,1,2,450,457,17,421 }, +{ 202,403,494,450,120,817,523,475,318,453,33,402,128,77,13,51,260,576,342,102,15,23,515,4,700,5,82,753,326,210,137,615 }, +{ 2,29,112,66,7,52,70,151,58,87,135,5,74,226,307,6,14,186,1,45,549,172,644,25,113,287,46,155,334,64,294,97 }, +{ 1,77,349,291,260,120,652,102,5,39,64,269,9,33,340,342,13,98,888,698,23,296,100,318,51,202,87,137,638,128,50,850 }, +{ 1,13,15,2,0,4,23,515,5,141,180,3,700,341,115,51,753,269,77,901,197,352,217,33,21,11,365,6,165,202,7,317 }, +{ 7,14,16,2,46,5,70,58,1,38,24,35,92,163,0,75,21,18,50,54,140,12,87,220,155,69,171,23,60,9,13,307 }, +{ 2,29,66,226,135,7,87,74,52,278,6,75,222,220,294,70,97,1,145,25,172,262,324,38,69,112,331,92,5,14,140,26 }, +{ 15,515,700,753,33,77,117,4,1,102,134,40,153,11,13,196,217,21,5,51,23,115,32,3,2,202,141,137,128,291,48,177 }, +{ 15,217,82,515,351,141,317,1,13,700,260,77,110,120,115,854,23,753,180,51,21,36,137,922,5,64,365,352,291,202,93,341 }, +{ 31,190,30,373,120,110,863,88,44,127,908,856,260,318,82,98,93,187,836,717,935,39,442,131,141,254,228,219,1,968,77,116 }, +{ 
23,13,308,9,165,115,51,21,401,125,49,39,197,391,159,254,217,743,28,438,773,629,558,386,341,95,32,317,876,679,109,166 }, +{ 49,5,43,165,7,0,21,104,125,22,173,422,64,13,623,102,20,18,314,95,91,141,23,31,193,51,391,900,779,558,92,232 }, +{ 1,4,13,2,15,0,23,515,77,3,341,33,5,700,115,51,202,753,141,180,11,10,102,217,6,901,40,7,197,318,317,365 }, +{ 93,843,295,120,36,160,206,261,10,137,567,110,384,141,943,268,201,332,258,55,1,180,64,116,44,144,699,203,282,31,260,373 }, +{ 81,7,192,426,43,173,172,104,879,91,5,712,715,526,6,97,568,95,448,66,33,861,560,32,49,20,0,636,232,825,2,22 }, +{ 475,403,51,453,33,102,13,23,494,202,0,196,15,77,153,18,4,117,515,450,318,22,730,128,700,421,65,753,269,402,134,817 }, +{ 141,269,260,318,202,120,352,349,82,351,1,5,854,137,64,291,15,922,180,851,32,77,515,372,21,700,7,217,13,947,33,753 }, +{ 15,515,700,753,1,13,0,2,4,23,3,5,180,115,197,12,51,165,217,10,961,9,6,141,352,21,8,7,33,77,457,120 }, +{ 39,166,9,30,0,101,274,404,252,333,190,100,158,438,310,88,68,265,656,21,1,530,329,344,49,539,625,254,13,131,48,419 }, +{ 0,1,28,9,22,12,65,83,67,36,5,2,50,55,96,109,16,13,24,23,21,238,49,18,285,160,128,39,69,114,7,323 }, +{ 15,515,700,1,0,753,23,2,13,51,5,180,115,6,3,9,197,12,457,120,7,165,901,82,4,21,8,141,31,33,719,341 }, +{ 9,39,30,0,28,166,22,49,180,1,352,35,317,158,88,141,498,131,115,345,752,128,228,217,100,83,219,930,13,251,365,36 }, +{ 5,61,45,2,80,29,311,209,6,17,58,1,151,106,454,667,243,70,52,496,287,592,255,738,64,74,483,14,27,32,112,19 }, +{ 31,125,22,44,299,456,685,242,599,116,170,28,0,1,492,393,506,144,558,10,268,301,239,23,13,36,963,367,55,206,105,95 }, +{ 187,258,926,574,839,93,228,860,406,219,871,160,137,531,224,116,120,902,669,201,36,131,44,144,843,533,318,384,442,1,434,268 }, +{ 7,107,75,16,87,9,64,177,24,18,291,77,349,141,60,232,23,0,51,269,132,14,5,21,70,32,678,112,126,121,71,947 }, +{ 15,515,700,753,1,0,2,13,5,3,23,180,4,115,901,51,6,8,961,9,7,10,12,82,197,22,141,341,33,120,365,457 }, +{ 
13,23,51,1,5,202,2,12,15,21,165,141,0,115,3,4,32,515,197,10,180,318,128,120,64,700,6,7,403,269,457,137 }, +{ 1,2,0,77,64,3,141,13,33,15,23,10,6,102,5,515,180,4,117,7,700,165,11,217,269,40,753,115,128,17,197,134 }, +{ 345,531,332,269,260,317,717,752,373,351,180,352,728,82,10,365,160,533,217,143,498,251,244,93,341,901,36,1,141,898,55,864 }, +{ 16,7,33,189,92,77,388,60,140,35,102,24,14,1,230,21,150,117,733,314,18,915,71,13,108,134,5,64,69,2,98,22 }, +{ 142,202,234,178,5,786,77,49,70,0,416,450,639,878,1,48,21,929,147,259,315,455,198,120,12,481,163,113,846,329,318,22 }, +{ 81,715,192,0,173,712,681,104,636,91,74,20,750,370,7,718,95,879,22,43,825,560,422,64,207,49,172,18,397,10,426,319 }, +{ 13,23,15,51,515,0,700,753,4,1,961,2,10,115,457,11,33,453,3,5,9,40,12,197,165,77,401,475,64,102,22,569 }, +{ 64,297,5,445,95,61,250,311,80,34,17,312,45,2,86,472,58,14,180,53,22,151,869,738,247,237,29,1,128,165,21,288 }, +{ 16,24,18,71,7,35,118,92,14,154,60,68,0,149,28,302,124,150,55,175,2,9,97,1,429,20,108,273,22,65,43,126 }, +{ 195,360,156,771,132,163,626,687,591,371,883,146,121,846,70,586,379,13,293,98,407,48,761,296,354,18,31,1,55,49,21,105 }, +{ 202,13,77,23,318,33,51,0,4,141,5,21,217,32,291,102,64,128,15,10,9,494,269,137,515,403,1,31,117,700,120,317 }, +{ 7,192,97,81,172,66,426,173,43,715,712,232,861,879,104,330,568,298,74,893,885,526,387,825,92,140,91,14,636,6,5,448 }, +{ 2,1,14,6,67,7,65,69,24,36,66,124,108,83,38,29,22,86,0,18,484,5,28,46,12,10,25,302,150,16,650,74 }, +{ 33,77,13,202,102,4,0,23,128,51,141,64,318,22,403,269,137,10,15,40,494,117,32,59,11,153,1,21,177,196,515,115 }, +{ 269,141,318,77,349,291,217,202,33,15,372,304,515,22,102,177,351,700,352,120,5,137,10,317,260,753,64,851,854,403,49,21 }, +{ 6,74,66,85,138,25,87,42,135,26,226,222,280,29,75,500,220,278,792,70,19,2,1,294,204,64,32,145,853,112,52,174 }, +{ 9,0,105,39,16,18,1,101,272,31,127,98,24,518,333,252,310,28,68,737,846,371,158,916,938,49,30,7,286,35,301,455 }, +{ 
2,1,14,67,24,46,83,108,69,29,38,103,114,36,6,133,18,0,28,7,65,52,236,75,50,398,5,309,135,16,278,160 }, +{ 22,1,28,105,49,95,0,2,67,55,36,239,168,159,65,35,14,170,320,164,9,7,10,5,114,12,83,64,194,109,24,301 }, +{ 7,66,172,97,92,140,232,568,298,14,192,314,16,380,135,324,2,330,74,38,357,448,126,69,35,5,107,6,387,60,204,572 }, +{ 2,29,1,14,5,6,46,133,114,50,52,26,218,108,19,13,366,236,27,45,70,17,58,23,86,51,137,65,112,38,25,12 }, +{ 6,1,74,25,2,26,29,66,42,19,75,14,388,67,108,70,52,85,103,65,38,138,357,133,114,594,324,516,603,96,309,69 }, +{ 22,9,28,1,36,49,109,105,86,95,131,31,169,39,0,141,272,159,44,55,98,180,13,30,185,115,83,128,352,137,64,208 }, +{ 203,822,326,23,77,859,403,494,576,39,473,182,33,1,691,100,18,217,13,817,411,447,363,102,93,966,96,478,291,704,310,120 }, +{ 15,515,700,753,33,77,117,4,102,134,115,153,13,1,40,217,11,196,341,2,5,3,23,317,365,0,21,291,32,51,12,569 }, +{ 15,515,700,753,13,0,23,1,8,82,51,165,197,120,180,2,9,33,4,110,5,12,10,260,351,386,141,7,457,475,93,901 }, +{ 1,23,13,15,51,0,21,2,515,5,141,180,120,165,700,202,197,4,753,12,33,9,7,82,115,93,3,352,260,6,110,48 }, +{ 15,515,700,753,0,1,2,5,3,4,8,13,180,341,10,23,7,6,9,51,77,197,961,115,165,82,120,31,22,202,457,217 }, +{ 559,661,922,564,141,533,10,317,373,110,143,269,244,260,332,261,93,642,752,295,351,876,531,843,180,206,728,384,352,1,434,120 }, +{ 15,515,1,0,700,2,13,23,753,5,51,180,3,165,12,6,197,115,4,9,7,21,719,8,457,82,141,120,33,22,901,10 }, +{ 198,234,0,325,5,77,202,416,20,147,32,43,639,315,49,61,450,455,142,21,113,230,22,318,725,342,207,13,95,904,494,10 }, +{ 447,264,363,9,411,676,682,1,156,664,821,478,166,354,812,39,100,905,382,897,98,18,759,404,31,101,724,5,265,223,88,13 }, +{ 24,14,69,16,35,18,2,7,108,189,71,67,267,149,1,46,68,83,38,140,0,236,251,9,388,60,133,103,65,28,29,50 }, +{ 16,7,35,20,14,18,109,2,43,120,107,60,1,121,326,907,553,77,13,147,23,82,68,260,0,403,5,24,202,126,265,199 }, +{ 
30,131,187,276,31,44,613,442,39,9,190,228,1,839,116,935,908,219,127,88,244,224,110,137,93,201,98,141,36,567,0,856 }, +{ 98,223,1,393,812,265,100,421,593,834,697,48,51,410,791,382,21,88,31,284,9,125,96,293,230,23,213,217,656,689,541,5 }, +{ 98,51,127,219,616,258,105,293,395,421,924,512,31,308,23,201,116,44,301,272,763,276,125,13,453,170,401,295,261,944,115,567 }, +{ 253,110,951,352,811,206,332,180,141,244,282,10,854,417,642,638,559,752,143,911,260,55,93,533,499,498,661,120,351,959,564,341 }, +{ 49,9,159,254,272,158,0,131,28,39,627,105,327,286,22,518,688,578,68,347,374,101,224,424,95,35,219,24,16,364,65,344 }, +{ 105,22,131,272,98,286,327,109,374,239,28,95,320,219,9,224,55,127,187,36,578,169,64,185,538,1,159,10,371,634,49,616 }, +{ 691,478,340,1,658,914,724,363,744,698,156,772,411,296,682,447,9,284,335,98,264,303,909,21,354,410,225,13,664,686,88,919 }, +{ 16,24,35,18,71,7,140,108,189,267,92,60,14,230,68,69,9,1,149,46,246,191,388,167,2,0,118,236,133,21,674,5 }, +{ 9,0,127,31,98,371,395,39,737,49,1,44,385,272,512,28,293,242,836,761,254,299,101,16,187,22,116,158,159,131,18,21 }, +{ 16,18,68,0,60,35,9,101,252,28,118,24,419,55,7,109,604,71,39,121,22,364,14,158,191,167,925,126,329,21,92,49 }, +{ 116,268,203,93,206,692,551,31,417,940,499,8,473,44,202,523,959,0,120,137,559,22,450,403,576,10,728,299,13,326,51,1 }, +{ 225,459,744,1,919,914,691,330,622,21,141,223,5,284,934,335,88,538,340,82,385,839,363,120,478,98,48,30,64,32,686,166 }, +{ 5,1,2,0,14,36,21,281,12,48,50,67,22,28,54,83,24,218,38,10,181,9,32,18,65,58,45,114,430,17,99,37 }, +{ 137,450,202,704,120,260,326,318,968,269,851,403,291,77,23,141,182,310,494,373,351,457,82,890,349,110,60,128,817,678,105,96 }, +{ 15,515,700,753,4,33,13,23,77,5,40,11,102,93,1,21,110,51,82,117,141,2,10,8,32,64,120,31,202,3,217,115 }, +{ 15,13,1,23,515,0,51,2,700,5,753,21,180,141,165,3,12,115,197,4,7,6,457,9,352,202,33,8,719,120,77,341 }, +{ 
219,127,258,98,276,201,131,395,944,293,116,284,567,31,242,105,137,935,295,44,403,860,51,224,576,456,9,371,578,475,202,512 }, +{ 16,7,18,35,60,0,14,20,118,28,68,22,2,24,1,92,158,107,5,49,154,126,109,12,43,10,55,6,677,71,21,168 }, +{ 93,728,531,160,559,373,574,120,295,860,533,269,717,260,926,902,258,318,36,201,261,434,851,137,617,141,187,352,843,384,332,251 }, +{ 28,0,1,9,22,109,83,39,49,12,36,67,55,5,96,2,128,30,158,69,21,23,160,208,35,13,65,323,50,141,194,238 }, +{ 23,13,51,1,5,15,141,21,0,2,165,515,202,700,12,197,180,120,32,115,4,753,64,9,7,269,6,3,125,386,48,453 }, +{ 5,21,13,49,14,20,7,23,43,32,1,0,652,48,713,22,38,2,16,132,955,107,12,279,24,888,197,640,70,303,18,638 }, +{ 9,0,28,39,1,30,35,101,22,67,83,141,49,175,36,68,55,88,13,251,10,69,23,158,180,115,64,100,217,65,345,166 }, +{ 260,120,82,269,5,450,351,1,202,141,854,13,77,922,32,33,137,4,23,125,291,21,15,515,165,349,177,700,318,326,180,753 }, +{ 121,16,18,35,363,101,60,20,107,14,68,259,621,55,604,43,7,252,9,364,126,0,167,191,5,407,132,28,199,419,146,10 }, +{ 13,23,51,1,15,0,2,141,5,515,12,21,700,115,165,180,4,753,3,197,202,32,9,120,7,8,6,11,37,10,457,269 }, +{ 201,144,206,443,418,203,435,96,335,459,187,1,88,332,330,321,269,934,30,372,822,521,268,326,44,523,382,141,410,264,494,473 }, +{ 31,44,276,201,116,131,284,662,567,144,9,489,98,295,268,434,0,30,137,39,93,1,187,22,219,918,110,299,141,36,224,384 }, +{ 520,478,664,1,264,604,9,167,777,759,411,0,806,724,48,21,101,68,647,936,363,223,118,682,410,18,100,16,252,98,265,13 }, +{ 23,70,21,87,60,75,120,182,163,379,92,18,7,937,71,121,446,132,24,98,931,126,107,77,795,195,115,44,411,146,51,850 }, +{ 39,9,0,737,127,31,846,98,1,827,105,310,371,30,254,100,44,18,395,242,272,101,385,916,836,16,265,131,938,93,166,557 }, +{ 279,20,43,126,107,7,92,16,356,362,60,595,246,359,598,35,0,683,939,653,121,97,125,441,399,392,150,199,48,230,14,649 }, +{ 206,417,93,959,499,728,8,559,120,473,137,141,10,564,31,260,44,450,203,341,253,244,373,116,143,638,268,180,352,110,318,940 }, +{ 
259,465,147,132,590,687,534,199,581,146,941,427,107,640,279,178,121,5,195,150,522,955,198,35,786,929,798,142,1,21,325,626 }, +{ 523,15,120,450,202,515,403,51,817,700,13,753,23,457,33,899,128,64,730,102,494,342,115,719,453,196,49,99,318,421,308,5 }, +{ 141,559,10,244,365,564,661,180,253,143,752,110,55,317,533,341,901,93,373,206,535,160,82,922,260,36,531,964,352,332,261,197 }, +{ 219,258,98,127,276,964,943,137,843,535,201,935,131,860,261,295,284,567,206,44,116,31,253,492,203,332,160,615,36,93,55,692 }, +{ 0,319,422,207,945,693,577,887,32,804,95,344,104,904,61,20,5,43,7,725,113,510,306,102,49,263,153,426,33,83,22,9 }, +{ 1,22,0,5,12,2,36,21,28,86,49,105,9,10,23,13,141,95,31,55,128,37,51,4,83,202,3,64,96,7,32,44 }, +{ 15,515,13,700,1,0,753,2,23,3,4,5,51,10,115,197,6,33,12,9,165,7,8,77,11,961,180,269,141,22,120,457 }, +{ 15,1,23,120,77,13,515,51,141,202,700,180,110,137,260,753,326,5,128,102,0,21,2,165,269,33,197,450,318,217,93,115 }, +{ 15,515,1,700,0,753,13,2,23,180,51,5,120,4,9,115,197,12,7,165,21,33,6,82,3,8,523,901,31,141,457,260 }, +{ 16,18,24,7,92,35,60,75,9,13,71,14,0,108,50,21,126,121,1,140,23,5,132,146,2,12,128,10,64,141,70,87 }, +{ 180,341,901,15,515,1,365,700,0,2,197,753,115,4,10,13,752,5,3,8,165,317,141,23,143,873,44,31,569,55,93,6 }, +{ 9,0,175,35,101,28,39,67,68,1,65,83,30,69,364,336,22,114,55,124,194,158,100,289,252,166,64,345,103,36,50,88 }, +{ 64,165,180,197,115,247,217,237,21,13,32,316,22,141,352,72,288,304,95,225,76,391,386,16,468,90,49,35,365,640,372,23 }, +{ 15,515,700,13,753,1,0,2,23,4,5,3,115,51,141,197,12,10,180,961,7,9,21,33,217,6,8,165,457,11,77,341 }, +{ 132,121,199,146,60,279,493,640,407,598,126,195,534,581,955,590,107,5,150,35,522,49,259,16,18,360,156,0,147,362,21,167 }, +{ 0,1,28,9,22,5,36,12,65,24,67,96,2,83,18,50,114,55,21,16,7,10,23,14,13,160,137,51,48,218,103,69 }, +{ 93,120,957,77,30,968,459,110,137,160,613,102,202,352,373,141,31,372,217,330,190,318,269,260,203,44,28,473,228,177,863,704 }, +{ 
15,515,700,0,753,1,13,23,2,51,5,9,120,82,4,7,901,197,10,8,260,180,341,12,33,6,3,523,165,102,115,141 }, +{ 206,417,8,141,499,44,244,93,31,10,137,253,559,116,728,144,120,564,269,638,203,352,143,260,341,752,268,717,951,180,160,110 }, +{ 530,254,228,1,96,21,406,39,827,31,669,840,613,829,137,679,166,98,23,51,960,438,131,93,48,224,219,317,310,36,876,190 }, +{ 15,515,700,753,13,457,0,197,719,1,165,82,23,8,120,730,2,10,12,180,134,5,9,141,260,4,351,51,115,3,341,899 }, +{ 0,16,68,9,24,28,18,35,252,109,39,419,124,158,154,55,101,71,22,118,60,7,49,65,333,14,1,10,329,364,677,346 }, +{ 1,15,13,23,515,51,120,0,700,180,2,165,5,753,141,197,21,33,202,102,260,4,9,12,7,326,137,450,115,6,82,110 }, +{ 535,253,352,564,110,365,82,180,341,10,854,533,55,898,244,901,873,141,752,143,642,559,498,317,36,951,115,964,638,282,661,197 }, +{ 31,44,125,338,116,64,242,36,1,10,55,22,456,237,180,13,299,164,506,86,23,165,558,143,0,762,492,479,844,546,93,8 }, +{ 13,23,4,1,202,2,0,51,115,77,141,180,5,15,217,3,33,11,515,317,9,10,102,21,700,341,365,318,269,64,32,128 }, +{ 9,39,0,166,68,101,28,364,30,158,562,35,175,65,333,154,49,404,706,124,21,252,274,168,190,289,100,570,16,1,310,346 }, +{ 15,515,700,753,341,13,0,23,1,33,141,4,260,82,77,51,351,180,9,5,115,137,10,217,11,120,102,40,349,269,202,854 } diff --git a/libkram/bc7enc/utils.cpp b/libkram/bc7enc/utils.cpp new file mode 100644 index 00000000..2b3b04d7 --- /dev/null +++ b/libkram/bc7enc/utils.cpp @@ -0,0 +1,908 @@ +// File: utils.cpp +#include "utils.h" + +// Don't need the impl yet +#if 0 + +#include "lodepng.h" +#include "miniz.h" + + +namespace utils +{ + +#define FLOOD_PUSH(y, xl, xr, dy) if (((y + (dy)) >= 0) && ((y + (dy)) < (int)m_height)) { stack.push_back(fill_segment(y, xl, xr, dy)); } + +// See http://www.realtimerendering.com/resources/GraphicsGems/gems/SeedFill.c +uint32_t image_u8::flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector* pSet_pixels) +{ + uint32_t total_set = 0; + + if (!flood_fill_is_inside(x, y, 
b)) + return 0; + + std::vector stack; + stack.reserve(64); + + FLOOD_PUSH(y, x, x, 1); + FLOOD_PUSH(y + 1, x, x, -1); + + while (stack.size()) + { + fill_segment s = stack.back(); + stack.pop_back(); + + int x1 = s.m_xl, x2 = s.m_xr, dy = s.m_dy; + y = s.m_y + s.m_dy; + + for (x = x1; (x >= 0) && flood_fill_is_inside(x, y, b); x--) + { + (*this)(x, y) = c; + total_set++; + if (pSet_pixels) + pSet_pixels->push_back(pixel_coord(x, y)); + } + + int l; + + if (x >= x1) + goto skip; + + l = x + 1; + if (l < x1) + FLOOD_PUSH(y, l, x1 - 1, -dy); + + x = x1 + 1; + + do + { + for (; x <= ((int)m_width - 1) && flood_fill_is_inside(x, y, b); x++) + { + (*this)(x, y) = c; + total_set++; + if (pSet_pixels) + pSet_pixels->push_back(pixel_coord(x, y)); + } + FLOOD_PUSH(y, l, x - 1, dy); + + if (x > (x2 + 1)) + FLOOD_PUSH(y, x2 + 1, x - 1, -dy); + + skip: + for (x++; x <= x2 && !flood_fill_is_inside(x, y, b); x++) + ; + + l = x; + } while (x <= x2); + } + + return total_set; +} + +void image_u8::draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color) +{ + if (xs > xe) + { + std::swap(xs, xe); + std::swap(ys, ye); + } + + int dx = xe - xs, dy = ye - ys; + if (!dx) + { + if (ys > ye) + std::swap(ys, ye); + for (int i = ys; i <= ye; i++) + set_pixel_clipped(xs, i, color); + } + else if (!dy) + { + for (int i = xs; i < xe; i++) + set_pixel_clipped(i, ys, color); + } + else if (dy > 0) + { + if (dy <= dx) + { + int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx); + rasterize_line(xs, ys, xe, ye, 0, 1, e, e_inc, e_no_inc, color); + } + else + { + int e = 2 * dx - dy, e_no_inc = 2 * dx, e_inc = 2 * (dx - dy); + rasterize_line(xs, ys, xe, ye, 1, 1, e, e_inc, e_no_inc, color); + } + } + else + { + dy = -dy; + if (dy <= dx) + { + int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx); + rasterize_line(xs, ys, xe, ye, 0, -1, e, e_inc, e_no_inc, color); + } + else + { + int e = 2 * dx - dy, e_no_inc = (2 * dx), e_inc = 2 * (dx - dy); + rasterize_line(xe, ye, 
xs, ys, 1, -1, e, e_inc, e_no_inc, color); + } + } +} + +void image_u8::rasterize_line(int xs, int ys, int xe, int ye, int pred, int inc_dec, int e, int e_inc, int e_no_inc, const color_quad_u8& color) +{ + int start, end, var; + + if (pred) + { + start = ys; + end = ye; + var = xs; + for (int i = start; i <= end; i++) + { + set_pixel_clipped(var, i, color); + if (e < 0) + e += e_no_inc; + else + { + var += inc_dec; + e += e_inc; + } + } + } + else + { + start = xs; + end = xe; + var = ys; + for (int i = start; i <= end; i++) + { + set_pixel_clipped(i, var, color); + if (e < 0) + e += e_no_inc; + else + { + var += inc_dec; + e += e_inc; + } + } + } +} + +bool load_png(const char* pFilename, image_u8& img) +{ + img.clear(); + + std::vector pixels; + unsigned int w = 0, h = 0; + unsigned int e = lodepng::decode(pixels, w, h, pFilename); + if (e != 0) + { + fprintf(stderr, "Failed loading PNG file %s\n", pFilename); + return false; + } + + img.init(w, h); + memcpy(&img.get_pixels()[0], &pixels[0], w * h * sizeof(uint32_t)); + + return true; +} + +bool save_png(const char* pFilename, const image_u8& img, bool save_alpha) +{ + const uint32_t w = img.width(); + const uint32_t h = img.height(); + + std::vector pixels; + if (save_alpha) + { + pixels.resize(w * h * sizeof(color_quad_u8)); + memcpy(&pixels[0], &img.get_pixels()[0], w * h * sizeof(color_quad_u8)); + } + else + { + pixels.resize(w * h * 3); + unsigned char* pDst = &pixels[0]; + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++, pDst += 3) + pDst[0] = img(x, y)[0], pDst[1] = img(x, y)[1], pDst[2] = img(x, y)[2]; + } + + return lodepng::encode(pFilename, pixels, w, h, save_alpha ? 
LCT_RGBA : LCT_RGB) == 0; +} + +static float gauss(int x, int y, float sigma_sqr) +{ + float pow = expf(-((x * x + y * y) / (2.0f * sigma_sqr))); + float g = (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * pow; + return g; +} + +// size_x/y should be odd +void compute_gaussian_kernel(float* pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags) +{ + assert(size_x & size_y & 1); + + if (!(size_x | size_y)) + return; + + int mid_x = size_x / 2; + int mid_y = size_y / 2; + + double sum = 0; + for (int x = 0; x < size_x; x++) + { + for (int y = 0; y < size_y; y++) + { + float g; + if ((x > mid_x) && (y < mid_y)) + g = pDst[(size_x - x - 1) + y * size_x]; + else if ((x < mid_x) && (y > mid_y)) + g = pDst[x + (size_y - y - 1) * size_x]; + else if ((x > mid_x) && (y > mid_y)) + g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x]; + else + g = gauss(x - mid_x, y - mid_y, sigma_sqr); + + pDst[x + y * size_x] = g; + sum += g; + } + } + + if (flags & cComputeGaussianFlagNormalizeCenterToOne) + { + sum = pDst[mid_x + mid_y * size_x]; + } + + if (flags & (cComputeGaussianFlagNormalizeCenterToOne | cComputeGaussianFlagNormalize)) + { + double one_over_sum = 1.0f / sum; + for (int i = 0; i < size_x * size_y; i++) + pDst[i] = static_cast(pDst[i] * one_over_sum); + + if (flags & cComputeGaussianFlagNormalizeCenterToOne) + pDst[mid_x + mid_y * size_x] = 1.0f; + } + + if (flags & cComputeGaussianFlagPrint) + { + printf("{\n"); + for (int y = 0; y < size_y; y++) + { + printf(" "); + for (int x = 0; x < size_x; x++) + { + printf("%f, ", pDst[x + y * size_x]); + } + printf("\n"); + } + printf("}"); + } +} + +void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor) +{ + assert(odd_filter_width && (odd_filter_width & 1)); + odd_filter_width |= 1; + + std::vector kernel(odd_filter_width * odd_filter_width); + compute_gaussian_kernel(&kernel[0], odd_filter_width, 
odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize); + + const int dst_width = orig_img.get_width() / width_divisor; + const int dst_height = orig_img.get_height() / height_divisor; + + const int H = odd_filter_width / 2; + const int L = -H; + + dst.crop(dst_width, dst_height); + +#pragma omp parallel for + for (int oy = 0; oy < dst_height; oy++) + { + for (int ox = 0; ox < dst_width; ox++) + { + vec4F c(0.0f); + + for (int yd = L; yd <= H; yd++) + { + int y = oy * height_divisor + (height_divisor >> 1) + yd; + + for (int xd = L; xd <= H; xd++) + { + int x = ox * width_divisor + (width_divisor >> 1) + xd; + + const vec4F& p = orig_img.get_clamped_or_wrapped(x, y, wrapping, wrapping); + + float w = kernel[(xd + H) + (yd + H) * odd_filter_width]; + c[0] += p[0] * w; + c[1] += p[1] * w; + c[2] += p[2] * w; + c[3] += p[3] * w; + } + } + + dst(ox, oy).set(c[0], c[1], c[2], c[3]); + } + } +} + +static void pow_image(const imagef& src, imagef& dst, const vec4F& power) +{ + dst.resize(src); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + + if ((power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f)) + dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]); + else + dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3])); + } + } +} + +#if 0 +static void mul_image(const imagef& src, imagef& dst, const vec4F& mul) +{ + dst.resize(src); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + dst(x, y).set(p[0] * mul[0], p[1] * mul[1], p[2] * mul[2], p[3] * mul[3]); + } + } +} +#endif + +static void scale_image(const imagef& src, imagef& dst, const vec4F& scale, const vec4F& shift) +{ + dst.resize(src); + +#pragma omp parallel for + for (int y = 0; y < 
(int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + + vec4F d; + + for (uint32_t c = 0; c < 4; c++) + d[c] = scale[c] * p[c] + shift[c]; + + dst(x, y).set(d[0], d[1], d[2], d[3]); + } + } +} + +static void add_weighted_image(const imagef& src1, const vec4F& alpha, const imagef& src2, const vec4F& beta, const vec4F& gamma, imagef& dst) +{ + dst.resize(src1); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + dst(x, y).set( + s1[0] * alpha[0] + s2[0] * beta[0] + gamma[0], + s1[1] * alpha[1] + s2[1] * beta[1] + gamma[1], + s1[2] * alpha[2] + s2[2] * beta[2] + gamma[2], + s1[3] * alpha[3] + s2[3] * beta[3] + gamma[3]); + } + } +} + +static void add_image(const imagef& src1, const imagef& src2, imagef& dst) +{ + dst.resize(src1); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + dst(x, y).set(s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2], s1[3] + s2[3]); + } + } +} + +static void adds_image(const imagef& src, const vec4F& value, imagef& dst) +{ + dst.resize(src); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + + dst(x, y).set(p[0] + value[0], p[1] + value[1], p[2] + value[2], p[3] + value[3]); + } + } +} + +static void mul_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale) +{ + dst.resize(src1); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + vec4F d; + + for (uint32_t c = 0; c < 4; c++) + { + float v1 = s1[c]; + 
float v2 = s2[c]; + d[c] = v1 * v2 * scale[c]; + } + + dst(x, y) = d; + } + } +} + +static void div_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale) +{ + dst.resize(src1); + +#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + vec4F d; + + for (uint32_t c = 0; c < 4; c++) + { + float v = s2[c]; + if (v == 0.0f) + d[c] = 0.0f; + else + d[c] = (s1[c] * scale[c]) / v; + } + + dst(x, y) = d; + } + } +} + +static vec4F avg_image(const imagef& src) +{ + vec4F avg(0.0f); + + for (uint32_t y = 0; y < src.get_height(); y++) + { + for (uint32_t x = 0; x < src.get_width(); x++) + { + const vec4F& s = src(x, y); + + avg += vec4F(s[0], s[1], s[2], s[3]); + } + } + + avg /= static_cast(src.get_total_pixels()); + + return avg; +} + +// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html +vec4F compute_ssim(const imagef& a, const imagef& b) +{ + imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3; + + const float C1 = 6.50250f, C2 = 58.52250f; + + pow_image(a, a_sq, vec4F(2)); + pow_image(b, b_sq, vec4F(2)); + mul_image(a, b, axb, vec4F(1.0f)); + + gaussian_filter(mu1, a, 11, 1.5f * 1.5f); + gaussian_filter(mu2, b, 11, 1.5f * 1.5f); + + pow_image(mu1, mu1_sq, vec4F(2)); + pow_image(mu2, mu2_sq, vec4F(2)); + mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f)); + + gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f); + add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq); + + gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f); + add_weighted_image(s2_sq, vec4F(1), mu2_sq, vec4F(-1), vec4F(0), s2_sq); + + gaussian_filter(s12, axb, 11, 1.5f * 1.5f); + add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12); + + scale_image(mu1_mu2, t1, vec4F(2), vec4F(0)); + adds_image(t1, vec4F(C1), t1); + + scale_image(s12, t2, vec4F(2), vec4F(0)); + 
adds_image(t2, vec4F(C2), t2); + + mul_image(t1, t2, t3, vec4F(1)); + + add_image(mu1_sq, mu2_sq, t1); + adds_image(t1, vec4F(C1), t1); + + add_image(s1_sq, s2_sq, t2); + adds_image(t2, vec4F(C2), t2); + + mul_image(t1, t2, t1, vec4F(1)); + + div_image(t3, t1, smap, vec4F(1)); + + return avg_image(smap); +} + +vec4F compute_ssim(const image_u8& a, const image_u8& b, bool luma) +{ + image_u8 ta(a), tb(b); + + if ((ta.width() != tb.width()) || (ta.height() != tb.height())) + { + fprintf(stderr, "compute_ssim: Cropping input images to equal dimensions\n"); + + const uint32_t w = std::min(a.width(), b.width()); + const uint32_t h = std::min(a.height(), b.height()); + ta.crop(w, h); + tb.crop(w, h); + } + + if (!ta.width() || !ta.height()) + { + assert(0); + return vec4F(0); + } + + if (luma) + { + for (uint32_t y = 0; y < ta.height(); y++) + { + for (uint32_t x = 0; x < ta.width(); x++) + { + ta(x, y).set((uint8_t)ta(x, y).get_luma(), ta(x, y).a); + tb(x, y).set((uint8_t)tb(x, y).get_luma(), tb(x, y).a); + } + } + } + + imagef fta, ftb; + + fta.set(ta); + ftb.set(tb); + + return compute_ssim(fta, ftb); +} + +bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header) +{ + (void)srgb; + + FILE* pFile = NULL; +#ifdef _MSC_VER + fopen_s(&pFile, pFilename, "wb"); +#else + pFile = fopen(pFilename, "wb"); +#endif + if (!pFile) + { + fprintf(stderr, "Failed creating file %s!\n", pFilename); + return false; + } + + fwrite("DDS ", 4, 1, pFile); + + DDSURFACEDESC2 desc; + memset(&desc, 0, sizeof(desc)); + + desc.dwSize = sizeof(desc); + desc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT | DDSD_CAPS; + + desc.dwWidth = width; + desc.dwHeight = height; + + desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE; + desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat); + + desc.ddpfPixelFormat.dwFlags |= DDPF_FOURCC; + + desc.lPitch = (((desc.dwWidth + 3) & ~3) * 
((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3; + desc.dwFlags |= DDSD_LINEARSIZE; + + desc.ddpfPixelFormat.dwRGBBitCount = 0; + + if ((!force_dx10_header) && + ((dxgi_format == DXGI_FORMAT_BC1_UNORM) || + (dxgi_format == DXGI_FORMAT_BC3_UNORM) || + (dxgi_format == DXGI_FORMAT_BC4_UNORM) || + (dxgi_format == DXGI_FORMAT_BC5_UNORM))) + { + if (dxgi_format == DXGI_FORMAT_BC1_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '1'); + else if (dxgi_format == DXGI_FORMAT_BC3_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '5'); + else if (dxgi_format == DXGI_FORMAT_BC4_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '1'); + else if (dxgi_format == DXGI_FORMAT_BC5_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '2'); + + fwrite(&desc, sizeof(desc), 1, pFile); + } + else + { + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0'); + + fwrite(&desc, sizeof(desc), 1, pFile); + + DDS_HEADER_DXT10 hdr10; + memset(&hdr10, 0, sizeof(hdr10)); + + // Not all tools support DXGI_FORMAT_BC7_UNORM_SRGB (like NVTT), but ddsview in DirectXTex pays attention to it. So not sure what to do here. + // For best compatibility just write DXGI_FORMAT_BC7_UNORM. + //hdr10.dxgiFormat = srgb ? 
DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM; + hdr10.dxgiFormat = dxgi_format; // DXGI_FORMAT_BC7_UNORM; + hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D; + hdr10.arraySize = 1; + + fwrite(&hdr10, sizeof(hdr10), 1, pFile); + } + + fwrite(pBlocks, desc.lPitch, 1, pFile); + + if (fclose(pFile) == EOF) + { + fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename); + return false; + } + + return true; +} + +void strip_extension(std::string& s) +{ + for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--) + { + if (s[i] == '.') + { + s.resize(i); + break; + } + } +} + +void strip_path(std::string& s) +{ + for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--) + { + if ((s[i] == '/') || (s[i] == ':') || (s[i] == '\\')) + { + s.erase(0, i + 1); + break; + } + } +} + +uint32_t hash_hsieh(const uint8_t* pBuf, size_t len) +{ + if (!pBuf || !len) + return 0; + + uint32_t h = static_cast(len); + + const uint32_t bytes_left = len & 3; + len >>= 2; + + while (len--) + { + const uint16_t* pWords = reinterpret_cast(pBuf); + + h += pWords[0]; + + const uint32_t t = (pWords[1] << 11) ^ h; + h = (h << 16) ^ t; + + pBuf += sizeof(uint32_t); + + h += h >> 11; + } + + switch (bytes_left) + { + case 1: + h += *reinterpret_cast(pBuf); + h ^= h << 10; + h += h >> 1; + break; + case 2: + h += *reinterpret_cast(pBuf); + h ^= h << 11; + h += h >> 17; + break; + case 3: + h += *reinterpret_cast(pBuf); + h ^= h << 16; + h ^= (static_cast(pBuf[sizeof(uint16_t)])) << 18; + h += h >> 11; + break; + default: + break; + } + + h ^= h << 3; + h += h >> 5; + h ^= h << 4; + h += h >> 17; + h ^= h << 25; + h += h >> 6; + + return h; +} + +float compute_block_max_std_dev(const color_quad_u8* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps) +{ + tracked_stat comp_stats[4]; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_quad_u8* pPixel = pPixels + x + y * block_width; + + for (uint32_t c = 0; c 
< num_comps; c++) + comp_stats[c].update(pPixel->m_c[c]); + } + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < num_comps; i++) + max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev()); + return max_std_dev; +} + +const uint32_t ASTC_SIG = 0x5CA1AB13; + +#pragma pack(push, 1) +struct astc_header +{ + uint32_t m_sig; + uint8_t m_block_x; + uint8_t m_block_y; + uint8_t m_block_z; + uint8_t m_width[3]; + uint8_t m_height[3]; + uint8_t m_depth[3]; +}; +#pragma pack(pop) + +bool save_astc_file(const char* pFilename, block16_vec& blocks, uint32_t width, uint32_t height, uint32_t block_width, uint32_t block_height) +{ + FILE* pFile = nullptr; + +#ifdef _MSC_VER + fopen_s(&pFile, pFilename, "wb"); +#else + pFile = fopen(pFilename, "wb"); +#endif + + if (!pFile) + return false; + + astc_header hdr; + memset(&hdr, 0, sizeof(hdr)); + + hdr.m_sig = ASTC_SIG; + hdr.m_block_x = (uint8_t)block_width; + hdr.m_block_y = (uint8_t)block_height; + hdr.m_block_z = 1; + hdr.m_width[0] = (uint8_t)(width); + hdr.m_width[1] = (uint8_t)(width >> 8); + hdr.m_width[2] = (uint8_t)(width >> 16); + hdr.m_height[0] = (uint8_t)(height); + hdr.m_height[1] = (uint8_t)(height >> 8); + hdr.m_height[2] = (uint8_t)(height >> 16); + hdr.m_depth[0] = 1; + fwrite(&hdr, sizeof(hdr), 1, pFile); + + fwrite(blocks.data(), 16, blocks.size(), pFile); + if (fclose(pFile) == EOF) + return false; + + return true; +} + +bool load_astc_file(const char* pFilename, block16_vec& blocks, uint32_t& width, uint32_t& height, uint32_t& block_width, uint32_t& block_height) +{ + FILE* pFile = nullptr; + +#ifdef _MSC_VER + fopen_s(&pFile, pFilename, "rb"); +#else + pFile = fopen(pFilename, "rb"); +#endif + + if (!pFile) + return false; + + astc_header hdr; + if (fread(&hdr, sizeof(hdr), 1, pFile) != 1) + { + fclose(pFile); + return false; + } + + if (hdr.m_sig != ASTC_SIG) + { + fclose(pFile); + return false; + } + + width = hdr.m_width[0] + (hdr.m_width[1] << 8) + (hdr.m_width[2] << 16); + height = 
hdr.m_height[0] + (hdr.m_height[1] << 8) + (hdr.m_height[2] << 16); + uint32_t depth = hdr.m_depth[0] + (hdr.m_depth[1] << 8) + (hdr.m_depth[2] << 16); + + if ((width < 1) || (width > 32768) || (height < 1) || (height > 32768)) + return false; + if ((hdr.m_block_z != 1) || (depth != 1)) + return false; + + block_width = hdr.m_block_x; + block_height = hdr.m_block_y; + + if ((block_width < 4) || (block_width > 12) || (block_height < 4) || (block_height > 12)) + return false; + + uint32_t blocks_x = (width + block_width - 1) / block_width; + uint32_t blocks_y = (height + block_height - 1) / block_height; + uint32_t total_blocks = blocks_x * blocks_y; + + blocks.resize(total_blocks); + + if (fread(blocks.data(), 16, total_blocks, pFile) != total_blocks) + { + fclose(pFile); + return false; + } + + fclose(pFile); + return true; +} + +uint32_t get_deflate_size(const void* pData, size_t data_size) +{ + size_t comp_size = 0; + void* pPre_RDO_Comp_data = tdefl_compress_mem_to_heap(pData, data_size, &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES); + mz_free(pPre_RDO_Comp_data); + + if (comp_size > UINT32_MAX) + return UINT32_MAX; + + return (uint32_t)comp_size; +} + +} // namespace utils + +#endif diff --git a/libkram/bc7enc/utils.h b/libkram/bc7enc/utils.h new file mode 100644 index 00000000..841710c4 --- /dev/null +++ b/libkram/bc7enc/utils.h @@ -0,0 +1,2617 @@ +// File: utils.h +#pragma once +#ifdef _MSC_VER +#pragma warning (push) +#pragma warning (disable:4127) // conditional expression is constant +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include "dds_defs.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define ASSUME(c) static_assert(c, #c) +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) + +#define VECTOR_TEXT_LINE_SIZE (30.0f) +#define VECTOR_TEXT_CORE_LINE_SIZE (21.0f) + +#define UNUSED(x) (void)x + +namespace utils +{ +extern const 
uint32_t g_pretty_colors[]; +extern const uint32_t g_num_pretty_colors; + +const float cDegToRad = 0.01745329252f; +const float cRadToDeg = 57.29577951f; + +enum eClear { cClear }; +enum eZero { cZero }; +enum eInitExpand { cInitExpand }; + +inline int iabs(int i) { if (i < 0) i = -i; return i; } +inline uint8_t clamp255(int32_t i) { return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); } +template inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); } +template inline F lerp(F a, F b, F s) { return a + (b - a) * s; } +template inline F square(F a) { return a * a; } + +template +inline T prev_wrap(T i, T n) +{ + T temp = i - 1; + if (temp < 0) + temp = n - 1; + return temp; +} + +template +inline T next_wrap(T i, T n) +{ + T temp = i + 1; + if (temp >= n) + temp = 0; + return temp; +} + +inline int posmod(int x, int y) +{ + if (x >= 0) + return (x < y) ? x : (x % y); + int m = (-x) % y; + return (m != 0) ? (y - m) : m; +} + +inline float deg_to_rad(float f) +{ + return f * cDegToRad; +}; + +inline float rad_to_deg(float f) +{ + return f * cRadToDeg; +}; + +template +struct rel_ops +{ + friend bool operator!=(const T& x, const T& y) + { + return (!(x == y)); + } + friend bool operator>(const T& x, const T& y) + { + return (y < x); + } + friend bool operator<=(const T& x, const T& y) + { + return (!(y < x)); + } + friend bool operator>=(const T& x, const T& y) + { + return (!(x < y)); + } +}; + +template +class vec : public rel_ops > +{ +public: + typedef T scalar_type; + enum + { + num_elements = N + }; + + inline vec() + { + } + + inline vec(eClear) + { + clear(); + } + + inline vec(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = other.m_s[i]; + } + + template + inline vec(const vec& other) + { + set(other); + } + + template + inline vec(const vec& other, T w) + { + *this = other; + m_s[N - 1] = w; + } + + explicit inline vec(T val) + { + set(val); + } + + inline vec(T val0, T val1) + { + 
set(val0, val1); + } + + inline vec(T val0, T val1, T val2) + { + set(val0, val1, val2); + } + + inline vec(T val0, T val1, T val2, T val3) + { + set(val0, val1, val2, val3); + } + + inline vec(T val0, T val1, T val2, T val3, T val4, T val5) + { + set(val0, val1, val2, val3, val4, val5); + } + + inline vec( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15) + { + set(val0, val1, val2, val3, + val4, val5, val6, val7, + val8, val9, val10, val11, + val12, val13, val14, val15); + } + + inline vec( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19) + { + set(val0, val1, val2, val3, + val4, val5, val6, val7, + val8, val9, val10, val11, + val12, val13, val14, val15, + val16, val17, val18, val19); + } + + inline vec( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19, + T val20, T val21, T val22, T val23, + T val24) + { + set(val0, val1, val2, val3, + val4, val5, val6, val7, + val8, val9, val10, val11, + val12, val13, val14, val15, + val16, val17, val18, val19, + val20, val21, val22, val23, + val24); + } + + inline void clear() + { + if (N > 4) + memset(m_s, 0, sizeof(m_s)); + else + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = 0; + } + } + + template + inline vec& set(const vec& other) + { + if ((void*)this == (void*)&other) + return *this; + const uint32_t m = std::min(N, ON); + uint32_t i; + for (i = 0; i < m; i++) + m_s[i] = static_cast(other[i]); + for (; i < N; i++) + m_s[i] = 0; + return *this; + } + + inline vec& set_component(uint32_t index, T val) + { + assert(index < N); + m_s[index] = val; + return *this; + } + + inline vec& set(T val) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = val; + return *this; + } + + 
inline vec& set(T val0, T val1) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + for (uint32_t i = 2; i < N; i++) + m_s[i] = 0; + } + return *this; + } + + inline vec& set(T val0, T val1, T val2) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + if (N >= 3) + { + m_s[2] = val2; + + for (uint32_t i = 3; i < N; i++) + m_s[i] = 0; + } + } + return *this; + } + + inline vec& set(T val0, T val1, T val2, T val3) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + if (N >= 3) + { + m_s[2] = val2; + + if (N >= 4) + { + m_s[3] = val3; + + for (uint32_t i = 4; i < N; i++) + m_s[i] = 0; + } + } + } + return *this; + } + + inline vec& set(T val0, T val1, T val2, T val3, T val4, T val5) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + if (N >= 3) + { + m_s[2] = val2; + + if (N >= 4) + { + m_s[3] = val3; + + if (N >= 5) + { + m_s[4] = val4; + + if (N >= 6) + { + m_s[5] = val5; + + for (uint32_t i = 6; i < N; i++) + m_s[i] = 0; + } + } + } + } + } + return *this; + } + + inline vec& set( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15) + { + m_s[0] = val0; + if (N >= 2) + m_s[1] = val1; + if (N >= 3) + m_s[2] = val2; + if (N >= 4) + m_s[3] = val3; + + if (N >= 5) + m_s[4] = val4; + if (N >= 6) + m_s[5] = val5; + if (N >= 7) + m_s[6] = val6; + if (N >= 8) + m_s[7] = val7; + + if (N >= 9) + m_s[8] = val8; + if (N >= 10) + m_s[9] = val9; + if (N >= 11) + m_s[10] = val10; + if (N >= 12) + m_s[11] = val11; + + if (N >= 13) + m_s[12] = val12; + if (N >= 14) + m_s[13] = val13; + if (N >= 15) + m_s[14] = val14; + if (N >= 16) + m_s[15] = val15; + + for (uint32_t i = 16; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline vec& set( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19) + { + m_s[0] = val0; + if (N >= 2) + m_s[1] 
= val1; + if (N >= 3) + m_s[2] = val2; + if (N >= 4) + m_s[3] = val3; + + if (N >= 5) + m_s[4] = val4; + if (N >= 6) + m_s[5] = val5; + if (N >= 7) + m_s[6] = val6; + if (N >= 8) + m_s[7] = val7; + + if (N >= 9) + m_s[8] = val8; + if (N >= 10) + m_s[9] = val9; + if (N >= 11) + m_s[10] = val10; + if (N >= 12) + m_s[11] = val11; + + if (N >= 13) + m_s[12] = val12; + if (N >= 14) + m_s[13] = val13; + if (N >= 15) + m_s[14] = val14; + if (N >= 16) + m_s[15] = val15; + + if (N >= 17) + m_s[16] = val16; + if (N >= 18) + m_s[17] = val17; + if (N >= 19) + m_s[18] = val18; + if (N >= 20) + m_s[19] = val19; + + for (uint32_t i = 20; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline vec& set( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19, + T val20, T val21, T val22, T val23, + T val24) + { + m_s[0] = val0; + if (N >= 2) + m_s[1] = val1; + if (N >= 3) + m_s[2] = val2; + if (N >= 4) + m_s[3] = val3; + + if (N >= 5) + m_s[4] = val4; + if (N >= 6) + m_s[5] = val5; + if (N >= 7) + m_s[6] = val6; + if (N >= 8) + m_s[7] = val7; + + if (N >= 9) + m_s[8] = val8; + if (N >= 10) + m_s[9] = val9; + if (N >= 11) + m_s[10] = val10; + if (N >= 12) + m_s[11] = val11; + + if (N >= 13) + m_s[12] = val12; + if (N >= 14) + m_s[13] = val13; + if (N >= 15) + m_s[14] = val14; + if (N >= 16) + m_s[15] = val15; + + if (N >= 17) + m_s[16] = val16; + if (N >= 18) + m_s[17] = val17; + if (N >= 19) + m_s[18] = val18; + if (N >= 20) + m_s[19] = val19; + + if (N >= 21) + m_s[20] = val20; + if (N >= 22) + m_s[21] = val21; + if (N >= 23) + m_s[22] = val22; + if (N >= 24) + m_s[23] = val23; + + if (N >= 25) + m_s[24] = val24; + + for (uint32_t i = 25; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline vec& set(const T* pValues) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = pValues[i]; + return *this; + } + + template + inline vec& swizzle_set(const vec& 
other, uint32_t i) + { + return set(static_cast(other[i])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j) + { + return set(static_cast(other[i]), static_cast(other[j])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j, uint32_t k) + { + return set(static_cast(other[i]), static_cast(other[j]), static_cast(other[k])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j, uint32_t k, uint32_t l) + { + return set(static_cast(other[i]), static_cast(other[j]), static_cast(other[k]), static_cast(other[l])); + } + + inline vec& operator=(const vec& rhs) + { + if (this != &rhs) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = rhs.m_s[i]; + } + return *this; + } + + template + inline vec& operator=(const vec& other) + { + if ((void*)this == (void*)&other) + return *this; + + uint32_t s = std::min(N, O); + + uint32_t i; + for (i = 0; i < s; i++) + m_s[i] = static_cast(other[i]); + + for (; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline bool operator==(const vec& rhs) const + { + for (uint32_t i = 0; i < N; i++) + if (!(m_s[i] == rhs.m_s[i])) + return false; + return true; + } + + inline bool operator<(const vec& rhs) const + { + for (uint32_t i = 0; i < N; i++) + { + if (m_s[i] < rhs.m_s[i]) + return true; + else if (!(m_s[i] == rhs.m_s[i])) + return false; + } + + return false; + } + + inline T operator[](uint32_t i) const + { + assert(i < N); + return m_s[i]; + } + + inline T& operator[](uint32_t i) + { + assert(i < N); + return m_s[i]; + } + + template + inline uint64_t get_component_as_uint() const + { + ASSUME(index < N); + if (sizeof(T) == sizeof(float)) + return *reinterpret_cast(&m_s[index]); + else + return *reinterpret_cast(&m_s[index]); + } + + inline T get_x(void) const + { + return m_s[0]; + } + inline T get_y(void) const + { + ASSUME(N >= 2); + return m_s[1]; + } + inline T get_z(void) const + { + ASSUME(N >= 3); + return m_s[2]; + } + inline T 
get_w(void) const + { + ASSUME(N >= 4); + return m_s[3]; + } + + inline vec get_x_vector() const + { + return broadcast<0>(); + } + inline vec get_y_vector() const + { + return broadcast<1>(); + } + inline vec get_z_vector() const + { + return broadcast<2>(); + } + inline vec get_w_vector() const + { + return broadcast<3>(); + } + + inline T get_component(uint32_t i) const + { + return (*this)[i]; + } + + inline vec& set_x(T v) + { + m_s[0] = v; + return *this; + } + inline vec& set_y(T v) + { + ASSUME(N >= 2); + m_s[1] = v; + return *this; + } + inline vec& set_z(T v) + { + ASSUME(N >= 3); + m_s[2] = v; + return *this; + } + inline vec& set_w(T v) + { + ASSUME(N >= 4); + m_s[3] = v; + return *this; + } + + inline const T* get_ptr() const + { + return reinterpret_cast(&m_s[0]); + } + inline T* get_ptr() + { + return reinterpret_cast(&m_s[0]); + } + + inline vec as_point() const + { + vec result(*this); + result[N - 1] = 1; + return result; + } + + inline vec as_dir() const + { + vec result(*this); + result[N - 1] = 0; + return result; + } + + inline vec<2, T> select2(uint32_t i, uint32_t j) const + { + assert((i < N) && (j < N)); + return vec<2, T>(m_s[i], m_s[j]); + } + + inline vec<3, T> select3(uint32_t i, uint32_t j, uint32_t k) const + { + assert((i < N) && (j < N) && (k < N)); + return vec<3, T>(m_s[i], m_s[j], m_s[k]); + } + + inline vec<4, T> select4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const + { + assert((i < N) && (j < N) && (k < N) && (l < N)); + return vec<4, T>(m_s[i], m_s[j], m_s[k], m_s[l]); + } + + inline bool is_dir() const + { + return m_s[N - 1] == 0; + } + inline bool is_vector() const + { + return is_dir(); + } + inline bool is_point() const + { + return m_s[N - 1] == 1; + } + + inline vec project() const + { + vec result(*this); + if (result[N - 1]) + result /= result[N - 1]; + return result; + } + + inline vec broadcast(unsigned i) const + { + return vec((*this)[i]); + } + + template + inline vec broadcast() const + { + return 
vec((*this)[i]); + } + + inline vec swizzle(uint32_t i, uint32_t j) const + { + return vec((*this)[i], (*this)[j]); + } + + inline vec swizzle(uint32_t i, uint32_t j, uint32_t k) const + { + return vec((*this)[i], (*this)[j], (*this)[k]); + } + + inline vec swizzle(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const + { + return vec((*this)[i], (*this)[j], (*this)[k], (*this)[l]); + } + + inline vec operator-() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = -m_s[i]; + return result; + } + + inline vec operator+() const + { + return *this; + } + + inline vec& operator+=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] += other.m_s[i]; + return *this; + } + + inline vec& operator-=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] -= other.m_s[i]; + return *this; + } + + inline vec& operator*=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] *= other.m_s[i]; + return *this; + } + + inline vec& operator/=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] /= other.m_s[i]; + return *this; + } + + inline vec& operator*=(T s) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] *= s; + return *this; + } + + inline vec& operator/=(T s) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] /= s; + return *this; + } + + // component-wise multiply (not a dot product like in previous versions) + // just remarking it out because it's too ambiguous, use dot() or mul_components() instead +#if 0 + friend inline vec operator*(const vec& lhs, const vec& rhs) + { + return vec::mul_components(lhs, rhs); + } +#endif + + friend inline vec operator*(const vec& lhs, T val) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] * val; + return result; + } + + friend inline vec operator*(T val, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = val * rhs.m_s[i]; + return result; + } + + friend inline vec operator/(const vec& lhs, const vec& 
rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] / rhs.m_s[i]; + return result; + } + + friend inline vec operator/(const vec& lhs, T val) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] / val; + return result; + } + + friend inline vec operator+(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] + rhs.m_s[i]; + return result; + } + + friend inline vec operator-(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] - rhs.m_s[i]; + return result; + } + + static inline vec<3, T> cross2(const vec& a, const vec& b) + { + ASSUME(N >= 2); + return vec<3, T>(0, 0, a[0] * b[1] - a[1] * b[0]); + } + + inline vec<3, T> cross2(const vec& b) const + { + return cross2(*this, b); + } + + static inline vec<3, T> cross3(const vec& a, const vec& b) + { + ASSUME(N >= 3); + return vec<3, T>(a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]); + } + + inline vec<3, T> cross3(const vec& b) const + { + return cross3(*this, b); + } + + static inline vec<3, T> cross(const vec& a, const vec& b) + { + ASSUME(N >= 2); + + if (N == 2) + return cross2(a, b); + else + return cross3(a, b); + } + + inline vec<3, T> cross(const vec& b) const + { + ASSUME(N >= 2); + return cross(*this, b); + } + + inline T dot(const vec& rhs) const + { + return dot(*this, rhs); + } + + inline vec dot_vector(const vec& rhs) const + { + return vec(dot(*this, rhs)); + } + + static inline T dot(const vec& lhs, const vec& rhs) + { + T result = lhs.m_s[0] * rhs.m_s[0]; + for (uint32_t i = 1; i < N; i++) + result += lhs.m_s[i] * rhs.m_s[i]; + return result; + } + + inline T dot2(const vec& rhs) const + { + ASSUME(N >= 2); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1]; + } + + inline T dot3(const vec& rhs) const + { + ASSUME(N >= 3); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * 
rhs.m_s[2]; + } + + inline T dot4(const vec& rhs) const + { + ASSUME(N >= 4); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2] + m_s[3] * rhs.m_s[3]; + } + + inline T norm(void) const + { + T sum = m_s[0] * m_s[0]; + for (uint32_t i = 1; i < N; i++) + sum += m_s[i] * m_s[i]; + return sum; + } + + inline T length(void) const + { + return sqrt(norm()); + } + + inline T squared_distance(const vec& rhs) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + } + return dist2; + } + + inline T squared_distance(const vec& rhs, T early_out) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + if (dist2 > early_out) + break; + } + return dist2; + } + + inline T distance(const vec& rhs) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + } + return sqrt(dist2); + } + + inline vec inverse() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = m_s[i] ? 
(1.0f / m_s[i]) : 0; + return result; + } + + // returns squared length (norm) + inline double normalize(const vec* pDefaultVec = NULL) + { + double n = m_s[0] * m_s[0]; + for (uint32_t i = 1; i < N; i++) + n += m_s[i] * m_s[i]; + + if (n != 0) + *this *= static_cast(1.0f / sqrt(n)); + else if (pDefaultVec) + *this = *pDefaultVec; + return n; + } + + inline double normalize3(const vec* pDefaultVec = NULL) + { + ASSUME(N >= 3); + + double n = m_s[0] * m_s[0] + m_s[1] * m_s[1] + m_s[2] * m_s[2]; + + if (n != 0) + *this *= static_cast((1.0f / sqrt(n))); + else if (pDefaultVec) + *this = *pDefaultVec; + return n; + } + + inline vec& normalize_in_place(const vec* pDefaultVec = NULL) + { + normalize(pDefaultVec); + return *this; + } + + inline vec& normalize3_in_place(const vec* pDefaultVec = NULL) + { + normalize3(pDefaultVec); + return *this; + } + + inline vec get_normalized(const vec* pDefaultVec = NULL) const + { + vec result(*this); + result.normalize(pDefaultVec); + return result; + } + + inline vec get_normalized3(const vec* pDefaultVec = NULL) const + { + vec result(*this); + result.normalize3(pDefaultVec); + return result; + } + + inline vec& clamp(T l, T h) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = static_cast(clamp(m_s[i], l, h)); + return *this; + } + + inline vec& saturate() + { + return clamp(0.0f, 1.0f); + } + + inline vec& clamp(const vec& l, const vec& h) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = static_cast(clamp(m_s[i], l[i], h[i])); + return *this; + } + + inline bool is_within_bounds(const vec& l, const vec& h) const + { + for (uint32_t i = 0; i < N; i++) + if ((m_s[i] < l[i]) || (m_s[i] > h[i])) + return false; + + return true; + } + + inline bool is_within_bounds(T l, T h) const + { + for (uint32_t i = 0; i < N; i++) + if ((m_s[i] < l) || (m_s[i] > h)) + return false; + + return true; + } + + inline uint32_t get_major_axis(void) const + { + T m = fabs(m_s[0]); + uint32_t r = 0; + for (uint32_t i = 1; i < N; i++) + { + const T c = 
fabs(m_s[i]); + if (c > m) + { + m = c; + r = i; + } + } + return r; + } + + inline uint32_t get_minor_axis(void) const + { + T m = fabs(m_s[0]); + uint32_t r = 0; + for (uint32_t i = 1; i < N; i++) + { + const T c = fabs(m_s[i]); + if (c < m) + { + m = c; + r = i; + } + } + return r; + } + + inline void get_projection_axes(uint32_t& u, uint32_t& v) const + { + const int axis = get_major_axis(); + if (m_s[axis] < 0.0f) + { + v = next_wrap(axis, N); + u = next_wrap(v, N); + } + else + { + u = next_wrap(axis, N); + v = next_wrap(u, N); + } + } + + inline T get_absolute_minimum(void) const + { + T result = fabs(m_s[0]); + for (uint32_t i = 1; i < N; i++) + result = std::min(result, fabs(m_s[i])); + return result; + } + + inline T get_absolute_maximum(void) const + { + T result = fabs(m_s[0]); + for (uint32_t i = 1; i < N; i++) + result = std::max(result, fabs(m_s[i])); + return result; + } + + inline T get_minimum(void) const + { + T result = m_s[0]; + for (uint32_t i = 1; i < N; i++) + result = std::min(result, m_s[i]); + return result; + } + + inline T get_maximum(void) const + { + T result = m_s[0]; + for (uint32_t i = 1; i < N; i++) + result = std::max(result, m_s[i]); + return result; + } + + inline vec& remove_unit_direction(const vec& dir) + { + *this -= (dot(dir) * dir); + return *this; + } + + inline vec get_remove_unit_direction(const vec& dir) const + { + return *this - (dot(dir) * dir); + } + + inline bool all_less(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] >= b.m_s[i]) + return false; + return true; + } + + inline bool all_less_equal(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] > b.m_s[i]) + return false; + return true; + } + + inline bool all_greater(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] <= b.m_s[i]) + return false; + return true; + } + + inline bool all_greater_equal(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] < b.m_s[i]) + return false; 
+ return true; + } + + inline vec negate_xyz() const + { + vec ret; + + ret[0] = -m_s[0]; + if (N >= 2) + ret[1] = -m_s[1]; + if (N >= 3) + ret[2] = -m_s[2]; + + for (uint32_t i = 3; i < N; i++) + ret[i] = m_s[i]; + + return ret; + } + + inline vec& invert() + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] != 0.0f) + m_s[i] = 1.0f / m_s[i]; + return *this; + } + + inline scalar_type perp_dot(const vec& b) const + { + ASSUME(N == 2); + return m_s[0] * b.m_s[1] - m_s[1] * b.m_s[0]; + } + + inline vec perp() const + { + ASSUME(N == 2); + return vec(-m_s[1], m_s[0]); + } + + inline vec get_floor() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = floor(m_s[i]); + return result; + } + + inline vec get_ceil() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = ceil(m_s[i]); + return result; + } + + // static helper methods + + static inline vec mul_components(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = lhs.m_s[i] * rhs.m_s[i]; + return result; + } + + static inline vec mul_add_components(const vec& a, const vec& b, const vec& c) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = a.m_s[i] * b.m_s[i] + c.m_s[i]; + return result; + } + + static inline vec make_axis(uint32_t i) + { + vec result; + result.clear(); + result[i] = 1; + return result; + } + + static inline vec equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] == b[i]); + return ret; + } + + static inline vec not_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] != b[i]); + return ret; + } + + static inline vec less_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] < b[i]); + return ret; + } + + static inline vec less_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] <= b[i]); + 
return ret; + } + + static inline vec greater_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] >= b[i]); + return ret; + } + + static inline vec greater_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] > b[i]); + return ret; + } + + static inline vec component_max(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = std::max(a.m_s[i], b.m_s[i]); + return ret; + } + + static inline vec component_min(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = std::min(a.m_s[i], b.m_s[i]); + return ret; + } + + static inline vec lerp(const vec& a, const vec& b, float t) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = a.m_s[i] + (b.m_s[i] - a.m_s[i]) * t; + return ret; + } + + static inline bool equal_tol(const vec& a, const vec& b, float t) + { + for (uint32_t i = 0; i < N; i++) + if (!equal_tol(a.m_s[i], b.m_s[i], t)) + return false; + return true; + } + + inline bool equal_tol(const vec& b, float t) const + { + return equal_tol(*this, b, t); + } + +protected: + T m_s[N]; +}; + +typedef vec<1, double> vec1D; +typedef vec<2, double> vec2D; +typedef vec<3, double> vec3D; +typedef vec<4, double> vec4D; + +typedef vec<1, float> vec1F; + +typedef vec<2, float> vec2F; +typedef std::vector vec2F_array; + +typedef vec<3, float> vec3F; +typedef std::vector vec3F_array; + +typedef vec<4, float> vec4F; +typedef std::vector vec4F_array; + +typedef vec<2, uint32_t> vec2U; +typedef vec<3, uint32_t> vec3U; +typedef vec<2, int> vec2I; +typedef vec<3, int> vec3I; +typedef vec<4, int> vec4I; + +typedef vec<2, int16_t> vec2I16; +typedef vec<3, int16_t> vec3I16; + +inline vec2F rotate_point(const vec2F& p, float rad) +{ + float c = cos(rad); + float s = sin(rad); + + float x = p[0]; + float y = p[1]; + + return vec2F(x * c - y * s, x * s + y * c); +} + +class rect +{ +public: + inline rect() 
+ { + } + + inline rect(eClear) + { + clear(); + } + + inline rect(eInitExpand) + { + init_expand(); + } + + // up to, but not including right/bottom + inline rect(int left, int top, int right, int bottom) + { + set(left, top, right, bottom); + } + + inline rect(const vec2I& lo, const vec2I& hi) + { + m_corner[0] = lo; + m_corner[1] = hi; + } + + inline rect(const vec2I& point) + { + m_corner[0] = point; + m_corner[1].set(point[0] + 1, point[1] + 1); + } + + inline bool operator==(const rect& r) const + { + return (m_corner[0] == r.m_corner[0]) && (m_corner[1] == r.m_corner[1]); + } + + inline bool operator<(const rect& r) const + { + for (uint32_t i = 0; i < 2; i++) + { + if (m_corner[i] < r.m_corner[i]) + return true; + else if (!(m_corner[i] == r.m_corner[i])) + return false; + } + + return false; + } + + inline void clear() + { + m_corner[0].clear(); + m_corner[1].clear(); + } + + inline void set(int left, int top, int right, int bottom) + { + m_corner[0].set(left, top); + m_corner[1].set(right, bottom); + } + + inline void set(const vec2I& lo, const vec2I& hi) + { + m_corner[0] = lo; + m_corner[1] = hi; + } + + inline void set(const vec2I& point) + { + m_corner[0] = point; + m_corner[1].set(point[0] + 1, point[1] + 1); + } + + inline uint32_t get_width() const + { + return m_corner[1][0] - m_corner[0][0]; + } + inline uint32_t get_height() const + { + return m_corner[1][1] - m_corner[0][1]; + } + + inline int get_left() const + { + return m_corner[0][0]; + } + inline int get_top() const + { + return m_corner[0][1]; + } + inline int get_right() const + { + return m_corner[1][0]; + } + inline int get_bottom() const + { + return m_corner[1][1]; + } + + inline bool is_empty() const + { + return (m_corner[1][0] <= m_corner[0][0]) || (m_corner[1][1] <= m_corner[0][1]); + } + + inline uint32_t get_dimension(uint32_t axis) const + { + return m_corner[1][axis] - m_corner[0][axis]; + } + inline uint32_t get_area() const + { + return get_dimension(0) * get_dimension(1); 
+ } + + inline const vec2I& operator[](uint32_t i) const + { + assert(i < 2); + return m_corner[i]; + } + inline vec2I& operator[](uint32_t i) + { + assert(i < 2); + return m_corner[i]; + } + + inline rect& translate(int x_ofs, int y_ofs) + { + m_corner[0][0] += x_ofs; + m_corner[0][1] += y_ofs; + m_corner[1][0] += x_ofs; + m_corner[1][1] += y_ofs; + return *this; + } + + inline rect& init_expand() + { + m_corner[0].set(INT_MAX); + m_corner[1].set(INT_MIN); + return *this; + } + + inline rect& expand(int x, int y) + { + m_corner[0][0] = std::min(m_corner[0][0], x); + m_corner[0][1] = std::min(m_corner[0][1], y); + m_corner[1][0] = std::max(m_corner[1][0], x + 1); + m_corner[1][1] = std::max(m_corner[1][1], y + 1); + return *this; + } + + inline rect& expand(const rect& r) + { + m_corner[0][0] = std::min(m_corner[0][0], r[0][0]); + m_corner[0][1] = std::min(m_corner[0][1], r[0][1]); + m_corner[1][0] = std::max(m_corner[1][0], r[1][0]); + m_corner[1][1] = std::max(m_corner[1][1], r[1][1]); + return *this; + } + + inline bool touches(const rect& r) const + { + for (uint32_t i = 0; i < 2; i++) + { + if (r[1][i] <= m_corner[0][i]) + return false; + else if (r[0][i] >= m_corner[1][i]) + return false; + } + + return true; + } + + inline bool fully_within(const rect& r) const + { + for (uint32_t i = 0; i < 2; i++) + { + if (m_corner[0][i] < r[0][i]) + return false; + else if (m_corner[1][i] > r[1][i]) + return false; + } + + return true; + } + + inline bool intersect(const rect& r) + { + if (!touches(r)) + { + clear(); + return false; + } + + for (uint32_t i = 0; i < 2; i++) + { + m_corner[0][i] = std::max(m_corner[0][i], r[0][i]); + m_corner[1][i] = std::min(m_corner[1][i], r[1][i]); + } + + return true; + } + + inline bool contains(int x, int y) const + { + return (x >= m_corner[0][0]) && (x < m_corner[1][0]) && + (y >= m_corner[0][1]) && (y < m_corner[1][1]); + } + + inline bool contains(const vec2I& p) const + { + return contains(p[0], p[1]); + } + +private: + vec2I 
m_corner[2]; +}; + +inline rect make_rect(uint32_t width, uint32_t height) +{ + return rect(0, 0, width, height); +} + +struct color_quad_u8 +{ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4201) +#endif + union + { + uint8_t m_c[4]; + struct + { + uint8_t r; + uint8_t g; + uint8_t b; + uint8_t a; + }; + }; +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + inline color_quad_u8(eClear) : color_quad_u8(0, 0, 0, 0) { } + + inline color_quad_u8(uint8_t cr, uint8_t cg, uint8_t cb, uint8_t ca) + { + set(cr, cg, cb, ca); + } + + inline color_quad_u8(uint8_t cy = 0, uint8_t ca = 255) + { + set(cy, ca); + } + + inline void clear() + { + set(0, 0, 0, 0); + } + + inline color_quad_u8& set(uint8_t cy, uint8_t ca = 255) + { + m_c[0] = cy; + m_c[1] = cy; + m_c[2] = cy; + m_c[3] = ca; + return *this; + } + + inline color_quad_u8& set(uint8_t cr, uint8_t cg, uint8_t cb, uint8_t ca) + { + m_c[0] = cr; + m_c[1] = cg; + m_c[2] = cb; + m_c[3] = ca; + return *this; + } + + inline color_quad_u8& set_clamped(int cr, int cg, int cb, int ca) + { + m_c[0] = (uint8_t)clamp(cr, 0, 255); + m_c[1] = (uint8_t)clamp(cg, 0, 255); + m_c[2] = (uint8_t)clamp(cb, 0, 255); + m_c[3] = (uint8_t)clamp(ca, 0, 255); + return *this; + } + + color_quad_u8& set_alpha(int ca) { a = (uint8_t)clamp(ca, 0, 255); return *this; } + + inline uint8_t& operator[] (uint32_t i) { assert(i < 4); return m_c[i]; } + inline uint8_t operator[] (uint32_t i) const { assert(i < 4); return m_c[i]; } + + inline int get_luma() const { return (13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U) >> 16U; } // REC709 weightings + + inline bool operator== (const color_quad_u8& other) const + { + return (m_c[0] == other.m_c[0]) && (m_c[1] == other.m_c[1]) && (m_c[2] == other.m_c[2]) && (m_c[3] == other.m_c[3]); + } + + inline bool operator!= (const color_quad_u8& other) const + { + return !(*this == other); + } + + inline uint32_t squared_distance(const color_quad_u8& c, bool alpha = true) const + { + return 
square(r - c.r) + square(g - c.g) + square(b - c.b) + (alpha ? square(a - c.a) : 0); + } + + inline bool rgb_equals(const color_quad_u8& rhs) const + { + return (r == rhs.r) && (g == rhs.g) && (b == rhs.b); + } +}; +typedef std::vector color_quad_u8_vec; + +inline uint32_t color_distance(bool perceptual, const color_quad_u8& e1, const color_quad_u8& e2, bool alpha) +{ + if (perceptual) + { + const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f; + const float cr1 = e1.r - l1; + const float cb1 = e1.b - l1; + + const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f; + const float cr2 = e2.r - l2; + const float cb2 = e2.b - l2; + + const float dl = l1 - l2; + const float dcr = cr1 - cr2; + const float dcb = cb1 - cb2; + + uint32_t d = static_cast( + 32.0f * 4.0f * dl * dl + + 32.0f * 2.0f * (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f)) * dcr * dcr + + 32.0f * .25f * (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f)) * dcb * dcb); + + if (alpha) + { + int da = (int)e1.a - (int)e2.a; + + d += static_cast(128.0f * da * da); + } + + return d; + } + else + return e1.squared_distance(e2, alpha); +} + +extern color_quad_u8 g_white_color_u8, g_black_color_u8, g_red_color_u8, g_green_color_u8, g_blue_color_u8, g_yellow_color_u8, g_purple_color_u8, g_magenta_color_u8, g_cyan_color_u8; + +class image_u8 +{ +public: + image_u8() : + m_width(0), m_height(0), + m_clip_rect(cClear) + { + } + + image_u8(uint32_t width, uint32_t height) : + m_width(width), m_height(height), + m_clip_rect(0, 0, width, height) + { + m_pixels.resize(width * height); + } + + inline const color_quad_u8_vec& get_pixels() const { return m_pixels; } + inline color_quad_u8_vec& get_pixels() { return m_pixels; } + + inline uint32_t width() const { return m_width; } + inline uint32_t height() const { return m_height; } + inline uint32_t total_pixels() const { return m_width * m_height; } + + inline const rect& get_clip_rect() const { return m_clip_rect; } + + inline void set_clip_rect(const 
rect& r) + { + assert((r.get_left() >= 0) && (r.get_top() >= 0) && (r.get_right() <= (int)m_width) && (r.get_bottom() <= (int)m_height)); + + m_clip_rect = r; + } + + inline void clear_clip_rect() { m_clip_rect.set(0, 0, m_width, m_height); } + + inline bool is_clipped(int x, int y) const { return !m_clip_rect.contains(x, y); } + + inline rect get_bounds() const { return rect(0, 0, m_width, m_height); } + + inline color_quad_u8& operator()(uint32_t x, uint32_t y) { assert((x < m_width) && (y < m_height)); return m_pixels[x + m_width * y]; } + inline const color_quad_u8& operator()(uint32_t x, uint32_t y) const { assert((x < m_width) && (y < m_height)); return m_pixels[x + m_width * y]; } + + image_u8& clear() + { + m_width = m_height = 0; + m_clip_rect.clear(); + m_pixels.clear(); + return *this; + } + + image_u8& init(uint32_t width, uint32_t height) + { + clear(); + + m_width = width; + m_height = height; + m_clip_rect.set(0, 0, width, height); + m_pixels.resize(width * height); + return *this; + } + + image_u8& set_all(const color_quad_u8& p) + { + for (uint32_t i = 0; i < m_pixels.size(); i++) + m_pixels[i] = p; + return *this; + } + + inline const color_quad_u8& get_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + inline color_quad_u8& get_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + + inline image_u8& set_pixel_clipped(int x, int y, const color_quad_u8& c) + { + if (!is_clipped(x, y)) + (*this)(x, y) = c; + return *this; + } + + inline image_u8& fill_box(int x, int y, int w, int h, const color_quad_u8& c) + { + for (int y_ofs = 0; y_ofs < h; y_ofs++) + for (int x_ofs = 0; x_ofs < w; x_ofs++) + set_pixel_clipped(x + x_ofs, y + y_ofs, c); + return *this; + } + + void invert_box(int inX, int inY, int inW, int inH) + { + for (int y = 0; y < inH; y++) + { + const uint32_t yy = inY + y; + + for (int x = 0; x < inW; x++) + { + const uint32_t xx = inX + x; + + if 
(is_clipped(xx, yy)) + continue; + + color_quad_u8 c((*this)(xx, yy)); + + c.r = 255 - c.r; + c.g = 255 - c.g; + c.b = 255 - c.b; + + set_pixel_clipped(xx, yy, c); + } + } + } + + image_u8& crop_dup_borders(uint32_t w, uint32_t h) + { + const uint32_t orig_w = m_width, orig_h = m_height; + + crop(w, h); + + if (orig_w && orig_h) + { + if (m_width > orig_w) + { + for (uint32_t x = orig_w; x < m_width; x++) + for (uint32_t y = 0; y < m_height; y++) + set_pixel_clipped(x, y, get_clamped(std::min(x, orig_w - 1U), std::min(y, orig_h - 1U))); + } + + if (m_height > orig_h) + { + for (uint32_t y = orig_h; y < m_height; y++) + for (uint32_t x = 0; x < m_width; x++) + set_pixel_clipped(x, y, get_clamped(std::min(x, orig_w - 1U), std::min(y, orig_h - 1U))); + } + } + return *this; + } + + image_u8& crop(uint32_t new_width, uint32_t new_height) + { + if ((m_width == new_width) && (m_height == new_height)) + return *this; + + image_u8 new_image(new_width, new_height); + + const uint32_t w = std::min(m_width, new_width); + const uint32_t h = std::min(m_height, new_height); + + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + new_image(x, y) = (*this)(x, y); + + return swap(new_image); + } + + image_u8& swap(image_u8& other) + { + std::swap(m_width, other.m_width); + std::swap(m_height, other.m_height); + std::swap(m_pixels, other.m_pixels); + std::swap(m_clip_rect, other.m_clip_rect); + return *this; + } + + // No clipping + inline void get_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, color_quad_u8* pPixels) const + { + assert((bx * width + width) <= m_width); + assert((by * height + height) <= m_height); + + for (uint32_t y = 0; y < height; y++) + memcpy(pPixels + y * width, &(*this)(bx * width, by * height + y), width * sizeof(color_quad_u8)); + } + + inline void get_block_clamped(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, color_quad_u8* pPixels) const + { + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x 
< width; x++) + pPixels[x + y * width] = get_clamped(bx * width + x, by * height + y); + } + + // No clipping + inline void set_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, const color_quad_u8* pPixels) + { + assert((bx * width + width) <= m_width); + assert((by * height + height) <= m_height); + + for (uint32_t y = 0; y < height; y++) + memcpy(&(*this)(bx * width, by * height + y), pPixels + y * width, width * sizeof(color_quad_u8)); + } + + image_u8& swizzle(uint32_t r, uint32_t g, uint32_t b, uint32_t a) + { + assert((r | g | b | a) <= 3); + for (uint32_t y = 0; y < m_height; y++) + { + for (uint32_t x = 0; x < m_width; x++) + { + color_quad_u8 tmp((*this)(x, y)); + (*this)(x, y).set(tmp[r], tmp[g], tmp[b], tmp[a]); + } + } + + return *this; + } + + struct pixel_coord + { + uint16_t m_x, m_y; + pixel_coord() { } + pixel_coord(uint32_t x, uint32_t y) : m_x((uint16_t)x), m_y((uint16_t)y) { } + }; + + uint32_t flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector* pSet_pixels = nullptr); + + void draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color); + + inline void set_pixel_clipped_alphablend(int x, int y, const color_quad_u8& c) + { + if (is_clipped(x, y)) + return; + + color_quad_u8 ct(m_pixels[x + y * m_width]); + + ct.r = static_cast(ct.r + ((c.r - ct.r) * c.a) / 255); + ct.g = static_cast(ct.g + ((c.g - ct.g) * c.a) / 255); + ct.b = static_cast(ct.b + ((c.b - ct.b) * c.a) / 255); + + m_pixels[x + y * m_width] = ct; + } + +private: + color_quad_u8_vec m_pixels; + uint32_t m_width, m_height; + rect m_clip_rect; + + struct fill_segment + { + int16_t m_y, m_xl, m_xr, m_dy; + + fill_segment(int y, int xl, int xr, int dy) : + m_y((int16_t)y), m_xl((int16_t)xl), m_xr((int16_t)xr), m_dy((int16_t)dy) + { + } + }; + + inline bool flood_fill_is_inside(int x, int y, const color_quad_u8& b) const + { + if (is_clipped(x, y)) + return false; + + return (*this)(x, y) == b; + } + + void rasterize_line(int xs, 
int ys, int xe, int ye, int pred, int inc_dec, int e, int e_inc, int e_no_inc, const color_quad_u8& color); + + void draw_aaline_pixel(int x, int y, int a, color_quad_u8 color) + { + color.a = static_cast(255 - a); + set_pixel_clipped_alphablend(x, y, color); + } +}; + +bool load_png(const char* pFilename, image_u8& img); + +bool save_png(const char* pFilename, const image_u8& img, bool save_alpha); + +class image_metrics +{ +public: + double m_max, m_mean, m_mean_squared, m_root_mean_squared, m_peak_snr; + + image_metrics() + { + clear(); + } + + void clear() + { + memset(this, 0, sizeof(*this)); + } + + void compute(const image_u8& a, const image_u8& b, uint32_t first_channel, uint32_t num_channels) + { + const bool average_component_error = true; + + const uint32_t width = std::min(a.width(), b.width()); + const uint32_t height = std::min(a.height(), b.height()); + + assert((first_channel < 4U) && (first_channel + num_channels <= 4U)); + + // Histogram approach originally due to Charles Bloom. 
+ double hist[256]; + memset(hist, 0, sizeof(hist)); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_quad_u8& ca = a(x, y); + const color_quad_u8& cb = b(x, y); + + if (!num_channels) + hist[iabs(ca.get_luma() - cb.get_luma())]++; + else + { + for (uint32_t c = 0; c < num_channels; c++) + hist[iabs(ca[first_channel + c] - cb[first_channel + c])]++; + } + } + } + + m_max = 0; + double sum = 0.0f, sum2 = 0.0f; + for (uint32_t i = 0; i < 256; i++) + { + if (!hist[i]) + continue; + + m_max = std::max(m_max, i); + + double x = i * hist[i]; + + sum += x; + sum2 += i * x; + } + + // See http://richg42.blogspot.com/2016/09/how-to-compute-psnr-from-old-berkeley.html + double total_values = width * height; + + if (average_component_error) + total_values *= clamp(num_channels, 1, 4); + + m_mean = clamp(sum / total_values, 0.0f, 255.0f); + m_mean_squared = clamp(sum2 / total_values, 0.0f, 255.0f * 255.0f); + + m_root_mean_squared = sqrt(m_mean_squared); + + if (!m_root_mean_squared) + m_peak_snr = 100.0f; + else + m_peak_snr = clamp(log10(255.0f / m_root_mean_squared) * 20.0f, 0.0f, 100.0f); + } +}; + +class imagef +{ +public: + imagef() : + m_width(0), m_height(0), m_pitch(0) + { + } + + imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : + m_width(0), m_height(0), m_pitch(0) + { + resize(w, h, p); + } + + imagef(const imagef& other) : + m_width(0), m_height(0), m_pitch(0) + { + *this = other; + } + + imagef& swap(imagef& other) + { + std::swap(m_width, other.m_width); + std::swap(m_height, other.m_height); + std::swap(m_pitch, other.m_pitch); + m_pixels.swap(other.m_pixels); + return *this; + } + + imagef& operator= (const imagef& rhs) + { + if (this != &rhs) + { + m_width = rhs.m_width; + m_height = rhs.m_height; + m_pitch = rhs.m_pitch; + m_pixels = rhs.m_pixels; + } + return *this; + } + + imagef& clear() + { + m_width = 0; + m_height = 0; + m_pitch = 0; + m_pixels.resize(0); + return *this; + } + + imagef& 
set(const image_u8& src, const vec4F& scale = vec4F(1), const vec4F& bias = vec4F(0)) + { + const uint32_t width = src.width(); + const uint32_t height = src.height(); + + resize(width, height); + + for (int y = 0; y < (int)height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_quad_u8& src_pixel = src(x, y); + (*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]); + } + } + + return *this; + } + + imagef& resize(const imagef& other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0, 0, 0, 1)) + { + return resize(other.get_width(), other.get_height(), p, background); + } + + imagef& resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0, 0, 0, 1)) + { + return crop(w, h, p, background); + } + + imagef& set_all(const vec4F& c) + { + for (uint32_t i = 0; i < m_pixels.size(); i++) + m_pixels[i] = c; + return *this; + } + + imagef& fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F& c) + { + for (uint32_t iy = 0; iy < h; iy++) + for (uint32_t ix = 0; ix < w; ix++) + set_pixel_clipped(x + ix, y + iy, c); + return *this; + } + + imagef& crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0, 0, 0, 1)) + { + if (p == UINT32_MAX) + p = w; + + if ((w == m_width) && (m_height == h) && (m_pitch == p)) + return *this; + + if ((!w) || (!h) || (!p)) + { + clear(); + return *this; + } + + vec4F_array cur_state; + cur_state.swap(m_pixels); + + m_pixels.resize(p * h); + + for (uint32_t y = 0; y < h; y++) + { + for (uint32_t x = 0; x < w; x++) + { + if ((x < m_width) && (y < m_height)) + m_pixels[x + y * p] = cur_state[x + y * m_pitch]; + else + m_pixels[x + y * p] = background; + } + } + + m_width = w; + m_height = h; + m_pitch = p; + + return *this; + } + + inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert(x < m_width&& y 
< m_height); return m_pixels[x + y * m_pitch]; } + inline vec4F& operator() (uint32_t x, uint32_t y) { assert(x < m_width&& y < m_height); return m_pixels[x + y * m_pitch]; } + + inline const vec4F& get_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + inline vec4F& get_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + + inline const vec4F& get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const + { + x = wrap_u ? posmod(x, m_width) : clamp(x, 0, m_width - 1); + y = wrap_v ? posmod(y, m_height) : clamp(y, 0, m_height - 1); + return m_pixels[x + y * m_pitch]; + } + + inline vec4F& get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) + { + x = wrap_u ? posmod(x, m_width) : clamp(x, 0, m_width - 1); + y = wrap_v ? posmod(y, m_height) : clamp(y, 0, m_height - 1); + return m_pixels[x + y * m_pitch]; + } + + inline imagef& set_pixel_clipped(int x, int y, const vec4F& c) + { + if ((static_cast(x) < m_width) && (static_cast(y) < m_height)) + (*this)(x, y) = c; + return *this; + } + + // Very straightforward blit with full clipping. Not fast, but it works. 
+    // Copies the src_w x src_h region of src whose top-left corner is (src_x, src_y) into this
+    // image so that it lands at (dst_x, dst_y). Source coordinates that fall outside src are
+    // skipped, and every destination write goes through set_pixel_clipped(), so neither image
+    // is touched out of range. Returns *this.
+    imagef& blit(const imagef& src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
+    {
+        for (int y = 0; y < src_h; y++)
+        {
+            const int sy = src_y + y;
+            if (sy < 0)
+                continue;
+            else if (sy >= (int)src.get_height())
+                break; // every remaining row is below the source image
+
+            for (int x = 0; x < src_w; x++)
+            {
+                const int sx = src_x + x;
+                if (sx < 0)
+                    continue;
+                // Columns are bounded by the source WIDTH. Testing against the height here would
+                // truncate rows of wide sources and read past the row end of tall ones.
+                else if (sx >= (int)src.get_width())
+                    break; // every remaining column is right of the source image
+
+                set_pixel_clipped(dst_x + x, dst_y + y, src(sx, sy));
+            }
+        }
+
+        return *this;
+    }
+
+    // Writes w * h pixels to pDst in row-major order, reading from (src_x, src_y) onward.
+    // Reads go through get_clamped(), so coordinates outside the image repeat edge pixels.
+    // pDst must have room for w * h entries. Returns *this.
+    const imagef& extract_block_clamped(vec4F* pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
+    {
+        for (uint32_t y = 0; y < h; y++)
+            for (uint32_t x = 0; x < w; x++)
+                *pDst++ = get_clamped(src_x + x, src_y + y);
+        return *this;
+    }
+
+    // Stores w * h row-major pixels from pSrc with their top-left at (dst_x, dst_y).
+    // Pixels that land outside the image are dropped; pSrc still advances past them so the
+    // remaining pixels keep their positions. Returns *this.
+    imagef& set_block_clipped(const vec4F* pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
+    {
+        for (uint32_t y = 0; y < h; y++)
+            for (uint32_t x = 0; x < w; x++)
+                set_pixel_clipped(dst_x + x, dst_y + y, *pSrc++);
+        return *this;
+    }
+
+    // Dimensions, all measured in pixels. The pitch is the row stride used when indexing m_pixels.
+    inline uint32_t get_width() const { return m_width; }
+    inline uint32_t get_height() const { return m_height; }
+    inline uint32_t get_pitch() const { return m_pitch; }
+    inline uint32_t get_total_pixels() const { return m_width * m_height; }
+
+    // Number of w-wide / h-tall blocks needed to cover the image (rounded up). w and h must be non-zero.
+    inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
+    inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
+    inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }
+
+    // Direct access to the pixel storage.
+    inline const vec4F_array& get_pixels() const { return m_pixels; }
+    inline vec4F_array& get_pixels() { return m_pixels; }
+
+    // Pointer to the first stored pixel. data() is used instead of &m_pixels[0] so the call is
+    // well-defined on an empty image; the pointer must not be dereferenced in that case.
+    inline const vec4F* get_ptr() const { return m_pixels.data(); }
+    inline vec4F* get_ptr() { return m_pixels.data(); }
+
+private:
+    uint32_t m_width, m_height, m_pitch; // all in pixels
+    vec4F_array m_pixels;
+};
+
+// Bit flags (values 1, 2, 4 can be OR'ed together).
+// NOTE(review): presumably these are the values accepted by the `flags` argument of
+// compute_gaussian_kernel() - confirm against its implementation.
+enum
+{
+    cComputeGaussianFlagNormalize = 1,
+    cComputeGaussianFlagPrint = 2,
+    cComputeGaussianFlagNormalizeCenterToOne = 4
+};
+
+// size_x/y should be odd
+void compute_gaussian_kernel(float* pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags);
+
+void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping = false, uint32_t width_divisor = 1, uint32_t height_divisor = 1);
+
+vec4F compute_ssim(const imagef& a, const imagef& b);
+
+vec4F compute_ssim(const image_u8& a, const image_u8& b, bool luma);
+
+// 8-byte block of opaque data.
+struct block8
+{
+    uint64_t m_vals[1];
+};
+
+typedef std::vector<block8> block8_vec;
+
+// 16-byte block of opaque data.
+struct block16
+{
+    uint64_t m_vals[2];
+};
+
+typedef std::vector<block16> block16_vec;
+
+//bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header);
+
+void strip_extension(std::string& s);
+void strip_path(std::string& s);
+
+uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);
+
+// https://www.johndcook.com/blog/standard_deviation/
+// This class is for small numbers of integers, so precision shouldn't be an issue.
+// It accumulates the count, the sum and the sum of squares of every value passed to
+// update(); all statistics below are derived from those three totals.
+class tracked_stat
+{
+public:
+    tracked_stat() { clear(); }
+
+    void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+
+    // Widen before squaring: a 32-bit val * val wraps once val >= 65536.
+    void update(uint32_t val) { m_num++; m_total += val; m_total2 += (uint64_t)val * val; }
+
+    tracked_stat& operator += (uint32_t val) { update(val); return *this; }
+
+    uint32_t get_number_of_values() const { return m_num; }
+    uint64_t get_total() const { return m_total; }
+    uint64_t get_total2() const { return m_total2; }
+
+    // Arithmetic mean, or 0 when no values have been added.
+    float get_mean() const { return m_num ? (float)m_total / m_num : 0.0f; };
+
+    // Population variance / standard deviation: (n * sum(x^2) - sum(x)^2) / n^2.
+    // The divisor m_num * m_num is evaluated in 32 bits (m_num is uint32_t).
+    float get_variance() const { return m_num ? ((float)(m_num * m_total2 - m_total * m_total)) / (m_num * m_num) : 0.0f; }
+    float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
+
+    // Sample variance / standard deviation (divides by n * (n - 1)); 0 with fewer than two values.
+    float get_sample_variance() const { return (m_num > 1) ? ((float)(m_num * m_total2 - m_total * m_total)) / (m_num * (m_num - 1)) : 0.0f; }
+    float get_sample_std_dev() const { return (m_num > 1) ? sqrtf(get_sample_variance()) : 0.0f; }
+
+private:
+    uint32_t m_num;
+    uint64_t m_total;
+    uint64_t m_total2;
+};
+
+// Covariance of pA[0..n) and pB[0..n), where n and the two means are taken from the
+// tracked_stat objects a and b (presumably filled from the same data as pA/pB - the
+// caller is responsible for that). `sample` selects the n - 1 divisor instead of n.
+// Asserts and returns 0 when there are no values, or only one value in sample mode.
+inline float compute_covariance(const float* pA, const float* pB, const tracked_stat& a, const tracked_stat& b, bool sample)
+{
+    const uint32_t n = a.get_number_of_values();
+    assert(n == b.get_number_of_values());
+
+    if (!n)
+    {
+        assert(0);
+        return 0.0f;
+    }
+    if ((sample) && (n == 1))
+    {
+        assert(0);
+        return 0;
+    }
+
+    const float mean_a = a.get_mean();
+    const float mean_b = b.get_mean();
+
+    float total = 0.0f;
+    for (uint32_t i = 0; i < n; i++)
+        total += (pA[i] - mean_a) * (pB[i] - mean_b);
+
+    return total / (sample ? (n - 1) : n);
+}
+
+// Returns (covariance + c) / (std_dev_a * std_dev_b + c), clamped to [-1, 1].
+// Returns 1 when `a` holds no values or when the denominator is below 1.25e-5.
+// `clamp` is resolved outside this hunk.
+inline float compute_correlation_coefficient(const float* pA, const float* pB, const tracked_stat& a, const tracked_stat& b, float c, bool sample)
+{
+    if (!a.get_number_of_values())
+        return 1.0f;
+
+    float covar = compute_covariance(pA, pB, a, b, sample);
+    float std_dev_a = sample ? a.get_sample_std_dev() : a.get_std_dev();
+    float std_dev_b = sample ? b.get_sample_std_dev() : b.get_std_dev();
+    float denom = std_dev_a * std_dev_b + c;
+
+    if (denom < .0000125f)
+        return 1.0f;
+
+    float result = (covar + c) / denom;
+
+    return clamp(result, -1.0f, 1.0f);
+}
+
+float compute_block_max_std_dev(const color_quad_u8* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps);
+
+// Thin wrapper around a std::mt19937 engine. A default-constructed instance uses the
+// engine's default seed.
+class rand
+{
+    std::mt19937 m_mt;
+
+public:
+    rand() { }
+
+    rand(uint32_t s) { seed(s); }
+    void seed(uint32_t s) { m_mt.seed(s); }
+
+    // between [l,h]
+    int irand(int l, int h) { std::uniform_int_distribution<int> d(l, h); return d(m_mt); }
+
+    // Draws over the full int32 range and reinterprets the result as unsigned.
+    uint32_t urand32() { return static_cast<uint32_t>(irand(INT32_MIN, INT32_MAX)); }
+
+    bool bit() { return irand(0, 1) == 1; }
+
+    // Low 8 bits of urand32().
+    uint8_t byte() { return static_cast<uint8_t>(urand32()); }
+
+    // between [l,h)
+    float frand(float l, float h) { std::uniform_real_distribution<float> d(l, h); return d(m_mt); }
+
+    float gaussian(float mean, float stddev) { std::normal_distribution<float> d(mean, stddev); return d(m_mt); }
+};
+
+bool save_astc_file(const char* pFilename, block16_vec& blocks, uint32_t width, uint32_t height, uint32_t block_width, uint32_t block_height);
+bool load_astc_file(const char* pFilename, block16_vec& blocks, uint32_t& width, uint32_t& height, uint32_t& block_width, uint32_t& block_height);
+
+// Running statistics over double samples. Keeps the sum, the sum of squares, the
+// min/max and a copy of every sample (needed by get_val() and get_mean()).
+class value_stats
+{
+public:
+    value_stats()
+    {
+        clear();
+    }
+
+    // Resets all totals; min/max start at +/-1e39 sentinels so the first add() replaces them.
+    void clear()
+    {
+        m_sum = 0;
+        m_sum2 = 0;
+        m_num = 0;
+        m_min = 1e+39;
+        m_max = -1e+39;
+        m_vals.clear();
+    }
+
+    void add(double val)
+    {
+        m_sum += val;
+        m_sum2 += val * val;
+
+        m_num++;
+
+        m_min = std::min(m_min, val);
+        m_max = std::max(m_max, val);
+
+        m_vals.push_back(val);
+    }
+
+    // Integer overloads convert to double and forward to add(double).
+    void add(int val)
+    {
+        add(static_cast<double>(val));
+    }
+
+    void add(uint32_t val)
+    {
+        add(static_cast<double>(val));
+    }
+
+    void add(int64_t val)
+    {
+        add(static_cast<double>(val));
+    }
+
+    void add(uint64_t val)
+    {
+        add(static_cast<double>(val));
+    }
+
+    // Prints a one-line summary to stdout, prefixed with pPrefix.
+    void print(const char* pPrefix = "")
+    {
+        if (!m_vals.size())
+            printf("%s: Empty\n", pPrefix);
+        else
+            printf("%s: Samples: %llu, Total: %f, Avg: %f, Std Dev: %f, Min: %f, Max: %f, Mean: %f\n",
+                pPrefix, (unsigned long long)get_num(), get_total(), get_average(), get_std_dev(), get_min(), get_max(), get_mean());
+    }
+
+    double get_total() const
+    {
+        return m_sum;
+    }
+
+    // Arithmetic mean of the samples, or 0 when empty.
+    double get_average() const
+    {
+        return m_num ? (m_sum / m_num) : 0.0f;
+    }
+
+    double get_min() const
+    {
+        return m_min;
+    }
+
+    double get_max() const
+    {
+        return m_max;
+    }
+
+    uint64_t get_num() const
+    {
+        return m_num;
+    }
+
+    double get_val(uint32_t index) const
+    {
+        return m_vals[index];
+    }
+
+    // Returns population standard deviation
+    double get_std_dev() const
+    {
+        if (!m_num)
+            return 0.0f;
+
+        // TODO: FP precision
+        return sqrt((m_sum2 - ((m_sum * m_sum) / m_num)) / m_num);
+    }
+
+    // NOTE: despite the name this returns the middle element of the sorted samples
+    // (index size / 2, i.e. the upper median for an even count), not the arithmetic
+    // mean; use get_average() for the latter. Sorts a copy, so it is O(n log n) per call.
+    double get_mean() const
+    {
+        if (!m_num)
+            return 0.0f;
+
+        std::vector<double> sorted_vals(m_vals);
+        std::sort(sorted_vals.begin(), sorted_vals.end());
+
+        return sorted_vals[sorted_vals.size() / 2];
+    }
+
+private:
+    double m_sum;
+    double m_sum2;
+
+    uint64_t m_num;
+
+    double m_min;
+    double m_max;
+
+    mutable std::vector<double> m_vals;
+};
+
+uint32_t get_deflate_size(const void* pData, size_t data_size);
+
+} // namespace utils
+
+#ifdef _MSC_VER
+#pragma warning (pop)
+#endif
diff --git a/libkram/kram/KramImage.cpp b/libkram/kram/KramImage.cpp index 80568fdf..82c35f44 100644 --- a/libkram/kram/KramImage.cpp +++ b/libkram/kram/KramImage.cpp @@ -2524,29 +2524,29 @@ bool KramEncoder::compressMipLevel(const ImageInfo& info, KTXImage& image, uberLevel = 0; maxPartitions = 0; bc7params.m_try_least_squares = false; - bc7params.m_mode_partition_estimation_filterbank = true; + bc7params.m_mode17_partition_estimation_filterbank = true; } else if (info.quality <= 40) { uberLevel = 0; maxPartitions = 16; bc7params.m_try_least_squares = false; - bc7params.m_mode_partition_estimation_filterbank = true; + bc7params.m_mode17_partition_estimation_filterbank = true; } else if (info.quality <= 90) { uberLevel = 1; maxPartitions = 64; 
bc7params.m_try_least_squares = true; // true = 0.7s on test case - bc7params.m_mode_partition_estimation_filterbank = true; + bc7params.m_mode17_partition_estimation_filterbank = true; } else { uberLevel = 4; maxPartitions = 64; bc7params.m_try_least_squares = true; - bc7params.m_mode_partition_estimation_filterbank = true; + bc7params.m_mode17_partition_estimation_filterbank = true; } bc7params.m_uber_level = std::min(uberLevel, (uint32_t)BC7ENC_MAX_UBER_LEVEL); - bc7params.m_max_partitions_mode = std::min(maxPartitions, (uint32_t)BC7ENC_MAX_PARTITIONS1); + bc7params.m_max_partitions = std::min(maxPartitions, (uint32_t)BC7ENC_MAX_PARTITIONS); } else if (info.pixelFormat == MyMTLPixelFormatBC1_RGBA || info.pixelFormat == MyMTLPixelFormatBC1_RGBA_sRGB || From 73f333ff2ce96f6c6aef358545f95f017b8073cc Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 16 Jul 2022 09:08:12 -0700 Subject: [PATCH 047/615] kram - fix etc2 encoder This has an optimization to knock out color on transparent pixels, but that only works if encoding a premul texture. kram already knocks out the color pixels in that case. This was causing artifacts in rgb where alpha was fully 0 across the block on 4 channel ETC2 rgba textures. 
--- libkram/etc2comp/EtcBlock4x4Encoding_ETC1.cpp | 2 +- libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp | 7 ++++++- libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp | 6 +++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_ETC1.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_ETC1.cpp index 360e302e..f591a8bb 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_ETC1.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_ETC1.cpp @@ -409,7 +409,7 @@ namespace Etc ColorFloatRGBA frgbaSumUR = m_pafrgbaSource[8] + m_pafrgbaSource[9] + m_pafrgbaSource[12] + m_pafrgbaSource[13]; ColorFloatRGBA frgbaSumLR = m_pafrgbaSource[10] + m_pafrgbaSource[11] + m_pafrgbaSource[14] + m_pafrgbaSource[15]; - // aveerage value of 8 pixels for each of the 4 corners + // average value of 8 pixels for each of the 4 corners m_frgbaSourceAverageLeft = (frgbaSumUL + frgbaSumLL) * 0.125f; m_frgbaSourceAverageRight = (frgbaSumUR + frgbaSumLR) * 0.125f; m_frgbaSourceAverageTop = (frgbaSumUL + frgbaSumUR) * 0.125f; diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp index a6f0f125..fe593a26 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGB8.cpp @@ -451,7 +451,8 @@ namespace Etc float fGrayDistance2ToColor2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor2_TAndH); ColorFloatRGBA frgbaAlphaWeightedSource = m_pafrgbaSource[uiPixel] * alpha; - + frgbaAlphaWeightedSource.fA = 1.0f; + if (fGrayDistance2ToColor1 <= fGrayDistance2ToColor2) { fPixelsCloserToColor1 += alpha; @@ -468,9 +469,13 @@ namespace Etc break; } + // this doesn't scale alpha ColorFloatRGBA frgbAvgColor1Pixels = (frgbSumPixelsCloserToColor1 * (1.0f / fPixelsCloserToColor1)).QuantizeR4G4B4(); ColorFloatRGBA frgbAvgColor2Pixels = (frgbSumPixelsCloserToColor2 * (1.0f / fPixelsCloserToColor2)).QuantizeR4G4B4(); + frgbAvgColor1Pixels.fA = 1.0f; + frgbAvgColor2Pixels.fA = 1.0f; + 
if (frgbAvgColor1Pixels.fR == m_frgbaOriginalColor1_TAndH.fR && frgbAvgColor1Pixels.fG == m_frgbaOriginalColor1_TAndH.fG && frgbAvgColor1Pixels.fB == m_frgbaOriginalColor1_TAndH.fB && diff --git a/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp b/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp index 2c9fcdaa..ea0a2427 100644 --- a/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp +++ b/libkram/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp @@ -145,7 +145,7 @@ namespace Etc m_boolDone = true; } else if ((sourceAlphaMix == Block4x4::SourceAlphaMix::ALL_ZERO_ALPHA) || - (sourceAlphaMix == Block4x4::SourceAlphaMix::TRANSPARENT)) + (sourceAlphaMix == Block4x4::SourceAlphaMix::TRANSPARENT)) { // set the A8 portion m_fBase = 0; @@ -504,6 +504,9 @@ namespace Etc { m_alpha.PerformIteration(a_fEffort); + /* TODO: can only do this if color if encoding premul color + but kram already knocks out all the color channels in this cae + // this skips writing out color too if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT) { @@ -530,6 +533,7 @@ namespace Etc m_boolDone = true; //m_uiEncodingIterations++; } + */ } if (!m_boolDone) From 69c5f44de9efe493cec11760d286db52dcc00b7c Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 16 Jul 2022 09:58:33 -0700 Subject: [PATCH 048/615] kramv - fix decode Was referencing KRAM_SSE which didn't exist instead of USE_SSE --- kramv/KramLoader.mm | 2 +- libkram/kram/TaskSystem.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kramv/KramLoader.mm b/kramv/KramLoader.mm index 97a08efe..8876ac42 100644 --- a/kramv/KramLoader.mm +++ b/kramv/KramLoader.mm @@ -74,7 +74,7 @@ - (instancetype)init // for macOS/win Intel need to decode astc/etc // on macOS/arm, the M1 supports all 3 encode formats -#define DO_DECODE KRAM_SSE +#define DO_DECODE USE_SSE #if DO_DECODE diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index f709b22b..8fa1470b 100644 --- a/libkram/kram/TaskSystem.cpp +++ 
b/libkram/kram/TaskSystem.cpp @@ -480,7 +480,7 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t #if KRAM_MAC // don't use this, it's unsupported on ARM chips, and only affinity hints on x64 -// #if KRAM_SSE +// #if USE_SSE // if (!coreInfo.isTranslated) { // thread_affinity_policy_data_t policy = { (int)affinityMask }; // From e83124266b5b577353b70e34e5e4ddd25279cb2e Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 16 Jul 2022 13:28:00 -0700 Subject: [PATCH 049/615] kram - decode ETC2 on M1 if using 3d textures M1 supports all texture formats, but apparently not on 3d texture types - only ASTC/BC there I guess. --- kramv/KramLoader.mm | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/kramv/KramLoader.mm b/kramv/KramLoader.mm index 8876ac42..04c5d010 100644 --- a/kramv/KramLoader.mm +++ b/kramv/KramLoader.mm @@ -72,25 +72,26 @@ - (instancetype)init originalFormat:originalFormat]; } -// for macOS/win Intel need to decode astc/etc -// on macOS/arm, the M1 supports all 3 encode formats -#define DO_DECODE USE_SSE -#if DO_DECODE // this means format isnt supported on platform, but can be decoded to rgba to // display -bool isDecodeImageNeeded(MyMTLPixelFormat pixelFormat) +bool isDecodeImageNeeded(MyMTLPixelFormat pixelFormat, MyMTLTextureType type) { bool needsDecode = false; +#if USE_SSE if (isETCFormat(pixelFormat)) { needsDecode = true; } else if (isASTCFormat(pixelFormat)) { needsDecode = true; } - +#else + if (isETCFormat(pixelFormat) && type == MyMTLTextureType3D) { + needsDecode = true; + } +#endif return needsDecode; } @@ -98,7 +99,7 @@ bool decodeImage(const KTXImage &image, KTXImage &imageDecoded) { KramDecoderParams decoderParams; KramDecoder decoder; - +#if USE_SSE if (isETCFormat(image.pixelFormat)) { if (!decoder.decode(image, imageDecoded, decoderParams)) { return NO; @@ -109,6 +110,13 @@ bool decodeImage(const KTXImage &image, KTXImage &imageDecoded) return NO; } } 
+#else + if (isETCFormat(image.pixelFormat) && image.textureType == MyMTLTextureType3D) { + if (!decoder.decode(image, imageDecoded, decoderParams)) { + return NO; + } + } +#endif else { assert(false); // don't call this routine if decode not needed } @@ -119,8 +127,6 @@ bool decodeImage(const KTXImage &image, KTXImage &imageDecoded) return YES; } -#endif - #if SUPPORT_RGB // TODO: move these into libkram @@ -260,8 +266,7 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) *originalFormat = (MTLPixelFormat)image.pixelFormat; } -#if DO_DECODE - if (isDecodeImageNeeded(image.pixelFormat)) { + if (isDecodeImageNeeded(image.pixelFormat, image.textureType)) { KTXImage imageDecoded; if (!decodeImage(image, imageDecoded)) { return nil; @@ -270,7 +275,6 @@ inline MyMTLPixelFormat remapInternalRGBFormat(MyMTLPixelFormat format) return [self blitTextureFromImage:imageDecoded name:name]; } else -#endif { // fast load path directly from mmap'ed data, decompress direct to staging return [self blitTextureFromImage:image name:name]; From 8a1ea4367f547cb71e8e456389a021a40f5d8a9a Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 23 Jul 2022 12:03:50 -0700 Subject: [PATCH 050/615] kram - return failure if src data from dds is incomplete --- libkram/kram/Kram.cpp | 10 +++++----- libkram/kram/KramDDSHelper.cpp | 5 +++++ libkram/lodepng/lodepng.h | 6 ++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index 9880a010..01e105dc 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -52,7 +52,7 @@ namespace kram { using namespace NAMESPACE_STL; // lodepng iccp decode is failing when setting this for some reason, find out why -// Must set in with LODEPNG_NO_COMPILE_ZLIB in lodepng.h if true +// Must set it with LODEPNG_NO_COMPILE_ZLIB in lodepng.h if true static bool useMiniZ = false; template @@ -64,22 +64,22 @@ void releaseVector(vector& v) bool isKTXFilename(const char* filename) { - // 
should really lookg at first 4 bytes of data + // should really look at first 4 bytes of data return endsWithExtension(filename, ".ktx"); } bool isKTX2Filename(const char* filename) { - // should really lookg at first 4 bytes of data + // should really look at first 4 bytes of data return endsWithExtension(filename, ".ktx2"); } bool isDDSFilename(const char* filename) { - // should really lookg at first 4 bytes of data + // should really look at first 4 bytes of data return endsWithExtension(filename, ".dds"); } bool isPNGFilename(const char* filename) { - // should really lookg at first 4 bytes of data + // should really look at first 4 bytes of data return endsWithExtension(filename, ".png"); } diff --git a/libkram/kram/KramDDSHelper.cpp b/libkram/kram/KramDDSHelper.cpp index 595b3390..af3909d6 100644 --- a/libkram/kram/KramDDSHelper.cpp +++ b/libkram/kram/KramDDSHelper.cpp @@ -511,6 +511,11 @@ bool DDSHelper::load(const uint8_t* data, size_t dataSize, KTXImage& image, bool size_t dstOffset = image.chunkOffset(mipNum, chunkNum); size_t mipLength = image.mipLevels[mipNum].length; + if ((mipDataOffset + srcOffset + mipLength) > dataSize) { + KLOGE("kram", "source image data incomplete"); + return false; + } + memcpy(dstImageData + dstOffset, srcImageData + srcOffset, mipLength); srcOffset += mipLength; diff --git a/libkram/lodepng/lodepng.h b/libkram/lodepng/lodepng.h index 785e777f..524bca4f 100644 --- a/libkram/lodepng/lodepng.h +++ b/libkram/lodepng/lodepng.h @@ -36,9 +36,11 @@ extern const char* LODEPNG_VERSION_STRING; // don't need io #define LODEPNG_NO_COMPILE_DISK -// using miniz now + +// using miniz now, but this was failing using miniz so switched off //#define LODEPNG_NO_COMPILE_ZLIB -// was not doing png saves, but to strip blocks now need to + +// was not doing png encodes, but to strip blocks now need to #define LODEPNG_COMPILE_ENCODER /* From a3e8ff83e86cd833ae77910668127b5bbe815b60 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Mon, 25 Jul 2022 
21:43:55 -0700 Subject: [PATCH 051/615] kramv - fix buildShaders.sh script This wasn't including all shaders, and is simpler to build the metallib while skipping the .air files. Can drop this onto kramv to hotload shaders by re-opening it in recent file list. --- scripts/buildShaders.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scripts/buildShaders.sh b/scripts/buildShaders.sh index 1c01481f..d52cc060 100755 --- a/scripts/buildShaders.sh +++ b/scripts/buildShaders.sh @@ -1,7 +1,6 @@ #!/bin/zsh -xcrun -sdk macosx metal -c ../kramv/KramShaders.metal -o ../bin/KramShaders.air -xcrun -sdk macosx metallib ../bin/KramShaders.air -o ../bin/KramShaders.metallib - -# don't need this after metallib built -rm ../bin/KramShaders.air \ No newline at end of file +# run from kram directory +pushd kramv/Shaders +xcrun -sdk macosx metal KramShaders.metal skybox.metal pbr.metal hdr.metal brdf.metal -o ../../bin/KramShaders.metallib +popd From 0d458babb1ec324ba0c6a1d906bcc646d30f735c Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Wed, 27 Jul 2022 10:10:50 -0700 Subject: [PATCH 052/615] kram - expose sdfThreshold, and improve walk of samples Getting more images that are 8-bit gray, and the fixed 120 cutoff isn't enough. So expose that to callers via -sdfThreshold Also walk the data in hedistance more in line with the original. 
--- libkram/heman/hedistance.cpp | 67 ++++++++++++++++++++-------------- libkram/kram/Kram.cpp | 20 +++++++++- libkram/kram/KramImage.cpp | 2 +- libkram/kram/KramImageInfo.cpp | 2 + libkram/kram/KramImageInfo.h | 5 +++ libkram/kram/KramSDFMipper.cpp | 5 ++- libkram/kram/KramSDFMipper.h | 2 +- 7 files changed, 71 insertions(+), 32 deletions(-) diff --git a/libkram/heman/hedistance.cpp b/libkram/heman/hedistance.cpp index ab5802cf..0f03de07 100644 --- a/libkram/heman/hedistance.cpp +++ b/libkram/heman/hedistance.cpp @@ -114,16 +114,21 @@ static void heman_image_destroy(heman_image* img) // and an array of (w * h * nbands) floats, in scanline order. For simplicity // the API disallows struct definitions, so this is just an opaque handle. +using hfloat = float; + // 1E20 isn't big enough to process a 2k image with 1 pixel in each corner -const float INF = 1E20; +const hfloat INF = 1E20; // 2k max image is 11-bits x squared = 22 + 1 // this is also the limit of single-float precision in the mantissa -//const float INF = 1E23; +//const hfloat INF = 1E23; #define NEW(t, n) (t*)calloc(n, sizeof(t)) #define SQR(x) ((x) * (x)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// Compare here +// https://github.com/prideout/heman/blob/master/src/distance.c + // @ 8k x 8K resolution, this needs 1/2 GB for the fp32 buffers // so it resizes to the dst area. It doesn't re-eval off parabolas until the very end. // and needs first pass data for second pass. @@ -131,8 +136,9 @@ const float INF = 1E20; // This is really the sedt (squared euclidian distance transform). // The advantage is this can be stored in integer values, but this does parabolic lookup. // Also since EDT is done as a separable filter, it completes very quickly in O(N) of two passes. 
+ static void squared_edt( - const float* f, float* d, float* z, int32_t* w, int32_t numSrcSamples, int32_t numDstSamples) + const hfloat* f, hfloat* d, hfloat* z, int32_t* w, int32_t numSrcSamples, int32_t numDstSamples) { // hull vertices w[0] = 0; @@ -141,19 +147,20 @@ static void squared_edt( z[0] = -INF; z[1] = +INF; - for (int32_t k = 0, q = 1; q < numSrcSamples; ++q) { + int32_t k = 0; + for (int32_t q = 1; q < numSrcSamples; ++q) { int32_t wk = w[k]; - float s; + const hfloat sConst = (f[q] + SQR(q)); - s = ((f[q] - f[wk]) + (float)(SQR(q) - SQR(wk))) / (float)(2 * (q - wk)); + hfloat s = (sConst - f[wk] - SQR(wk)) / (2 * (q - wk)); // this additional parabolic search completes in 0 or 1 iterations, so algorithm still O(n) - // sarch back and replace any higher parabola + // search back and replace any higher parabola while (s <= z[k]) { --k; wk = w[k]; - s = ((f[q] - f[wk]) + (float)(SQR(q) - SQR(wk))) / (float)(2 * (q - wk)); + s = (sConst - f[wk] - SQR(wk)) / (2 * (q - wk)); } k++; @@ -169,24 +176,28 @@ static void squared_edt( // Note: this can resample do a different sample count, since the stored parabolas // can be evaluated at any point along the curve. bool isResampling = numSrcSamples > numDstSamples; - float conversion = (numSrcSamples / (float)numDstSamples); + float conversion = (numDstSamples / (float)numSrcSamples); - for (int32_t k = 0, q = 0; q < numDstSamples; ++q) { - float qSrc = (float)q; - // convert q in dstSamples into sample in srcSamples - if (isResampling) { - qSrc *= conversion; + k = 0; + for (int32_t q = 0; q < numSrcSamples; ++q) { + // lookup the parabola, and evalute distance-squared from that + while (z[k + 1] < q) { + ++k; } - // lookup the parabola, and evalute distance squared from that - while (z[k + 1] < qSrc) { - ++k; + // convert to dst sample + int32_t qDst = q; + if (isResampling) { + // this may overwrite the same value > 1 time + // TODO: what if this skips an entry in d[]? 
- is that possible + qDst = (int32_t)((float)q * conversion); // don't roundf + assert(qDst < numDstSamples); } - int32_t wk = w[k]; - d[q] = f[wk] + (float)SQR(qSrc - (float)wk); - // above is adding intersection height to existing sample + // add intersection height to existing sample // of the lowest point intersection + int32_t wk = w[k]; + d[qDst] = f[wk] + SQR(q - wk); } } @@ -202,18 +213,18 @@ static void transform_to_distance(heman_image* temp, const my_image* src, int32_ assert(srcWidth >= dstWidth && srcHeight >= dstHeight); // these can all just be strip buffers per thread, but only one thread - // these were originall turned into 2d arrays for omp + // these were originally turned into 2d arrays for omp int32_t maxDim = MAX(srcWidth, srcHeight); - float* f = NEW(float, maxDim); - float* d = NEW(float, maxDim); - float* z = NEW(float, maxDim+1); // padded by 1 + hfloat* f = NEW(hfloat, maxDim); + hfloat* d = NEW(hfloat, maxDim); + hfloat* z = NEW(hfloat, maxDim+1); // padded by 1 int32_t* w = NEW(int32_t, maxDim); // process rows for (int32_t y = 0; y < srcHeight; ++y) { const uint8_t* s = src->data + y * srcWidth; - // load data into the rows, this is because tmp width is dstWidth, not srcWidth + // load data into the rows, if (isPositive) { for (int32_t x = 0; x < srcWidth; ++x) { f[x] = s[x] ? 
INF : 0; @@ -229,6 +240,7 @@ static void transform_to_distance(heman_image* temp, const my_image* src, int32_ // this is only pulling from closest parabola, not bilerping squared_edt(f, d, z, w, srcWidth, dstWidth); + // now have dstWidth * srcHeight image for column pass below float* t = temp->data + y * dstWidth; for (int32_t x = 0; x < dstWidth; ++x) { t[x] = d[x]; @@ -237,6 +249,7 @@ static void transform_to_distance(heman_image* temp, const my_image* src, int32_ // process columns for (int32_t x = 0; x < dstWidth; ++x) { + // Note offset by x references a specific column float* t = temp->data + x; for (int32_t y = 0; y < srcHeight; ++y) { @@ -247,8 +260,9 @@ static void transform_to_distance(heman_image* temp, const my_image* src, int32_ // this is only pulling from closest parabola, not bilerping squared_edt(f, d, z, w, srcHeight, dstHeight); + // can write over the same src column from t, it's already offet by x for (int32_t y = 0; y < dstHeight; ++y) { - t[y * dstWidth] = sqrtf(d[y]); // back to distance + t[y * dstWidth] = sqrtf(d[y]); // convert d^2 -> d } } @@ -283,7 +297,6 @@ void heman_distance_create_sdf(const my_image* src, my_image* dst, float& maxD, transform_to_distance(negative, src, dstHeight, false); if (maxD == 0) { - // now find signed distance, and store back into positive array float minV = 0.0f, maxV = 0.0f; for (int32_t y = 0; y < dstHeight; ++y) { diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index 01e105dc..3ff68818 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -1688,6 +1688,7 @@ void kramEncodeUsage(bool showVersion = true) "\t [-swizzle rg01]\n" "\t [-avg rxbx]\n" "\t [-sdf]\n" + "\t [-sdfThreshold 120]\n" "\t [-premul] [-prezero] [-premulrgb]\n" "\t [-gray]\n" "\t [-optopaque]\n" @@ -1767,7 +1768,9 @@ void kramEncodeUsage(bool showVersion = true) "\tNormal map rg storage signed for etc/bc (rg01), only unsigned astc L+A (gggr).\n" "\t-sdf" "\tGenerate single-channel SDF from a bitmap, can mip and drop 
large mips. Encode to r8, bc4, etc2r, astc4x4 (Unorm LLL1) to encode\n" - + "\t-sdfThreshold 120" + "\tSDF generation uses bitmap converted from 8-bit red channel\n" + "\t-gray" "\tConvert to grayscale before premul\n" @@ -2655,6 +2658,21 @@ static int32_t kramAppEncode(vector& args) if (isStringEqual(word, "-sdf")) { infoArgs.doSDF = true; } + else if (isStringEqual(word, "-sdfThreshold")) { + ++i; + if (i >= argc) { + KLOGE("Kram", "sdfThreshold arg invalid"); + error = true; + break; + } + + infoArgs.sdfThreshold = atoi(args[i]); + if (infoArgs.sdfThreshold < 1 || infoArgs.sdfThreshold > 255) { + KLOGE("Kram", "sdfThreshold arg invalid"); + error = true; + break; + } + } else if (isStringEqual(word, "-optopaque")) { infoArgs.optimizeFormatForOpaque = true; } diff --git a/libkram/kram/KramImage.cpp b/libkram/kram/KramImage.cpp index 82c35f44..c0c813f9 100644 --- a/libkram/kram/KramImage.cpp +++ b/libkram/kram/KramImage.cpp @@ -2154,7 +2154,7 @@ bool KramEncoder::createMipsFromChunks( } if (info.doSDF) { - sdfMipper.init(srcImage, info.isVerbose); + sdfMipper.init(srcImage, info.sdfThreshold, info.isVerbose); } else { // copy and convert to half4 or float4 image diff --git a/libkram/kram/KramImageInfo.cpp b/libkram/kram/KramImageInfo.cpp index 9b0e1fdf..0d42132c 100644 --- a/libkram/kram/KramImageInfo.cpp +++ b/libkram/kram/KramImageInfo.cpp @@ -1015,6 +1015,8 @@ void ImageInfo::initWithArgs(const ImageInfoArgs& args) isNormal = args.isNormal; doSDF = args.doSDF; + sdfThreshold = args.sdfThreshold; + //skipImageLength = args.skipImageLength; // mips diff --git a/libkram/kram/KramImageInfo.h b/libkram/kram/KramImageInfo.h index b7d53ca2..1241a1f3 100644 --- a/libkram/kram/KramImageInfo.h +++ b/libkram/kram/KramImageInfo.h @@ -85,6 +85,8 @@ class ImageInfoArgs { int32_t chunksX = 0; int32_t chunksY = 0; int32_t chunksCount = 0; + + int32_t sdfThreshold = 120; }; // preset data that contains all inputs about the encoding @@ -179,6 +181,9 @@ class ImageInfo { int32_t 
chunksX = 0; int32_t chunksY = 0; int32_t chunksCount = 0; + + // This converts incoming image channel to bitmap + int32_t sdfThreshold = 120; }; bool isSwizzleValid(const char* swizzle); diff --git a/libkram/kram/KramSDFMipper.cpp b/libkram/kram/KramSDFMipper.cpp index 2c8d850b..4a8d7845 100644 --- a/libkram/kram/KramSDFMipper.cpp +++ b/libkram/kram/KramSDFMipper.cpp @@ -14,14 +14,15 @@ namespace kram { using namespace heman; using namespace NAMESPACE_STL; -void SDFMipper::init(ImageData& srcImage, bool isVerbose_) +void SDFMipper::init(ImageData& srcImage, uint8_t sdfThreshold, bool isVerbose_) { // this resets maxD, which is determined off first mip generated // all mips are using same source, so distances should be same range to // scale maxD = 0.0; isVerbose = isVerbose_; - + threshold = sdfThreshold; + int32_t w = srcImage.width; int32_t h = srcImage.height; diff --git a/libkram/kram/KramSDFMipper.h b/libkram/kram/KramSDFMipper.h index 4fae0da5..8bae8e4f 100644 --- a/libkram/kram/KramSDFMipper.h +++ b/libkram/kram/KramSDFMipper.h @@ -22,7 +22,7 @@ struct my_image { class SDFMipper { public: - void init(ImageData& srcImage, bool isVerbose = false); + void init(ImageData& srcImage, uint8_t sdfThreshold, bool isVerbose = false); void mipmap(ImageData& dstImage, int32_t mipLevel); private: From 1a6291ca92fda506718f53a00a08c2e8c50f2b74 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Wed, 27 Jul 2022 17:04:01 -0700 Subject: [PATCH 053/615] kram - fix numSkippedMips on sdf This can export mips without building from the larger mips --- libkram/kram/KramImage.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libkram/kram/KramImage.cpp b/libkram/kram/KramImage.cpp index c0c813f9..4d6db30c 100644 --- a/libkram/kram/KramImage.cpp +++ b/libkram/kram/KramImage.cpp @@ -2197,10 +2197,9 @@ bool KramEncoder::createMipsFromChunks( } else { if (info.doSDF) { - // have to process all images to SDF // sdf mipper has to build from origin sourceImage // but it 
can in-place write to the same dstImage - sdfMipper.mipmap(dstImageData, mipLevel); + sdfMipper.mipmap(dstImageData, mipLevel + numSkippedMips); w = dstImageData.width; h = dstImageData.height; From 796f511f997d2213a738c9d95b6d266c5eca099a Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Wed, 27 Jul 2022 17:28:27 -0700 Subject: [PATCH 054/615] kram/v - bump to macOS 11.0 --- build2/kramc.xcodeproj/project.pbxproj | 4 ++-- build2/kramv.xcodeproj/project.pbxproj | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/build2/kramc.xcodeproj/project.pbxproj b/build2/kramc.xcodeproj/project.pbxproj index e73338b2..72bd473b 100644 --- a/build2/kramc.xcodeproj/project.pbxproj +++ b/build2/kramc.xcodeproj/project.pbxproj @@ -281,7 +281,7 @@ GCC_WARN_SHADOW = YES; GCC_WARN_STRICT_SELECTOR_MATCH = YES; HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_NAME = kram; }; name = Debug; @@ -303,7 +303,7 @@ GCC_WARN_SHADOW = YES; GCC_WARN_STRICT_SELECTOR_MATCH = YES; HEADER_SEARCH_PATHS = "$(PROJECT_DIR)/../libkram/kram"; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_NAME = kram; }; name = Release; diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index cad5656d..f170f89c 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -663,7 +663,7 @@ "$(inherited)", "@executable_path/../Frameworks", ); - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_BUNDLE_IDENTIFIER = com.ba.kramv; PRODUCT_NAME = "$(TARGET_NAME)"; }; @@ -696,7 +696,7 @@ "$(inherited)", "@executable_path/../Frameworks", ); - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_BUNDLE_IDENTIFIER = com.ba.kramv; PRODUCT_NAME = "$(TARGET_NAME)"; }; @@ -714,7 +714,7 @@ "@executable_path/../Frameworks", "@executable_path/../../../../Frameworks", ); 
- MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_BUNDLE_IDENTIFIER = "com.ba.kramv.kram-thumb"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; @@ -733,7 +733,7 @@ "@executable_path/../Frameworks", "@executable_path/../../../../Frameworks", ); - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_BUNDLE_IDENTIFIER = "com.ba.kramv.kram-thumb"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; @@ -752,7 +752,7 @@ "@executable_path/../Frameworks", "@executable_path/../../../../Frameworks", ); - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_BUNDLE_IDENTIFIER = "com.ba.kramv.kram-preview"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; @@ -771,7 +771,7 @@ "@executable_path/../Frameworks", "@executable_path/../../../../Frameworks", ); - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 11.0; PRODUCT_BUNDLE_IDENTIFIER = "com.ba.kramv.kram-preview"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; From 831607effef82d66c3da3ea0f0a60141fb575894 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Jul 2022 12:09:21 -0700 Subject: [PATCH 055/615] kramv - improve search to find corresponding normal map --- kramv/KramViewerMain.mm | 51 ++++++++++++++++++++++++++++++----------- libkram/kram/Kram.cpp | 21 +++++++++++++---- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 2707dce3..37841dca 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2821,20 +2821,22 @@ - (BOOL)findFilenameInFolders:(const string &)filename -static string findNormalMapFromAlbedoFilename(const char* filename) +static void findPossibleNormalMapFromAlbedoFilename(const char* filename, vector& normalFilenames) { + normalFilenames.clear(); + string filenameShort = filename; const char* ext = strrchr(filename, '.'); auto dotPos = filenameShort.find_last_of("."); if (dotPos == string::npos) - return ""; + 
return; // now chop off the extension filenameShort = filenameShort.substr(0, dotPos); - const char* searches[] = { "-a", "-d" }; + const char* searches[] = { "-a", "-d", "_Color", "_baseColor" }; for (uint32_t i = 0; i < ArrayCount(searches); ++i) { const char* search = searches[i]; @@ -2844,11 +2846,20 @@ static string findNormalMapFromAlbedoFilename(const char* filename) } } - // may need to try various names, and see if any exist - filenameShort += "-n"; - filenameShort += ext; + const char* suffixes[] = { "-n", "_normal", "_Normal" }; + + string normalFilename; + for (uint32_t i = 0; i < ArrayCount(suffixes); ++i) { + const char* suffix = suffixes[i]; + + // may need to try various names, and see if any exist + normalFilename = filenameShort; + normalFilename += suffix; + normalFilename += ext; + + normalFilenames.push_back( normalFilename ); + } - return filenameShort; } @@ -2868,15 +2879,22 @@ - (BOOL)loadFileFromFolder return NO; } + vector normalFilenames; string normalFilename; bool hasNormal = false; TexContentType texContentType = findContentTypeFromFilename(filename); if (texContentType == TexContentTypeAlbedo) { - normalFilename = findNormalMapFromAlbedoFilename(filename); + findPossibleNormalMapFromAlbedoFilename(filename, normalFilenames); - if (!normalFilename.empty()) - hasNormal = [self findFilenameInFolders:normalFilename]; + for (const auto& name: normalFilenames) { + hasNormal = [self findFilenameInFolders:name]; + + if (hasNormal) { + normalFilename = name; + break; + } + } } //------------------------------- @@ -2986,15 +3004,20 @@ - (BOOL)loadFileFromArchive string normalFilename; bool hasNormal = false; - + vector normalFilenames; TexContentType texContentType = findContentTypeFromFilename(filename); if (texContentType == TexContentTypeAlbedo) { - normalFilename = findNormalMapFromAlbedoFilename(filename); + findPossibleNormalMapFromAlbedoFilename(filename, normalFilenames); - if (!normalFilename.empty()) - hasNormal = 
_zip.extractRaw(normalFilename.c_str(), &imageNormalData, + for (const auto& name: normalFilenames) { + hasNormal = _zip.extractRaw(name.c_str(), &imageNormalData, imageNormalDataLength); + if (hasNormal) { + normalFilename = name; + break; + } + } } //--------------------------- diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index 3ff68818..5d8c82f3 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -3610,19 +3610,30 @@ TexContentType findContentTypeFromFilename(const char* filename) filenameShort = filenameShort.substr(0, dotPos); // dealing with png means fabricating the format, texture type, and other data - if (endsWith(filenameShort, "-n") || endsWith(filenameShort, "_normal")) { - return TexContentTypeNormal; - } - else if (endsWith(filenameShort, "-sdf")) { + if (endsWith(filenameShort, "-sdf")) { return TexContentTypeSDF; } else if (endsWith(filenameShort, "-h")) { return TexContentTypeHeight; } - else if (endsWith(filenameShort, "-a") || endsWith(filenameShort, "-d") || endsWith(filenameShort, "_baseColor")) { + else if (endsWith(filenameShort, "-n") || + endsWith(filenameShort, "_normal") || + endsWith(filenameShort, "_Normal") + ) + { + return TexContentTypeNormal; + } + else if (endsWith(filenameShort, "-a") || + endsWith(filenameShort, "-d") || + endsWith(filenameShort, "_baseColor") || + endsWith(filenameShort, "_Color") + ) + { return TexContentTypeAlbedo; } + // TODO: also _AO, _Metallic, _Roughness + // fallback to albedo for now return TexContentTypeAlbedo; } From ea99ecc35d681a4eefa5692de374e4a09e1e7438 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Jul 2022 12:11:01 -0700 Subject: [PATCH 056/615] kram - start of Win Numa support Chris Green posted about processor groups on Win to go beyond 64 cores. For a rare set of multicore Threadripper devs, but might as well do it right. Commented out, since modifies remap table and other calls. 
--- libkram/kram/TaskSystem.cpp | 84 ++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index 8fa1470b..d2e2fcea 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -473,30 +473,16 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t // for now only allow single core mask uint64_t affinityMask = ((uint64_t)1) << threadIndex; - + // These are used in most of the paths macroUnusedVar(handle); macroUnusedVar(affinityMask); -#if KRAM_MAC - // don't use this, it's unsupported on ARM chips, and only affinity hints on x64 -// #if USE_SSE -// if (!coreInfo.isTranslated) { -// thread_affinity_policy_data_t policy = { (int)affinityMask }; -// -// // TODO: consider skipping affinity on macOS altogether -// // this is just a hint on x64-based macOS -// int returnVal = thread_policy_set(pthread_mach_thread_np(handle), THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); -// -// if (returnVal != 0) { -// // TODO: unsupported on iOS/M1, only have QoS and priority -// // big P cores can also be disabled to resolve thermal throttling -// } -// } -// #endif + bool success = false; -#elif KRAM_IOS - // no support +#if KRAM_MAC || KRAM_IOS + // no support, don't use thread_policy_set it's not on M1 and just a hint + success = true; #elif KRAM_ANDROID cpu_set_t cpuset; @@ -506,16 +492,57 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t // convert pthread to pid pid_t pid; pthread_getunique_np(handle, &pid); - if (!sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset)) { - // TODO: this can fail on some/all cores - } - + success = sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset) == 0; + #elif KRAM_WIN // each processor group only has 64 bits DWORD_PTR mask = SetThreadAffinityMask(handle, *(const DWORD_PTR*)&affinityMask); - if (mask == 0) { - // TODO: failure case + success = mask != 0; + 
+#if 0 // TODO: finish this + // Revisit Numa groups on Win, have 128-core/256 ThreadRipper + // https://chrisgreendevelopmentblog.wordpress.com/2017/08/29/thread-pools-and-windows-processor-groups/ + + // win thread pool, but seems to limit to group 0 + // https://github.com/stlab/libraries/blob/develop/stlab/concurrency/default_executor.hpp + + int32_t threadIndexToGroup(int32_t threadIndex) + { + for (int32_t i = 0; i < nNumGroups; i++) + { + if (threadIndex < totalCores[i]) + return i; + } + return 0; // error + } + + void setupWinCoreGroups() + { + // Also have to test for HT on these, and fix remap table. + // Table will need to be larger to accomodate. + + int32_t nNumGroups = GetActiveProcessorGroupCount(); + int32_t numCores[16] = {}; // TODO: make members + int32_t totalCores[16] = 0; + for (int32_t i = 0; i < nNumGroups; i++) + { + numCores[i] = GetMaximumProcessorCount(i); + totalCores[i] += numCores[i]; + } } + + // have to adjust the mask for the core group + int32_t groupNum = threadIndexToGroup(threadIndex); + int32_t groupThreadIndex = (groupNum == 0) ? 
0 : totalCores[groupNum-1]; + affinityMask = ((uint64_t)1) << (threadIndex - groupThreadIndex); + + // set group and affinity + GROUP_AFFINITY affinity; + affinity.group = groupNum; + affinity.mask = *(const DWORD_PTR*)&affinityMask; + success = SetThreadGroupAffinity(hndl, &affinity, nullptr); +#endif + #else // most systems are pthread-based, this is represented with array of bits cpu_set_t cpuset; @@ -523,11 +550,10 @@ static void setThreadAffinity(std::thread::native_handle_type handle, uint32_t t CPU_SET(threadIndex, &cpuset); // TODO: check return - int returnVal = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset); - if (returnVal != 0) { - // TODO: linux pthread failure case - } + success = pthread_setaffinity_np(handle, sizeof(cpu_set_t), &cpuset) == 0; #endif + if (!success) + KLOGW("Thread", "Failed to set affinity"); } void task_system::set_current_affinity(uint32_t threadIndex) From 1f1ff8e0fc89ebf87e45944801701f85b8a87091 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Jul 2022 13:12:11 -0700 Subject: [PATCH 057/615] kramv - fix embedded jpg identifier so more gltf load --- gtlf/GLTFMTL/Source/GLTFMTLRenderer.m | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m b/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m index 4d388b0b..98e0cf2f 100644 --- a/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m +++ b/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m @@ -201,10 +201,11 @@ - (void)enqueueReusableBuffer:(id)buffer { texture.label = image.name ? 
image.name : image.url.lastPathComponent; } else if (image.bufferView != nil) { GLTFBufferView *bufferView = image.bufferView; - NSData *data = [NSData dataWithBytesNoCopy:bufferView.buffer.contents + bufferView.offset length:bufferView.length freeWhenDone:NO]; + const uint8_t* buffer = bufferView.buffer.contents + bufferView.offset; + NSData *data = [NSData dataWithBytesNoCopy:buffer length:bufferView.length freeWhenDone:NO]; - // TODO: identify jpg data by first 4 chars, hande with textureLoaderJpb - bool isJpg = false; + // identify jpg data by first 3 chars, handle with textureLoaderJpb + bool isJpg = buffer[0] == 0xFF && buffer[1] == 0xD8 && buffer[2] == 0xFF; if (isJpg) texture = [self.textureLoaderJpg newTextureWithData:data options:options error:&error]; From 50cc9666aeb22ab112bbdc1216a7cd5098f7296d Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 30 Jul 2022 23:50:01 -0700 Subject: [PATCH 058/615] kramv - add more types of textures --- gtlf/GLTFMTL/Source/GLTFMTLRenderer.m | 2 +- libkram/kram/Kram.cpp | 16 ++++++++++++++-- libkram/kram/Kram.h | 4 +++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m b/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m index 98e0cf2f..ee7bc935 100644 --- a/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m +++ b/gtlf/GLTFMTL/Source/GLTFMTLRenderer.m @@ -202,7 +202,7 @@ - (void)enqueueReusableBuffer:(id)buffer { } else if (image.bufferView != nil) { GLTFBufferView *bufferView = image.bufferView; const uint8_t* buffer = bufferView.buffer.contents + bufferView.offset; - NSData *data = [NSData dataWithBytesNoCopy:buffer length:bufferView.length freeWhenDone:NO]; + NSData *data = [NSData dataWithBytesNoCopy:(void*)buffer length:bufferView.length freeWhenDone:NO]; // identify jpg data by first 3 chars, handle with textureLoaderJpb bool isJpg = buffer[0] == 0xFF && buffer[1] == 0xD8 && buffer[2] == 0xFF; diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index 5d8c82f3..fc467a3b 100644 --- 
a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -3631,8 +3631,20 @@ TexContentType findContentTypeFromFilename(const char* filename) { return TexContentTypeAlbedo; } - - // TODO: also _AO, _Metallic, _Roughness + else if (endsWith(filenameShort, "-ao") || + endsWith(filenameShort, "_AO") + ) + { + return TexContentTypeAO; + } + else if (endsWith(filenameShort, "-mr") || + endsWith(filenameShort, "_Metallic") || + endsWith(filenameShort, "_Roughness") || + endsWith(filenameShort, "_MetaliicRoughness") + ) + { + return TexContentTypeMetallicRoughness; + } // fallback to albedo for now return TexContentTypeAlbedo; diff --git a/libkram/kram/Kram.h b/libkram/kram/Kram.h index 62264a13..b43ce912 100644 --- a/libkram/kram/Kram.h +++ b/libkram/kram/Kram.h @@ -80,7 +80,9 @@ enum TexContentType TexContentTypeAlbedo, TexContentTypeNormal, TexContentTypeHeight, - TexContentTypeSDF + TexContentTypeSDF, + TexContentTypeAO, + TexContentTypeMetallicRoughness, }; // this is a helper to override the format, since sRGB blocks and settings From 6a4d6e2ce990582d329fa5b2a4522953fa78be32 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Tue, 2 Aug 2022 14:25:37 -0700 Subject: [PATCH 059/615] kramv - split out isPremul from doShaderPremul, also add title info for review Can now walk through a folder of images and see if they have premul enabled without having to bring up info. 
--- kramv/KramRenderer.mm | 7 ++++--- kramv/KramViewerBase.h | 3 ++- kramv/KramViewerMain.mm | 25 ++++++++++++++++++++++--- kramv/Shaders/KramShaders.h | 2 +- kramv/Shaders/KramShaders.metal | 6 +++--- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/kramv/KramRenderer.mm b/kramv/KramRenderer.mm index 40cbd404..e37ae3ba 100644 --- a/kramv/KramRenderer.mm +++ b/kramv/KramRenderer.mm @@ -1282,9 +1282,10 @@ - (void)updateImageSettings:(const string &)fullFilename // should really have 3 modes, unmul, default, premul bool isPNG = isPNGFilename(fullFilename.c_str()); - _showSettings->isPremul = false; + _showSettings->isPremul = image.isPremul(); + _showSettings->doShaderPremul = false; if (texContentType == TexContentTypeAlbedo && isPNG) { - _showSettings->isPremul = + _showSettings->doShaderPremul = true; // convert to premul in shader, so can see other channels } @@ -1457,7 +1458,7 @@ - (void)_updateGameState *(Uniforms *)_dynamicUniformBuffer[_uniformBufferIndex].contents; uniforms.isNormal = _showSettings->texContentType == TexContentTypeNormal; - uniforms.isPremul = _showSettings->isPremul; + uniforms.doShaderPremul = _showSettings->doShaderPremul; uniforms.isSigned = _showSettings->isSigned; uniforms.isSwizzleAGToRG = _showSettings->isSwizzleAGToRG; diff --git a/kramv/KramViewerBase.h b/kramv/KramViewerBase.h index 146c01c0..a06e85e2 100644 --- a/kramv/KramViewerBase.h +++ b/kramv/KramViewerBase.h @@ -123,7 +123,8 @@ class ShowSettings { //bool isNormal = false; bool isSigned = false; - bool isPremul = false; // needed for png which only holds unmul + bool isPremul = false; // copy of whether image.isPremul() + bool doShaderPremul = false; // needed for png which only holds unmul bool isSwizzleAGToRG = false; //bool isSDF = false; TexContentType texContentType = TexContentTypeUnknown; diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 37841dca..4c57d38e 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -1906,7 
+1906,7 @@ - (void)updateUIControlState Renderer* renderer = (Renderer*)self.delegate; auto showAllState = toState(_showSettings->isShowingAllLevelsAndMips); - auto premulState = toState(_showSettings->isPremul); + auto premulState = toState(_showSettings->doShaderPremul); auto signedState = toState(_showSettings->isSigned); auto checkerboardState = toState(_showSettings->isCheckerboardShown); auto previewState = toState(_showSettings->isPreview); @@ -2434,10 +2434,10 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD // toggle premul alpha vs. unmul else if (action == _actionPremul) { if (!action->isHidden) { - _showSettings->isPremul = !_showSettings->isPremul; + _showSettings->doShaderPremul = !_showSettings->doShaderPremul; isChanged = true; text = "Premul "; - text += _showSettings->isPremul ? "On" : "Off"; + text += _showSettings->doShaderPremul ? "On" : "Off"; } } @@ -2949,6 +2949,25 @@ - (BOOL)loadFileFromFolder string title = "kramv - "; title += formatTypeName(_showSettings->originalFormat); title += " - "; + + // identify what we think the content type is + const char* typeText = ""; + switch(_showSettings->texContentType) { + case TexContentTypeAlbedo: typeText = "a"; break; + case TexContentTypeNormal: typeText = "n"; break; + case TexContentTypeAO: typeText = "ao"; break; + case TexContentTypeMetallicRoughness: typeText = "mr"; break; + case TexContentTypeSDF: typeText = "sdf"; break; + case TexContentTypeHeight: typeText = "h"; break; + } + title += typeText; + // add some info about the texture to avoid needing to go to info + // srgb src would be useful too. 
+ if (_showSettings->texContentType == TexContentTypeAlbedo && _showSettings->isPremul) { + title += ",p"; + + } + title += " - "; title += filenameShort; self.window.title = [NSString stringWithUTF8String:title.c_str()]; diff --git a/kramv/Shaders/KramShaders.h b/kramv/Shaders/KramShaders.h index 6e337c20..282a8d57 100644 --- a/kramv/Shaders/KramShaders.h +++ b/kramv/Shaders/KramShaders.h @@ -118,7 +118,7 @@ struct Uniforms { bool isSigned; bool isNormal; bool isSwizzleAGToRG; - bool isPremul; + bool doShaderPremul; bool isCheckerboardShown; bool isWrap; diff --git a/kramv/Shaders/KramShaders.metal b/kramv/Shaders/KramShaders.metal index e5301a0c..4388f7ec 100644 --- a/kramv/Shaders/KramShaders.metal +++ b/kramv/Shaders/KramShaders.metal @@ -1000,7 +1000,7 @@ float4 DrawPixels( } // to premul, but also need to see without premul - if (uniforms.isPremul) { + if (uniforms.doShaderPremul) { c = toPremul(c); } } @@ -1050,7 +1050,7 @@ float4 DrawPixels( if (uniforms.isSigned) { // Note: premul on signed should occur while still signed, since it's a pull to zoer // to premul, but also need to see without premul - if (uniforms.isPremul) { + if (uniforms.doShaderPremul) { c = toPremul(c); } @@ -1058,7 +1058,7 @@ float4 DrawPixels( c.xyz = toUnorm(c.xyz); } else { - if (uniforms.isPremul) { + if (uniforms.doShaderPremul) { c = toPremul(c); } } From c0f7986ac6b925d9dcc75c61d8abefe25179a985 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sat, 6 Aug 2022 21:43:47 -0700 Subject: [PATCH 060/615] kram - fix warning --- kramv/KramViewerMain.mm | 1 + 1 file changed, 1 insertion(+) diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 4c57d38e..3412fad9 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2959,6 +2959,7 @@ - (BOOL)loadFileFromFolder case TexContentTypeMetallicRoughness: typeText = "mr"; break; case TexContentTypeSDF: typeText = "sdf"; break; case TexContentTypeHeight: typeText = "h"; break; + case TexContentTypeUnknown: typeText = 
""; break; } title += typeText; // add some info about the texture to avoid needing to go to info From f80324870546cf612b0703b1b845c53b783e2371 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Mon, 8 Aug 2022 00:53:23 -0700 Subject: [PATCH 061/615] kram - switch to fastl (MIT), and enable pch Xcode doesn't seem to precompile the pch despite my best efforts. It does prepend it to each file. So -include KramConfig.h is now gone from the c flags. fastl - this is a nice minimal stl from https://github.com/Viladoman/fastl. Bases everything on vector, so string, map/set and unorderd_map/set are minimal. Had to use algorithm, new, mutex, and string from std, so it's a hybrid. Once remaining holes are fixed, then can enable string. --- build2/kram.xcodeproj/project.pbxproj | 78 ++++- build2/kramv.xcodeproj/project.pbxproj | 2 + kram-thumb/KramThumbnailProvider.mm | 8 +- kramv/KramViewerMain.mm | 10 +- libkram/etc2comp/EtcImage.cpp | 2 +- libkram/fastl/LICENSE | 21 ++ libkram/fastl/falgorithm.h | 99 +++++++ libkram/fastl/fstring.h | 211 ++++++++++++++ libkram/fastl/map.h | 122 ++++++++ libkram/fastl/pair.h | 60 ++++ libkram/fastl/set.h | 108 +++++++ libkram/fastl/unordered_map.h | 28 ++ libkram/fastl/unordered_set.h | 28 ++ libkram/fastl/vector.h | 383 +++++++++++++++++++++++++ libkram/kram/KTXImage.cpp | 18 +- libkram/kram/KTXImage.h | 2 +- libkram/kram/Kram.cpp | 61 ++-- libkram/kram/KramConfig.h | 65 ++++- libkram/kram/KramDDSHelper.h | 2 +- libkram/kram/KramFileHelper.cpp | 4 +- libkram/kram/KramFileHelper.h | 2 +- libkram/kram/KramImage.cpp | 2 +- libkram/kram/KramImage.h | 2 +- libkram/kram/KramImageInfo.h | 2 +- libkram/kram/KramLib.h | 2 +- libkram/kram/KramLog.cpp | 11 +- libkram/kram/KramLog.h | 2 +- libkram/kram/KramMipper.h | 2 +- libkram/kram/KramMmapHelper.h | 2 +- libkram/kram/KramSDFMipper.h | 2 +- libkram/kram/KramTimer.h | 2 +- libkram/kram/KramZipHelper.cpp | 2 +- libkram/kram/KramZipHelper.h | 2 +- libkram/kram/TaskSystem.cpp | 38 +-- 
libkram/kram/TaskSystem.h | 15 +- libkram/kram/float4a.h | 2 +- libkram/lodepng/lodepng.cpp | 4 +- 37 files changed, 1273 insertions(+), 133 deletions(-) create mode 100644 libkram/fastl/LICENSE create mode 100644 libkram/fastl/falgorithm.h create mode 100644 libkram/fastl/fstring.h create mode 100644 libkram/fastl/map.h create mode 100644 libkram/fastl/pair.h create mode 100644 libkram/fastl/set.h create mode 100644 libkram/fastl/unordered_map.h create mode 100644 libkram/fastl/unordered_set.h create mode 100644 libkram/fastl/vector.h diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index 4e3a0c5a..e672e339 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -7,6 +7,22 @@ objects = { /* Begin PBXBuildFile section */ + 704738BC289F6AEE00C77A9F /* unordered_map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B1289F6AEE00C77A9F /* unordered_map.h */; }; + 704738BD289F6AEE00C77A9F /* unordered_map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B1289F6AEE00C77A9F /* unordered_map.h */; }; + 704738BE289F6AEE00C77A9F /* falgorithm.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B2289F6AEE00C77A9F /* falgorithm.h */; }; + 704738BF289F6AEE00C77A9F /* falgorithm.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B2289F6AEE00C77A9F /* falgorithm.h */; }; + 704738C0289F6AEE00C77A9F /* map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B3289F6AEE00C77A9F /* map.h */; }; + 704738C1289F6AEE00C77A9F /* map.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B3289F6AEE00C77A9F /* map.h */; }; + 704738C2289F6AEE00C77A9F /* pair.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B4289F6AEE00C77A9F /* pair.h */; }; + 704738C3289F6AEE00C77A9F /* pair.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B4289F6AEE00C77A9F /* pair.h */; }; + 704738C6289F6AEE00C77A9F /* unordered_set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B6289F6AEE00C77A9F /* 
unordered_set.h */; }; + 704738C7289F6AEE00C77A9F /* unordered_set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B6289F6AEE00C77A9F /* unordered_set.h */; }; + 704738C8289F6AEE00C77A9F /* vector.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B7289F6AEE00C77A9F /* vector.h */; }; + 704738C9289F6AEE00C77A9F /* vector.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B7289F6AEE00C77A9F /* vector.h */; }; + 704738CA289F6AEE00C77A9F /* set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B8289F6AEE00C77A9F /* set.h */; }; + 704738CB289F6AEE00C77A9F /* set.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B8289F6AEE00C77A9F /* set.h */; }; + 704738CC289F6AEE00C77A9F /* fstring.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B9289F6AEE00C77A9F /* fstring.h */; }; + 704738CD289F6AEE00C77A9F /* fstring.h in Headers */ = {isa = PBXBuildFile; fileRef = 704738B9289F6AEE00C77A9F /* fstring.h */; }; 706EEF7F26D1595D001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */; }; 706EEF8026D1595D001C950E /* EtcImage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAC26D1583E001C950E /* EtcImage.cpp */; }; 706EEF8126D1595D001C950E /* EtcDifferentialTrys.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EEDAF26D1583E001C950E /* EtcDifferentialTrys.cpp */; }; @@ -345,6 +361,8 @@ 70A7BD3127092A1200DBCCF7 /* hdr_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */; }; 70A7BD3227092A1200DBCCF7 /* hdr_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */; }; 70A7BD3327092A1200DBCCF7 /* hdr_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */; }; + 70C6398D289FB234006E7422 /* KramPrefix.pch in Headers */ = {isa = PBXBuildFile; fileRef = 70C6398C289FB234006E7422 /* KramPrefix.pch */; }; + 
70C6398E289FB234006E7422 /* KramPrefix.pch in Headers */ = {isa = PBXBuildFile; fileRef = 70C6398C289FB234006E7422 /* KramPrefix.pch */; }; 70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; }; 70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */; }; 70CDB65227A1382700A546C1 /* KramDDSHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */; }; @@ -352,6 +370,14 @@ /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + 704738B1289F6AEE00C77A9F /* unordered_map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unordered_map.h; sourceTree = ""; }; + 704738B2289F6AEE00C77A9F /* falgorithm.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = falgorithm.h; sourceTree = ""; }; + 704738B3289F6AEE00C77A9F /* map.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = map.h; sourceTree = ""; }; + 704738B4289F6AEE00C77A9F /* pair.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pair.h; sourceTree = ""; }; + 704738B6289F6AEE00C77A9F /* unordered_set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unordered_set.h; sourceTree = ""; }; + 704738B7289F6AEE00C77A9F /* vector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vector.h; sourceTree = ""; }; + 704738B8289F6AEE00C77A9F /* set.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = set.h; sourceTree = ""; }; + 704738B9289F6AEE00C77A9F /* fstring.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fstring.h; sourceTree = ""; }; 706ECDDE26D1577A001C950E /* libkram.a 
*/ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkram.a; sourceTree = BUILT_PRODUCTS_DIR; }; 706EEDAA26D1583E001C950E /* EtcBlock4x4Encoding_RGB8.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EtcBlock4x4Encoding_RGB8.cpp; sourceTree = ""; }; 706EEDAB26D1583E001C950E /* EtcErrorMetric.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = EtcErrorMetric.h; sourceTree = ""; }; @@ -666,6 +692,7 @@ 708A6A922708CE4700BA5410 /* bc6h_utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc6h_utils.h; sourceTree = ""; }; 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hdr_encode.cpp; sourceTree = ""; }; 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hdr_encode.h; sourceTree = ""; }; + 70C6398C289FB234006E7422 /* KramPrefix.pch */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramPrefix.pch; sourceTree = ""; }; 70CDB64E27A1382600A546C1 /* KramDDSHelper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramDDSHelper.h; sourceTree = ""; }; 70CDB64F27A1382600A546C1 /* KramDDSHelper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = KramDDSHelper.cpp; sourceTree = ""; }; /* End PBXFileReference section */ @@ -690,6 +717,21 @@ /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ + 704738AF289F6AEE00C77A9F /* fastl */ = { + isa = PBXGroup; + children = ( + 704738B2289F6AEE00C77A9F /* falgorithm.h */, + 704738B3289F6AEE00C77A9F /* map.h */, + 704738B4289F6AEE00C77A9F /* pair.h */, + 704738B1289F6AEE00C77A9F /* unordered_map.h */, + 704738B6289F6AEE00C77A9F /* unordered_set.h */, + 704738B7289F6AEE00C77A9F /* vector.h */, 
+ 704738B8289F6AEE00C77A9F /* set.h */, + 704738B9289F6AEE00C77A9F /* fstring.h */, + ); + path = fastl; + sourceTree = ""; + }; 706ECDD526D1577A001C950E = { isa = PBXGroup; children = ( @@ -713,6 +755,7 @@ children = ( 708A6A882708CE4700BA5410 /* compressonator */, 706EFC3E26D3473F001C950E /* eastl */, + 704738AF289F6AEE00C77A9F /* fastl */, 706EEDA926D1583E001C950E /* etc2comp */, 706EEDC926D1583E001C950E /* bc7enc */, 706EEDD226D1583E001C950E /* astc-encoder */, @@ -918,6 +961,7 @@ 706EEE1A26D1583F001C950E /* KramTimer.cpp */, 706EEE3326D1583F001C950E /* KramMmapHelper.h */, 706EEE2C26D1583F001C950E /* KramMmapHelper.cpp */, + 70C6398C289FB234006E7422 /* KramPrefix.pch */, 706EEE2E26D1583F001C950E /* Kram.h */, 706EEE3526D1583F001C950E /* Kram.cpp */, 706EEE3626D1583F001C950E /* KramFileHelper.h */, @@ -1230,6 +1274,7 @@ files = ( 706EEFD126D15984001C950E /* EtcErrorMetric.h in Headers */, 706EEFD226D15984001C950E /* EtcColor.h in Headers */, + 70C6398D289FB234006E7422 /* KramPrefix.pch in Headers */, 706EEFD326D15984001C950E /* EtcDifferentialTrys.h in Headers */, 706EEFD426D15984001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */, 706EEFD526D15984001C950E /* EtcConfig.h in Headers */, @@ -1239,6 +1284,7 @@ 706EEFD826D15984001C950E /* EtcMath.h in Headers */, 706EEFD926D15984001C950E /* EtcIndividualTrys.h in Headers */, 706EEFDA26D15984001C950E /* EtcBlock4x4EncodingBits.h in Headers */, + 704738BE289F6AEE00C77A9F /* falgorithm.h in Headers */, 706EEFDB26D15984001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */, 706EEFDC26D15984001C950E /* EtcBlock4x4.h in Headers */, 707789E72881BA81008A51BC /* rgbcx.h in Headers */, @@ -1260,6 +1306,7 @@ 706EEFF526D15985001C950E /* basisu_containers_impl.h in Headers */, 707789EB2881BA81008A51BC /* utils.h in Headers */, 706EEFF626D15985001C950E /* basisu_transcoder_internal.h in Headers */, + 704738C0289F6AEE00C77A9F /* map.h in Headers */, 70871DF927DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */, 
70871DFB27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */, 706EEFF726D15985001C950E /* basisu_global_selector_cb.h in Headers */, @@ -1270,6 +1317,8 @@ 706EEFFB26D15985001C950E /* basisu_file_headers.h in Headers */, 706EEFFC26D15985001C950E /* miniz.h in Headers */, 706EEFFD26D15985001C950E /* hedistance.h in Headers */, + 704738BC289F6AEE00C77A9F /* unordered_map.h in Headers */, + 704738C2289F6AEE00C77A9F /* pair.h in Headers */, 706EEFFE26D15985001C950E /* stb_rect_pack.h in Headers */, 706EEFFF26D15985001C950E /* KramZipHelper.h in Headers */, 706EF00026D15985001C950E /* KramSDFMipper.h in Headers */, @@ -1284,6 +1333,7 @@ 70871DDD27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, 707789D92881BA81008A51BC /* bc7decomp.h in Headers */, 706EF00826D15985001C950E /* Kram.h in Headers */, + 704738C8289F6AEE00C77A9F /* vector.h in Headers */, 70871DED27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */, 707789DB2881BA81008A51BC /* ert.h in Headers */, 706EF00926D15985001C950E /* KTXImage.h in Headers */, @@ -1291,6 +1341,7 @@ 707789DF2881BA81008A51BC /* rgbcx_table4.h in Headers */, 70871DF727DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */, 706EF00B26D15985001C950E /* KramTimer.h in Headers */, + 704738C6289F6AEE00C77A9F /* unordered_set.h in Headers */, 706EF00C26D15985001C950E /* KramMmapHelper.h in Headers */, 706EF00D26D15985001C950E /* float4a.h in Headers */, 706EF00E26D15985001C950E /* KramFileHelper.h in Headers */, @@ -1305,12 +1356,14 @@ 706EF01526D15985001C950E /* singlecolourfit.h in Headers */, 706EF01626D15985001C950E /* maths.h in Headers */, 707789F32881BCE2008A51BC /* rdo_bc_encoder.h in Headers */, + 704738CA289F6AEE00C77A9F /* set.h in Headers */, 706EF01726D15985001C950E /* colourset.h in Headers */, 708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */, 706EF01826D15985001C950E /* colourblock.h in Headers */, 706EF01926D15985001C950E /* rangefit.h in Headers */, 706EF01A26D15985001C950E /* zstd.h in 
Headers */, 70871DF327DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */, + 704738CC289F6AEE00C77A9F /* fstring.h in Headers */, 706EF01B26D15985001C950E /* lodepng.h in Headers */, 706EF01C26D15985001C950E /* tmpfileplus.h in Headers */, ); @@ -1322,6 +1375,7 @@ files = ( 706EF14B26D166C5001C950E /* EtcErrorMetric.h in Headers */, 706EF14C26D166C5001C950E /* EtcColor.h in Headers */, + 70C6398E289FB234006E7422 /* KramPrefix.pch in Headers */, 706EF14D26D166C5001C950E /* EtcDifferentialTrys.h in Headers */, 706EF14E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */, 706EF14F26D166C5001C950E /* EtcConfig.h in Headers */, @@ -1331,6 +1385,7 @@ 706EF15226D166C5001C950E /* EtcMath.h in Headers */, 706EF15326D166C5001C950E /* EtcIndividualTrys.h in Headers */, 706EF15426D166C5001C950E /* EtcBlock4x4EncodingBits.h in Headers */, + 704738BF289F6AEE00C77A9F /* falgorithm.h in Headers */, 706EF15526D166C5001C950E /* EtcBlock4x4Encoding_RGB8A1.h in Headers */, 706EF15626D166C5001C950E /* EtcBlock4x4.h in Headers */, 707789E82881BA81008A51BC /* rgbcx.h in Headers */, @@ -1352,6 +1407,7 @@ 706EF16F26D166C5001C950E /* basisu_containers_impl.h in Headers */, 707789EC2881BA81008A51BC /* utils.h in Headers */, 706EF17026D166C5001C950E /* basisu_transcoder_internal.h in Headers */, + 704738C1289F6AEE00C77A9F /* map.h in Headers */, 70871DFA27DDDBCD00D0B9E1 /* astcenc_vecmathlib_avx2_8.h in Headers */, 70871DFC27DDDBCD00D0B9E1 /* astcenc_vecmathlib_none_4.h in Headers */, 706EF17126D166C5001C950E /* basisu_global_selector_cb.h in Headers */, @@ -1362,6 +1418,8 @@ 706EF17526D166C5001C950E /* basisu_file_headers.h in Headers */, 706EF17626D166C5001C950E /* miniz.h in Headers */, 706EF17726D166C5001C950E /* hedistance.h in Headers */, + 704738BD289F6AEE00C77A9F /* unordered_map.h in Headers */, + 704738C3289F6AEE00C77A9F /* pair.h in Headers */, 706EF17826D166C5001C950E /* stb_rect_pack.h in Headers */, 706EF17926D166C5001C950E /* KramZipHelper.h in Headers */, 
706EF17A26D166C5001C950E /* KramSDFMipper.h in Headers */, @@ -1376,6 +1434,7 @@ 70871DDE27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, 707789DA2881BA81008A51BC /* bc7decomp.h in Headers */, 706EF18226D166C5001C950E /* Kram.h in Headers */, + 704738C9289F6AEE00C77A9F /* vector.h in Headers */, 70871DEE27DDDBCD00D0B9E1 /* astcenc_diagnostic_trace.h in Headers */, 707789DC2881BA81008A51BC /* ert.h in Headers */, 706EF18326D166C5001C950E /* KTXImage.h in Headers */, @@ -1383,6 +1442,7 @@ 707789E02881BA81008A51BC /* rgbcx_table4.h in Headers */, 70871DF827DDDBCD00D0B9E1 /* astcenc_vecmathlib_neon_4.h in Headers */, 706EF18526D166C5001C950E /* KramTimer.h in Headers */, + 704738C7289F6AEE00C77A9F /* unordered_set.h in Headers */, 706EF18626D166C5001C950E /* KramMmapHelper.h in Headers */, 706EF18726D166C5001C950E /* float4a.h in Headers */, 706EF18826D166C5001C950E /* KramFileHelper.h in Headers */, @@ -1397,12 +1457,14 @@ 706EF18F26D166C5001C950E /* singlecolourfit.h in Headers */, 706EF19026D166C5001C950E /* maths.h in Headers */, 707789F42881BCE2008A51BC /* rdo_bc_encoder.h in Headers */, + 704738CB289F6AEE00C77A9F /* set.h in Headers */, 706EF19126D166C5001C950E /* colourset.h in Headers */, 708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */, 706EF19226D166C5001C950E /* colourblock.h in Headers */, 706EF19326D166C5001C950E /* rangefit.h in Headers */, 706EF19426D166C5001C950E /* zstd.h in Headers */, 70871DF427DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */, + 704738CD289F6AEE00C77A9F /* fstring.h in Headers */, 706EF19526D166C5001C950E /* lodepng.h in Headers */, 706EF19626D166C5001C950E /* tmpfileplus.h in Headers */, ); @@ -1712,8 +1774,11 @@ GCC_DYNAMIC_NO_PIC = NO; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; + GCC_INCREASE_PRECOMPILED_HEADER_SHARING = YES; GCC_NO_COMMON_BLOCKS = YES; GCC_OPTIMIZATION_LEVEL = 0; + GCC_PRECOMPILE_PREFIX_HEADER = YES; + GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramPrefix.pch"; 
GCC_PREPROCESSOR_DEFINITIONS = ( "DEBUG=1", "$(inherited)", @@ -1743,10 +1808,9 @@ "-DCOMPILE_SQUISH=1", "-DCOMPILE_BCENC=1", "-DCOMPILE_EASTL=0", + "-DCOMPILE_FASTL=1", "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=1", - "-include", - KramConfig.h, ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; @@ -1798,7 +1862,10 @@ GCC_C_LANGUAGE_STANDARD = gnu11; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; + GCC_INCREASE_PRECOMPILED_HEADER_SHARING = YES; GCC_NO_COMMON_BLOCKS = YES; + GCC_PRECOMPILE_PREFIX_HEADER = YES; + GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramPrefix.pch"; "GCC_WARN_64_TO_32_BIT_CONVERSION[arch=*64]" = NO; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; @@ -1823,10 +1890,9 @@ "-DCOMPILE_SQUISH=1", "-DCOMPILE_BCENC=1", "-DCOMPILE_EASTL=0", + "-DCOMPILE_FASTL=1", "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=1", - "-include", - KramConfig.h, ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; @@ -1843,7 +1909,6 @@ CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; GCC_PRECOMPILE_PREFIX_HEADER = NO; - GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramConfig.h"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; SYSTEM_HEADER_SEARCH_PATHS = ""; @@ -1859,7 +1924,6 @@ CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; GCC_PRECOMPILE_PREFIX_HEADER = NO; - GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramConfig.h"; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; SYSTEM_HEADER_SEARCH_PATHS = ""; @@ -1875,7 +1939,6 @@ CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; GCC_PRECOMPILE_PREFIX_HEADER = NO; - GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramConfig.h"; IPHONEOS_DEPLOYMENT_TARGET = 14.1; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = iphoneos; @@ -1894,7 +1957,6 @@ CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; GCC_PRECOMPILE_PREFIX_HEADER = NO; - GCC_PREFIX_HEADER = "$(PROJECT_DIR)/../libkram/kram/KramConfig.h"; IPHONEOS_DEPLOYMENT_TARGET = 14.1; 
PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = iphoneos; diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index f170f89c..4e79c2bf 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -562,6 +562,7 @@ MTL_LANGUAGE_REVISION = UseDeploymentTarget; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ( + "-DCOMPILE_FASTL=1", "-DCOMPILE_EASTL=0", "-include", KramConfig.h, @@ -627,6 +628,7 @@ MTL_FAST_MATH = YES; MTL_LANGUAGE_REVISION = UseDeploymentTarget; OTHER_CFLAGS = ( + "-DCOMPILE_FASTL=1", "-DCOMPILE_EASTL=0", "-include", KramConfig.h, diff --git a/kram-thumb/KramThumbnailProvider.mm b/kram-thumb/KramThumbnailProvider.mm index e263cde1..b8dc82f3 100644 --- a/kram-thumb/KramThumbnailProvider.mm +++ b/kram-thumb/KramThumbnailProvider.mm @@ -57,8 +57,8 @@ - (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest *)request complet handler(nil, error); return; } - - shared_ptr imageToPass = make_shared(); + + std::shared_ptr imageToPass = std::make_shared(); TexEncoder decoderType = kTexEncoderUnknown; uint32_t imageWidth, imageHeight; @@ -98,11 +98,11 @@ - (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest *)request complet if (imageAspect >= 1.0f) { requestWidth = contextSize.width; - requestHeight = NAMESPACE_STL::clamp((contextSize.width / imageAspect), 1.0, contextSize.height); + requestHeight = clamp((contextSize.width / imageAspect), 1.0, contextSize.height); } else { - requestWidth = NAMESPACE_STL::clamp((contextSize.height * imageAspect), 1.0, contextSize.width); + requestWidth = clamp((contextSize.height * imageAspect), 1.0, contextSize.width); requestHeight = contextSize.height; } diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index 3412fad9..da8563f6 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -1025,8 +1025,8 @@ - (void)doZoomMath:(float)newZoom newPan:(float2 &)newPan // X bound may need adjusted for ar ? 
// that's in model space (+/0.5f, +/0.5f), so convert to texture space - pixel.x = NAMESPACE_STL::clamp(pixel.x, -0.5f * ar, maxX); - pixel.y = NAMESPACE_STL::clamp(pixel.y, minY, 0.5f); + pixel.x = std::clamp(pixel.x, -0.5f * ar, maxX); + pixel.y = std::clamp(pixel.y, minY, 0.5f); // now that's the point that we want to zoom towards // No checks on this zoom @@ -1737,7 +1737,7 @@ - (void)scrollWheel:(NSEvent *)event float zoom = _zoomGesture.magnification; if (wheelY != 0.0) { wheelY *= 0.01; - wheelY = clamp(wheelY, -0.1, 0.1); + wheelY = std::clamp(wheelY, -0.1, 0.1); zoom *= 1.0 + wheelY; @@ -3211,7 +3211,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url #if USE_EASTL NAMESPACE_STL::quick_sort(files.begin(), files.end()); #else - NAMESPACE_STL::sort(files.begin(), files.end()); + std::sort(files.begin(), files.end()); #endif // replicate archive logic below @@ -3367,7 +3367,7 @@ - (BOOL)loadTextureFromURL:(NSURL *)url if (formerEntry) { // lookup the index in the remapIndices table _fileArchiveIndex = - (uintptr_t)(formerEntry - &_zip.zipEntrys().front()); + (uintptr_t)(formerEntry - &_zip.zipEntrys()[0]); } else { _fileArchiveIndex = 0; diff --git a/libkram/etc2comp/EtcImage.cpp b/libkram/etc2comp/EtcImage.cpp index 77f5a071..d555bc10 100644 --- a/libkram/etc2comp/EtcImage.cpp +++ b/libkram/etc2comp/EtcImage.cpp @@ -459,7 +459,7 @@ namespace Etc } // sorts largest errors to front - NAMESPACE_STL::sort(sortedBlocks.begin(), sortedBlocks.end(), std::greater()); + std::sort(sortedBlocks.begin(), sortedBlocks.end(), std::greater()); // lop off the end of the array where blocks are 0 error or don int counter = 0; diff --git a/libkram/fastl/LICENSE b/libkram/fastl/LICENSE new file mode 100644 index 00000000..9bcd9455 --- /dev/null +++ b/libkram/fastl/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Ramon Viladomat + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the 
"Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/libkram/fastl/falgorithm.h b/libkram/fastl/falgorithm.h new file mode 100644 index 00000000..17f8d055 --- /dev/null +++ b/libkram/fastl/falgorithm.h @@ -0,0 +1,99 @@ +#pragma once + +#ifdef USE_FASTL + +// for size_t +#include "../fastl/vector.h" + +namespace fastl +{ + //------------------------------------------------------------------------------------------ + template + constexpr Iterator find_if(Iterator first, Iterator last, Predicate p) + { + for (; first != last; ++first) + { + if (p(*first)) return first; + } + return last; + } + + //------------------------------------------------------------------------------------------ + template + constexpr Iterator find(Iterator first, Iterator last, const T& value) + { + return find_if(first, last, [=](const T& input) { return input == value; }); + } + + //------------------------------------------------------------------------------------------ + template + Iterator remove_if(Iterator first, Iterator last, Predicate p) + { + first = fastl::find_if(first, last, p); + if (first != last) + { 
+ for(Iterator i = first; ++i != last; ) + { + if (!p(*i)) *first++ = *i; + } + } + return first; + } + + //------------------------------------------------------------------------------------------ + template< class Iterator, class T > + Iterator remove(Iterator first, Iterator last, const T& value) + { + return remove_if(first, last, [=](const T& input) { return input == value; }); + } + + //------------------------------------------------------------------------------------------ + template + Iterator lower_bound(Iterator first, Iterator last, const T& value, Compare comp) + { + //specific implementation for continous memory ( vectors ) + Iterator it; + size_t count = last-first; + while (count > 0) { + it = first; + size_t step = count / 2; + it += step; + if (comp(*it,value)) + { + first = ++it; + count -= step + 1; + } + else + { + count = step; + } + } + return first; + } + + //------------------------------------------------------------------------------------------ + template + Iterator lower_bound(Iterator first, Iterator last, const T& value) + { + return lower_bound(first, last, value, [=](const T& lhs, const T& rhs) { return lhs < rhs; }); + } +} + +#else + +//#include +// +//namespace fastl +//{ +// template constexpr Iterator find(Iterator first, Iterator last, const T& value) { return std::find(first, last, value); } +// template constexpr Iterator find_if(Iterator first, Iterator last, Predicate p) { return std::find_if(first, last, p); } +// +// template inline Iterator remove(Iterator first, Iterator last, const T& value) { return std::remove(first, last, value); } +// template inline Iterator remove_if(Iterator first, Iterator last, Predicate p) { return std::remove_if(first, last, p); } +// +// template Iterator lower_bound(Iterator first, Iterator last, const T& value) { return std::lower_bound(first, last, value); } +// template Iterator lower_bound(Iterator first, Iterator last, const T& value, Compare comp) { return std::lower_bound(first, 
last, value, comp); } +//} + +#endif //USE_FASTL + diff --git a/libkram/fastl/fstring.h b/libkram/fastl/fstring.h new file mode 100644 index 00000000..4d0f4241 --- /dev/null +++ b/libkram/fastl/fstring.h @@ -0,0 +1,211 @@ +#pragma once + +#ifdef USE_FASTL + +#include "../fastl/vector.h" + +namespace fastl +{ + //------------------------------------------------------------------------------------------ + template + size_t ComputeStrLen(const TChar* str) // strlen + { + size_t ret; + for (ret = 0u; str[ret] != '\0';++ret){} + return ret; + } + + //------------------------------------------------------------------------------------------ + template + int ComputeStrCmp(const TChar* a, const TChar* b) // strcmp + { + for (size_t i = 0; ;++i) + { + if (a[i] != b[i]) + return a[i] < b[i] ? -1 : 1; + if (a[i] == '\0') + return 0; + } + } + + //////////////////////////////////////////////////////////////////////////////////////////// + // Build string as a vector + template + class StringImpl + { + private: + typedef vector TData; + public: + typedef TChar value_type; + typedef typename TData::size_type size_type; + + static constexpr size_type npos = -1; + public: + StringImpl(); + StringImpl(const char* input); + StringImpl(const char* input, const size_type length); + + void clear(); + + bool empty() const { return size() == 0u; } + size_type size() const { return m_data.empty() ? 
0 : m_data.size() - 1; } + size_type length() const { return size(); } + + value_type* begin() { return m_data.begin(); } + const value_type* begin() const { return m_data.begin(); } + value_type* end() { return m_data.end() - 1; } + const value_type* end() const { return m_data.end() - 1; } + + const value_type* c_str() const { return m_data.begin(); } + + value_type& operator[](size_type index) { return m_data[index]; } + value_type operator[](size_type index) const { return m_data[index]; } + + StringImpl& erase(size_type index){ m_data.erase(m_data.begin()+index); return *this; } + StringImpl& erase(size_type index, size_type count){ m_data.erase(m_data.begin()+index,m_data.begin()+index+count); return *this; } + + void append( const char* str ); + + StringImpl operator+(const char c); + StringImpl operator+(const char* str); + StringImpl operator+(const StringImpl& str); + + StringImpl& operator += (const char c) { m_data.insert(m_data.end()-1,c); return *this; } + StringImpl& operator += (const char* str) { Append(str,ComputeStrLen(str)); return *this; } + StringImpl& operator += (const StringImpl& str) { Append(str.c_str(), str.size()); return *this; } + + bool operator == (const char* str) const { return ComputeStrCmp(c_str(), str) == 0; } + bool operator != (const char* str) const { return ComputeStrCmp(c_str(), str) != 0; } + bool operator < (const char* str) const { return ComputeStrCmp(c_str(), str) < 0; } + bool operator > (const char* str) const { return ComputeStrCmp(c_str(), str) > 0; } + + bool operator == (const StringImpl& str) const { return *this == str.c_str(); } + bool operator != (const StringImpl& str) const { return *this != str.c_str(); } + bool operator < (const StringImpl& str) const { return *this < str.c_str(); } + bool operator > (const StringImpl& str) const { return *this > str.c_str(); } + + private: + void Append(const char* str, const size_type appendSize); + + private: + TData m_data; + }; + + //Implementation + + 
//------------------------------------------------------------------------------------------ + template + StringImpl::StringImpl() + { + clear(); + } + + //------------------------------------------------------------------------------------------ + template + StringImpl::StringImpl(const char* input) + { + clear(); + Append(input, ComputeStrLen(input)); + } + + //------------------------------------------------------------------------------------------ + template + StringImpl::StringImpl(const char* input, const size_type length) + { + clear(); + Append(input, length); + } + + //------------------------------------------------------------------------------------------ + template + inline void StringImpl::clear() + { + // TODO: this requires an allocate in all ctors + // need small string optimization + m_data.resize(1); + m_data[0] = '\0'; + } + + //------------------------------------------------------------------------------------------ + template + void StringImpl::append( const char* str ) + { + Append(str, ComputeStrLen(str)); + } + + //------------------------------------------------------------------------------------------ + template + StringImpl StringImpl::operator+(const char c) + { + StringImpl ret; + ret.reserve(m_data.size() + 1); + + char cstr[2] = { c, 0 }; + ret.Append(c_str(), size()); + ret.Append(cstr, 1); + return ret; + } + //------------------------------------------------------------------------------------------ + template + StringImpl StringImpl::operator+(const char* str) + { + StringImpl ret; + size_t len = ComputeStrLen(str); + ret.reserve(m_data.size() + len); + + ret.Append(c_str(), size()); + ret.Append(str, len); + return ret; + } + + //------------------------------------------------------------------------------------------ + template + StringImpl StringImpl::operator+(const StringImpl& str) + { + StringImpl ret; + size_t len = str.size(); + ret.reserve(m_data.size() + len); + + ret.Append(c_str(), size()); + ret.Append(str, len); 
+ return ret; + } + + //------------------------------------------------------------------------------------------ + template + void StringImpl::Append(const char* str, const size_type appendSize) + { + size_type writeIndex = size(); + m_data.resize(m_data.size()+appendSize); + for (size_type i = 0; i < appendSize; ++i, ++writeIndex) + { + m_data[writeIndex] = str[i]; + } + m_data.back() = '\0'; + } + + using string = StringImpl; + + // Code above is using char* in many places instead of TChar + // TODO: elim wstring if possible + // using wstring = StringImpl; +} + +#else + +//#include +// +//namespace fastl +//{ +// using string = std::string; +// using wstring = std::wstring; +//} + +#endif //USE_FASTL + +#ifdef FASTL_EXPOSE_PLAIN_ALIAS + +using string = fastl::string; +using wstring = fastl::wstring; + +#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/fastl/map.h b/libkram/fastl/map.h new file mode 100644 index 00000000..3c49f73f --- /dev/null +++ b/libkram/fastl/map.h @@ -0,0 +1,122 @@ +#pragma once + +#ifdef USE_FASTL + +#include "../fastl/vector.h" +#include "../fastl/pair.h" +#include "../fastl/falgorithm.h" + +namespace fastl +{ + //////////////////////////////////////////////////////////////////////////////////////////// + // Build map as a vectorMap + template + class map + { + private: + typedef vector> TData; + + public: + typedef typename TData::iterator iterator; + typedef typename TData::const_iterator const_iterator; + typedef typename TData::value_type value_type; + typedef typename TData::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + + public: + iterator begin() { return m_data.begin(); } + const_iterator begin() const { return m_data.begin(); } + iterator end() { return m_data.end(); } + const_iterator end() const { return m_data.end(); } + + bool empty() const { return m_data.empty(); } + size_type size() const { return m_data.size(); } + + TValue& operator[]( const TKey& key ); + + 
void clear() { m_data.clear(); } + + iterator insert(iterator hint, const value_type& value) { return m_data.insert(hint, value); } + iterator insert(const_iterator hint, const value_type& value) { return m_data.insert(hint, value); } + pair insert( value_type&& value ); + + void erase(iterator it) { m_data.erase(it); } + size_type erase(const TKey& key); + + iterator find( const TKey& key ); + const_iterator find( const TKey& key ) const; + + private: + TData m_data; + }; + + // Implementation + + //------------------------------------------------------------------------------------------ + template TValue& map::operator[]( const TKey& key ) + { + iterator entryIt = fastl::lower_bound(begin(), end(), key, [=](value_type& value, const TKey& key) {return value.first < key; }); + if (entryIt == end() || entryIt->first != key) + { + entryIt = m_data.emplace(entryIt,key,TValue()); + } + + return entryIt->second; + } + + //------------------------------------------------------------------------------------------ + template pair::iterator,bool> map::insert( value_type&& inputValue ) + { + iterator entryIt = fastl::lower_bound(begin(), end(), inputValue, [=](value_type& a, const value_type& b) {return a.first < b.first; }); + if (entryIt == end() || entryIt->first != inputValue.first) + { + entryIt = m_data.emplace(entryIt,move(inputValue)); + return pair(entryIt,true); + } + return pair(entryIt,false); + } + + //------------------------------------------------------------------------------------------ + template typename map::size_type map::erase(const TKey& key) + { + iterator found = find(key); + if (found != end()) + { + erase(found); + } + return size(); + } + + //------------------------------------------------------------------------------------------ + template typename map::iterator map::find( const TKey& key ) + { + iterator found = fastl::lower_bound(begin(), end(), key, [=](value_type& value, const TKey& key) {return value.first < key; }); + return found != 
end() && found->first == key ? found : end(); + } + + //------------------------------------------------------------------------------------------ + template typename map::const_iterator map::find(const TKey& key) const + { + const_iterator found = fastl::lower_bound(begin(), end(), key, [=](const value_type& value, const TKey& key) {return value.first < key; }); + return found != end() && found->first == key ? found : end(); + } + +} + +#else + +//#include +// +//namespace fastl +//{ +// template using map = std::map; +//} + +#endif //USE_FASTL + +#ifdef FASTL_EXPOSE_PLAIN_ALIAS + +template using map = fastl::map; + +#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/fastl/pair.h b/libkram/fastl/pair.h new file mode 100644 index 00000000..3eb68ec0 --- /dev/null +++ b/libkram/fastl/pair.h @@ -0,0 +1,60 @@ +#pragma once + +#ifdef USE_FASTL + +namespace fastl +{ + template + struct pair + { + typedef T1 first_type; + typedef T2 second_type; + + pair() + :first(),second(){} + pair(const T1& _first, const T2& _second) + :first(_first),second(_second) {} + + // added these + pair(const pair& rhs) + :first(rhs.first),second(rhs.second) {} + pair(pair&& rhs) + :first(std::move(rhs.first)),second(std::move(rhs.second)) {} + + pair& operator=(const pair& rhs) + { + first = rhs.first; + second = rhs.second; + return *this; + } + pair& operator=(pair&& rhs) + { + first = std::move(rhs.first); + second = std::move(rhs.second); + return *this; + } + + T1 first; + T2 second; + }; + + template + pair make_pair(const T1& k, const T2& v) { return pair(k,v); } +} + +#else + +//#include +// +//namespace fastl +//{ +// template using pair = std::pair; +//} + +#endif //USE_FASTL + +#ifdef FASTL_EXPOSE_PLAIN_ALIAS + +template using pair = fastl::pair; + +#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/fastl/set.h b/libkram/fastl/set.h new file mode 100644 index 00000000..e334c466 --- /dev/null +++ b/libkram/fastl/set.h @@ -0,0 +1,108 @@ +#pragma once + +//#ifdef USE_FASTL + 
+#include "../fastl/falgorithm.h" +#include "../fastl/pair.h" +#include "../fastl/vector.h" + +namespace fastl +{ + //////////////////////////////////////////////////////////////////////////////////////////// + // Build map as a vectorMap + template + class set + { + private: + typedef vector TData; + + public: + typedef typename TData::iterator iterator; + typedef typename TData::const_iterator const_iterator; + typedef typename TData::value_type value_type; + typedef typename TData::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + + public: + iterator begin() { return m_data.begin(); } + const_iterator begin() const { return m_data.begin(); } + iterator end() { return m_data.end(); } + const_iterator end() const { return m_data.end(); } + + bool empty() const { return m_data.empty(); } + size_type size() const { return m_data.size(); } + + void clear() { m_data.clear(); } + + template< class... Args > pair emplace( Args&&... args ); + + void erase( iterator it ) { m_data.erase( it ); } + size_type erase( const TKey& key ); + + iterator find( const TKey& key ); + const_iterator find( const TKey& key ) const; + + private: + TData m_data; + }; + + // Implementation + + //------------------------------------------------------------------------------------------ + template + template pair::iterator, bool> set::emplace( Args&&... args ) + { + TKey inputValue{ args... }; + iterator entryIt = fastl::lower_bound( begin(), end(), inputValue, [=]( value_type& a, const value_type& b ) {return a < b; } ); + if( entryIt == end() || *entryIt != inputValue ) + { + entryIt = m_data.emplace( entryIt, args... 
); + return pair( entryIt, true ); + } + return pair( entryIt, false ); + } + + //------------------------------------------------------------------------------------------ + template typename set::size_type set::erase( const TKey& key ) + { + iterator found = find( key ); + if( found != end() ) + { + erase( found ); + } + return size(); + } + + //------------------------------------------------------------------------------------------ + template typename set::iterator set::find( const TKey& key ) + { + iterator found = fastl::lower_bound( begin(), end(), key, [=]( const TKey& value, const TKey& key ) {return value < key; } ); + return found != end() && *found == key ? found : end(); + } + + //------------------------------------------------------------------------------------------ + template typename set::const_iterator set::find( const TKey& key ) const + { + const_iterator found = fastl::lower_bound( begin(), end(), key, [=]( const TKey& value, const TKey& key ) {return value < key; } ); + return found != end() && *found == key ? 
found : end(); + } + +} + +//#else +// +//#include +// +//namespace fastl +//{ +// template using set = std::set; +//} +// +//#endif //USE_FASTL +// +//#ifdef FASTL_EXPOSE_PLAIN_ALIAS +// +//template using set = fastl::set; +// +//#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/fastl/unordered_map.h b/libkram/fastl/unordered_map.h new file mode 100644 index 00000000..1b312a1b --- /dev/null +++ b/libkram/fastl/unordered_map.h @@ -0,0 +1,28 @@ +#pragma once + +#ifdef USE_FASTL + +#include "../fastl/map.h" + +namespace fastl +{ + // Build unordered_map as a map + template using unordered_map = fastl::map; +} + +#else + +//#include +// +//namespace fastl +//{ +// template using unordered_map = std::unordered_map; +//} + +#endif //USE_FASTL + +#ifdef FASTL_EXPOSE_PLAIN_ALIAS + +template using unordered_map = fastl::unordered_map; + +#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/fastl/unordered_set.h b/libkram/fastl/unordered_set.h new file mode 100644 index 00000000..f2aed10d --- /dev/null +++ b/libkram/fastl/unordered_set.h @@ -0,0 +1,28 @@ +#pragma once + +#ifdef USE_FASTL + +#include "../fastl/set.h" + +namespace fastl +{ + // Build unordered_map as a map + template using unordered_set = fastl::set; +} + +#else + +//#include +// +//namespace fastl +//{ +// template using unordered_set = std::unordered_set; +//} + +#endif //USE_FASTL + +#ifdef FASTL_EXPOSE_PLAIN_ALIAS + +template using unordered_set = fastl::unordered_set; + +#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/fastl/vector.h b/libkram/fastl/vector.h new file mode 100644 index 00000000..dbc5f4a0 --- /dev/null +++ b/libkram/fastl/vector.h @@ -0,0 +1,383 @@ +#pragma once + +#ifdef USE_FASTL + +#include // for size_t +#include // for placement new +#include // for move + +//Forward declare the placement new in order to avoid #include +//extern void* operator new (size_t size, void* ptr) noexcept; + +namespace fastl +{ + 
//------------------------------------------------------------------------------------------ + //Consider moving this around if needed somewhere else + template struct remove_reference { typedef T type; }; + template struct remove_reference { typedef T type; }; + template struct remove_reference { typedef T type; }; + + // This is ambigous if included + //template typename remove_reference::type&& move(T&& arg) { return static_cast::type&&>(arg); } + + template + struct enable_if {}; + template + struct enable_if { typedef T type; }; + template + using enable_if_t = typename enable_if::type; + + template + void Construct(T* ptr, Args&&... args) { new (ptr) T(std::move(args)...); } + + template + T* CreateBuffer(size_t size){ return (T*) new char[size*sizeof(T)]; } + template + void DestroyBuffer(T* buffer){ delete[] reinterpret_cast(buffer); } + + //////////////////////////////////////////////////////////////////////////////////////////// + template + class vector + { + private: + enum { DEFAULT_CAPACITY_SIZE = 8 }; + public: + typedef T value_type; + typedef size_t size_type; + + typedef T* iterator; + typedef const T* const_iterator; + typedef T& reference; + typedef const T& const_reference; + + public: + vector(); + explicit vector(size_t size); + + //If more than 1 argument is provided we assume that we want to construct the vector with its elements ( using SFINAE - fake initializer list ) + template 1)>* = nullptr> + vector(Args&&... 
args) : m_data(CreateBuffer(sizeof...(Args))), m_size(0u), m_capacity(sizeof...(Args)) + { + (emplace_back(args),...); + } + + vector(const vector& input); + vector(vector&& input); + ~vector(); + + vector& operator = (const vector& t); + vector& operator = (vector&& t); + + reference operator[](size_type index) { return m_data[index]; } + const_reference operator[](size_type index) const { return m_data[index]; } + + size_type size() const{ return m_size; } + size_type capacity() const { return m_capacity; } + + iterator begin() { return m_data; } + const_iterator begin() const { return m_data; } + iterator end() { return m_data+m_size; } + const_iterator end() const { return m_data+m_size; } + reference back() { return m_data[m_size-1]; } + bool empty() const { return m_size == 0u; } + + void reserve(const size_type size); + void resize(const size_type size); + void clear(); + + void push_back(const value_type& value); + iterator insert(iterator it, const value_type& value); + void insert(iterator it, const value_type* begin , const value_type* end) + { + // TODO: fix this isn't fast + while (begin != end) + { + insert(it, *begin); + + ++it; + ++begin; + } + } + + template + iterator emplace(iterator it, Args&&... args); + + template + void emplace_back(Args&&... 
args); + + void pop_back(); + + iterator erase(iterator it); + iterator erase(iterator fromIt,iterator toIt); + + const value_type* data() const { return m_data; } + value_type* data() { return m_data; } + + // TODO: no-op for now, but should release memory + void shrink_to_fit() { } + + private: + void Destroy(); + + private: + value_type* m_data; + size_type m_size; + size_type m_capacity; + }; + + //Implementation + + //------------------------------------------------------------------------------------------ + template + vector::vector() + : m_data(nullptr) + , m_size(0u) + , m_capacity(0u) + { + } + + //------------------------------------------------------------------------------------------ + template + vector::vector(size_t size) + : m_data(CreateBuffer(size)) + , m_size(size) + , m_capacity(size) + { + //Call the default constructor for all preallocated elements + for (size_type i = 0u; i < m_size; ++i) + { + Construct(&m_data[i]); + } + } + + //------------------------------------------------------------------------------------------ + template vector::vector(const vector& input) + : m_data(CreateBuffer(input.m_capacity)) + , m_size(input.m_size) + , m_capacity(input.m_capacity) + { + for (size_t i = 0u; i < m_size; ++i) + { + Construct(&m_data[i]); + m_data[i] = input[i]; + } + } + + //------------------------------------------------------------------------------------------ + template + vector::vector(vector&& input) + : m_data(std::move(input.m_data)) + , m_size(input.m_size) + , m_capacity(input.m_capacity) + { + input.m_data = nullptr; + input.m_size = 0u; + input.m_capacity = 0u; + } + + //------------------------------------------------------------------------------------------ + template + vector::~vector() + { + Destroy(); + } + + //------------------------------------------------------------------------------------------ + template + inline vector& vector::operator= (const vector& input) + { + clear(); + reserve(input.m_capacity); + m_size = 
input.m_size; + for (size_type i = 0u; i < m_size; ++i) + { + Construct(&m_data[i], input[i]); + } + return *this; + } + + //------------------------------------------------------------------------------------------ + template + vector& vector::operator = (vector&& t) + { + if (this != &t) + { + Destroy(); + m_data = std::move(t.m_data); + m_size = t.m_size; + m_capacity = t.m_capacity; + t.m_data = nullptr; + t.m_size = 0u; + t.m_capacity = 0u; + } + return *this; + } + + //------------------------------------------------------------------------------------------ + template + inline void vector::reserve(const size_type size) + { + if (size > m_capacity) + { + m_capacity = size; + T* newData = CreateBuffer(m_capacity); + + for (size_type i = 0u; i < m_size; ++i) + { + Construct(&newData[i], std::move(m_data[i])); + m_data[i].~T(); + } + + DestroyBuffer(m_data); + m_data = newData; + + } + } + + //------------------------------------------------------------------------------------------ + template + inline void vector::resize(const size_type size) + { + reserve(size); + + for (size_type i=size;i(&m_data[i]); + } + + m_size = size; + } + + //------------------------------------------------------------------------------------------ + template + inline void vector::clear() + { + resize(0u); + } + + //------------------------------------------------------------------------------------------ + template + inline void vector::push_back(const value_type& value) + { + emplace(end(), value); + } + + //------------------------------------------------------------------------------------------ + template + inline typename vector::iterator vector::insert(iterator it,const value_type& value) + { + return emplace(it, value); + } + + //------------------------------------------------------------------------------------------ + template template + void vector::emplace_back(Args&&... 
args) + { + emplace(end(),std::move(args)...); + } + + //------------------------------------------------------------------------------------------ + template + inline void vector::pop_back() + { + if (!empty()) + { + resize(m_size-1); + } + } + + //------------------------------------------------------------------------------------------ + template template + typename vector::iterator vector::emplace(iterator it, Args&&... args) + { + const size_type index = it-begin(); + + if (m_size == m_capacity) + { + reserve(m_capacity == 0u? DEFAULT_CAPACITY_SIZE : 2u*m_capacity); + } + + iterator insertIt = begin() + index; //this is important as reserve might move the memory around + iterator endIt = end(); + + if(endIt == insertIt) + { + Construct(insertIt, std::move(args)...); + } + else + { + //Build the new element + Construct(end(), std::move(*(end()-1))); + + //Shift remaining elements + for (iterator i = end()-1; i > insertIt;--i) + { + *i = std::move(*(i-1)); + } + + *insertIt = T(std::move(args)...); + } + + ++m_size; + + return insertIt; + } + + //------------------------------------------------------------------------------------------ + template + inline typename vector::iterator vector::erase(iterator it) + { + return erase(it,it+1); + } + + //------------------------------------------------------------------------------------------ + template + inline typename vector::iterator vector::erase(iterator fromIt, iterator toIt) + { + const size_type rangeSize = toIt-fromIt; + const_iterator batchEndIt = end()-rangeSize; + + for (iterator i = fromIt; i < batchEndIt; ++i) + { + *i = std::move(*(i + rangeSize)); + } + + resize(m_size - rangeSize); + return fromIt; + } + + //------------------------------------------------------------------------------------------ + template + void vector::Destroy() + { + for (size_type i=0u;i +// +//namespace fastl +//{ +// template using vector = std::vector; +//} + +#endif //USE_FASTL + + +#ifdef FASTL_EXPOSE_PLAIN_ALIAS + +template 
using vector = fastl::vector; + +#endif //FASTL_EXPOSE_PLAIN_ALIAS diff --git a/libkram/kram/KTXImage.cpp b/libkram/kram/KTXImage.cpp index 047b45aa..ed60c5e1 100644 --- a/libkram/kram/KTXImage.cpp +++ b/libkram/kram/KTXImage.cpp @@ -1279,8 +1279,12 @@ void KTXImage::initProps(const uint8_t* propsData, size_t propDataSize) } //LOGD("KTXImage", "KTXProp '%s': %s\n", keyStart, valueStart); - - props.push_back(make_pair(string((const char*)keyStart), string((const char*)valueStart))); + auto propPair = NAMESPACE_STL::make_pair( + string((const char*)keyStart), + string((const char*)valueStart) + ); + + props.emplace_back(propPair); // pad to 4 byte alignment int32_t valuePadding = 3 - ((dataSize + 3) % 4); @@ -1297,7 +1301,11 @@ void KTXImage::addProp(const char* name, const char* value) return; } } - props.push_back(make_pair(string(name), string(value))); + auto propPair = NAMESPACE_STL::make_pair( + string(name), + string(value) + ); + props.emplace_back(propPair); } string KTXImage::getProp(const char* name) const @@ -1399,8 +1407,8 @@ void KTXImage::toPropsData(vector& propsData) const const char* value = prop.second.c_str(); // add null-terminate key, and value data - propsData.insert(propsData.end(), key, key + prop.first.length() + 1); - propsData.insert(propsData.end(), value, value + prop.second.length() + 1); + propsData.insert(propsData.end(), (const uint8_t*)key, (const uint8_t*)key + prop.first.length() + 1); + propsData.insert(propsData.end(), (const uint8_t*)value, (const uint8_t*)value + prop.second.length() + 1); // padding to 4 byte multiple uint32_t numPadding = 3 - ((size + 3) % 4); diff --git a/libkram/kram/KTXImage.h b/libkram/kram/KTXImage.h index fbc0724d..1efc651c 100644 --- a/libkram/kram/KTXImage.h +++ b/libkram/kram/KTXImage.h @@ -7,7 +7,7 @@ //#include //#include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index fc467a3b..f810ec66 100644 --- 
a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -1221,7 +1221,8 @@ string formatInputAndOutput(int32_t testNumber, const char* srcFilename, MyMTLPi cmd += " -input " srcDir; cmd += srcFilename; - cmd += " -output " dstDir + dst; + cmd += " -output " dstDir; + cmd += dst; // replace png with ktx dst = srcFilename; @@ -1264,9 +1265,8 @@ bool kramTestCommand(int32_t testNumber, // Encoder may be fast, but want quality. Just decode this file using astcenc to see. testNumber = 1; encoder = kTexEncoderATE; - cmd += - " -normal" ASTCSwizzle2nm + - formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); + cmd += " -normal" ASTCSwizzle2nm; + cmd += formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); break; @@ -1281,9 +1281,8 @@ bool kramTestCommand(int32_t testNumber, case 3: testNumber = 3; encoder = kTexEncoderATE; - cmd += - " -normal" + // " -quality 100" - formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatBC5_RGUnorm, encoder); + cmd += " -normal"; // " -quality 100" + cmd += formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatBC5_RGUnorm, encoder); break; @@ -1306,9 +1305,8 @@ bool kramTestCommand(int32_t testNumber, case 10: testNumber = 10; encoder = kTexEncoderAstcenc; - cmd += - " -normal" ASTCSwizzle2nm + - formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); + cmd += " -normal" ASTCSwizzle2nm; + cmd += formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); break; @@ -1323,17 +1321,15 @@ bool kramTestCommand(int32_t testNumber, case 12: testNumber = 12; encoder = kTexEncoderSquish; - cmd += - " -normal" + - formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatBC5_RGUnorm, encoder); + cmd += " -normal"; + cmd += formatInputAndOutput(testNumber, 
"collectorbarrel-n.png", MyMTLPixelFormatBC5_RGUnorm, encoder); break; case 13: testNumber = 13; encoder = kTexEncoderBcenc; - cmd += - " -normal" + - formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatBC5_RGUnorm, encoder); + cmd += " -normal"; + cmd += formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatBC5_RGUnorm, encoder); break; @@ -1362,8 +1358,8 @@ bool kramTestCommand(int32_t testNumber, testNumber = 1020; // bc7enc with source, also handles other bc formats, way slower than ATE but why? encoder = kTexEncoderBcenc; - cmd += " -optopaque" + - formatInputAndOutput(testNumber, "ColorMap-a.png", MyMTLPixelFormatBC7_RGBAUnorm_sRGB, encoder); + cmd += " -optopaque"; + cmd += formatInputAndOutput(testNumber, "ColorMap-a.png", MyMTLPixelFormatBC7_RGBAUnorm_sRGB, encoder); break; // this takes 12s to process, may need to adjust quality to settings, but they're low already @@ -1395,17 +1391,16 @@ bool kramTestCommand(int32_t testNumber, case 2002: testNumber = 2002; encoder = kTexEncoderEtcenc; - cmd += - " -normal" + - formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatEAC_RG11Unorm, encoder); + cmd += " -normal"; + cmd += formatInputAndOutput(testNumber, "collectorbarrel-n.png", MyMTLPixelFormatEAC_RG11Unorm, encoder); break; case 2003: testNumber = 2003; encoder = kTexEncoderEtcenc; - cmd += " -optopaque" + - formatInputAndOutput(testNumber, "color_grid-a.png", MyMTLPixelFormatEAC_RGBA8_sRGB, encoder); + cmd += " -optopaque"; + cmd += formatInputAndOutput(testNumber, "color_grid-a.png", MyMTLPixelFormatEAC_RGBA8_sRGB, encoder); break; //-------------- @@ -1414,35 +1409,31 @@ bool kramTestCommand(int32_t testNumber, case 3001: testNumber = 3001; encoder = kTexEncoderExplicit; - cmd += - " -sdf" + - formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatR8Unorm, encoder); + cmd += " -sdf"; + cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatR8Unorm, encoder); 
break; case 3002: testNumber = 3002; encoder = kTexEncoderSquish; - cmd += - " -sdf" + - formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatBC4_RUnorm, encoder); + cmd += " -sdf"; + cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatBC4_RUnorm, encoder); break; case 3003: testNumber = 3003; encoder = kTexEncoderEtcenc; - cmd += - " -sdf" + - formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatEAC_R11Unorm, encoder); + cmd += " -sdf"; + cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatEAC_R11Unorm, encoder); break; case 3004: testNumber = 3004; encoder = kTexEncoderATE; - cmd += - " -sdf" ASTCSwizzleL1 + - formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); + cmd += " -sdf" ASTCSwizzleL1; + cmd += formatInputAndOutput(testNumber, "flipper-sdf.png", MyMTLPixelFormatASTC_4x4_LDR, encoder, isNotPremul); break; default: diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index 9d186d8a..157466eb 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -151,6 +151,10 @@ #define COMPILE_EASTL 0 #endif +#ifndef COMPILE_FASTL +#define COMPILE_FASTL 0 +#endif + // basis transcoder only (read not writes) #ifndef COMPILE_BASIS #define COMPILE_BASIS 0 @@ -164,8 +168,12 @@ // This needs debug support that native stl already has. 
// EASTL only seems to define that for Visual Studio, and not lldb #define USE_EASTL COMPILE_EASTL +#define USE_FASTL COMPILE_FASTL + #if USE_EASTL +#define NAMESPACE_STL eastl + // this probably breaks all STL debugging #include // for max //#include "EASTL/atomic.h" @@ -185,9 +193,37 @@ #include #include +// std - simpler than using eastl version #include -#define NAMESPACE_STL eastl + +#elif USE_FASTL + +#define NAMESPACE_STL fastl + +// these are all vector based +#include "../fastl/falgorithm.h" +#include "../fastl/vector.h" +#include "../fastl/map.h" +#include "../fastl/set.h" +#include "../fastl/unordered_map.h" +#include "../fastl/unordered_set.h" + +// still too many holes in this (rfind, insert, back, pop_back, find_last_of, substr) +//#include "../fastl/fstring.h" +#include +namespace NAMESPACE_STL +{ + using string = std::string; +} + +// std - for missing functionality +#include +#include +#include +#include +#include // for unique_ptr/shared_ptr +#include #else @@ -202,6 +238,7 @@ import std.filesystem; import std.regex; */ +// all std #include // for max #include @@ -218,7 +255,6 @@ import std.regex; #include #include - #define NAMESPACE_STL std #endif @@ -295,18 +331,21 @@ class half4 { #if !USE_EASTL -namespace std { -inline float clamp(float x, float minValue, float maxValue) { return min(max(x, minValue), maxValue); } -inline double clamp(double x, double minValue, double maxValue) { return min(max(x, minValue), maxValue); } +namespace NAMESPACE_STL { + +// scalar ops +#if USE_FASTL +template +inline T min(T x, T minValue) { return x < minValue ? x : minValue; } +template +inline T max(T x, T maxValue) { return x > maxValue ? 
x : maxValue; } +#endif -inline double clamp(int8_t x, int8_t minValue, int8_t maxValue) { return min(max(x, minValue), maxValue); } -inline double clamp(uint8_t x, uint8_t minValue, uint8_t maxValue) { return min(max(x, minValue), maxValue); } +// already defined in C++17 +//template +//inline T clamp(T x, T minValue, T maxValue) { return min(max(x, minValue), maxValue); } -inline double clamp(int16_t x, int16_t minValue, int16_t maxValue) { return min(max(x, minValue), maxValue); } -inline double clamp(uint16_t x, uint16_t minValue, uint16_t maxValue) { return min(max(x, minValue), maxValue); } -inline double clamp(int32_t x, int32_t minValue, int32_t maxValue) { return min(max(x, minValue), maxValue); } -inline double clamp(int64_t x, int64_t minValue, int64_t maxValue) { return min(max(x, minValue), maxValue); } } // namespace std #endif @@ -358,11 +397,11 @@ inline float4 float4m(float x) inline float saturate(float v) { - return NAMESPACE_STL::clamp(v, 0.0f, 1.0f); + return std::clamp(v, 0.0f, 1.0f); } inline double saturate(double v) { - return NAMESPACE_STL::clamp(v, 0.0, 1.0); + return std::clamp(v, 0.0, 1.0); } inline float2 saturate(const float2& v) { diff --git a/libkram/kram/KramDDSHelper.h b/libkram/kram/KramDDSHelper.h index 0fa25088..ec8dd1ec 100644 --- a/libkram/kram/KramDDSHelper.h +++ b/libkram/kram/KramDDSHelper.h @@ -7,7 +7,7 @@ #include #include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { using namespace NAMESPACE_STL; diff --git a/libkram/kram/KramFileHelper.cpp b/libkram/kram/KramFileHelper.cpp index 05996cb8..a762657f 100644 --- a/libkram/kram/KramFileHelper.cpp +++ b/libkram/kram/KramFileHelper.cpp @@ -152,7 +152,7 @@ bool FileHelper::copyTemporaryFileTo(const char* dstFilename) // DONE: copy in smaller buffered chunks size_t maxBufferSize = 256 * 1024; - size_t bufferSize = min(size_, maxBufferSize); + size_t bufferSize = std::min(size_, maxBufferSize); vector tmpBuf; tmpBuf.resize(bufferSize); @@ -176,7 +176,7 @@ 
bool FileHelper::copyTemporaryFileTo(const char* dstFilename) size_t bytesRemaining = size_; while (bytesRemaining > 0) { - size_t bytesToRead = min(bufferSize, bytesRemaining); + size_t bytesToRead = std::min(bufferSize, bytesRemaining); bytesRemaining -= bytesToRead; if (!read(tmpBuf.data(), bytesToRead) || diff --git a/libkram/kram/KramFileHelper.h b/libkram/kram/KramFileHelper.h index b91f0a05..2bb50511 100644 --- a/libkram/kram/KramFileHelper.h +++ b/libkram/kram/KramFileHelper.h @@ -9,7 +9,7 @@ //#include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { using namespace NAMESPACE_STL; diff --git a/libkram/kram/KramImage.cpp b/libkram/kram/KramImage.cpp index 4d6db30c..3cda6865 100644 --- a/libkram/kram/KramImage.cpp +++ b/libkram/kram/KramImage.cpp @@ -1689,7 +1689,7 @@ bool KramEncoder::saveKTX2(const KTXImage& srcImage, const KTX2Compressor& compr // allocate big enough to hold entire uncompressed level vector compressedData; - compressedData.resize(mz_compressBound(ktx2Levels.front().length)); // largest mip + compressedData.resize(mz_compressBound(ktx2Levels[0].length)); // largest mip size_t compressedDataSize = 0; // reuse a context here diff --git a/libkram/kram/KramImage.h b/libkram/kram/KramImage.h index f450bd39..3901fc1c 100644 --- a/libkram/kram/KramImage.h +++ b/libkram/kram/KramImage.h @@ -8,7 +8,7 @@ //#include #include "KTXImage.h" // for MyMTLTextureType -#include "KramConfig.h" +//#include "KramConfig.h" #include "KramImageInfo.h" #include "KramMipper.h" diff --git a/libkram/kram/KramImageInfo.h b/libkram/kram/KramImageInfo.h index 1241a1f3..9f883de6 100644 --- a/libkram/kram/KramImageInfo.h +++ b/libkram/kram/KramImageInfo.h @@ -7,8 +7,8 @@ //#include //#include +//#include "KramConfig.h" #include "KTXImage.h" -#include "KramConfig.h" #include "KramMipper.h" // for Color namespace kram { diff --git a/libkram/kram/KramLib.h b/libkram/kram/KramLib.h index f737d034..068f4168 100644 --- a/libkram/kram/KramLib.h +++ 
b/libkram/kram/KramLib.h @@ -7,7 +7,7 @@ // This is a master header. Can be used if turning this into a framework. // But found frameworks to be more difficult to use than libs. -#include "KramConfig.h" +//#include "KramConfig.h" // helpers #include "KTXImage.h" diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index 5a7726b9..762e522a 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -130,7 +130,16 @@ bool endsWith(const string& value, const string& ending) } // reverse comparison at end of value - return equal(ending.rbegin(), ending.rend(), value.rbegin()); + if (value.size() < ending.size()) + return false; + uint32_t start = value.size() - ending.size(); + + for (uint32_t i = 0; i < ending.size(); ++i) { + if (value[start + i] != ending[i]) + return false; + } + + return true; } bool endsWithExtension(const char* str, const string& substring) diff --git a/libkram/kram/KramLog.h b/libkram/kram/KramLog.h index b8333449..cc3ae985 100644 --- a/libkram/kram/KramLog.h +++ b/libkram/kram/KramLog.h @@ -7,7 +7,7 @@ #include //#include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { diff --git a/libkram/kram/KramMipper.h b/libkram/kram/KramMipper.h index 5a6ddd43..4672367a 100644 --- a/libkram/kram/KramMipper.h +++ b/libkram/kram/KramMipper.h @@ -7,7 +7,7 @@ #include //#include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { using namespace NAMESPACE_STL; diff --git a/libkram/kram/KramMmapHelper.h b/libkram/kram/KramMmapHelper.h index 7380038b..04dfe523 100644 --- a/libkram/kram/KramMmapHelper.h +++ b/libkram/kram/KramMmapHelper.h @@ -7,7 +7,7 @@ #include #include -#include "KramConfig.h" +//#include "KramConfig.h" // this holds onto the open file and address from mmap operation class MmapHelper { diff --git a/libkram/kram/KramSDFMipper.h b/libkram/kram/KramSDFMipper.h index 8bae8e4f..f65d21d0 100644 --- a/libkram/kram/KramSDFMipper.h +++ b/libkram/kram/KramSDFMipper.h @@ -6,7 +6,7 @@ 
//#include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { using namespace NAMESPACE_STL; diff --git a/libkram/kram/KramTimer.h b/libkram/kram/KramTimer.h index e5bf96ca..281663df 100644 --- a/libkram/kram/KramTimer.h +++ b/libkram/kram/KramTimer.h @@ -6,7 +6,7 @@ #include -#include "KramConfig.h" +//#include "KramConfig.h" namespace kram { // Can obtain a timestamp to nanosecond accuracy. diff --git a/libkram/kram/KramZipHelper.cpp b/libkram/kram/KramZipHelper.cpp index d37d3c48..61fa7215 100644 --- a/libkram/kram/KramZipHelper.cpp +++ b/libkram/kram/KramZipHelper.cpp @@ -37,7 +37,7 @@ bool ZipHelper::openForRead(const uint8_t* zipData_, uint64_t zipDataSize) zipData = zipData_; - zip = make_unique(); + zip = std::make_unique(); mz_zip_zero_struct(zip.get()); mz_uint flags = 0; diff --git a/libkram/kram/KramZipHelper.h b/libkram/kram/KramZipHelper.h index a3410ea2..0f62c6ac 100644 --- a/libkram/kram/KramZipHelper.h +++ b/libkram/kram/KramZipHelper.h @@ -64,7 +64,7 @@ struct ZipHelper { int32_t zipEntryIndex(const char* name) const; private: - unique_ptr zip; + std::unique_ptr zip; vector _zipEntrys; const uint8_t* zipData; // aliased diff --git a/libkram/kram/TaskSystem.cpp b/libkram/kram/TaskSystem.cpp index d2e2fcea..672c07fe 100644 --- a/libkram/kram/TaskSystem.cpp +++ b/libkram/kram/TaskSystem.cpp @@ -212,7 +212,7 @@ static const CoreInfo& GetCoreInfo() #endif // sort faster cores first in the remap table - sort(coreInfo.remapTable.begin(), coreInfo.remapTable.end(), [](const CoreNum& lhs, const CoreNum& rhs){ + std::sort(coreInfo.remapTable.begin(), coreInfo.remapTable.end(), [](const CoreNum& lhs, const CoreNum& rhs){ #if KRAM_ANDROID // sort largest index if (lhs.type == rhs.type) @@ -251,40 +251,6 @@ std::thread::native_handle_type getCurrentThread() #if KRAM_WIN -/* This is the old way. This name is only available if debugger attached. - -// Isn't this in a header? 
-#pragma pack(push,8) -struct THREADNAME_INFO -{ - DWORD dwType; // Must be 0x1000. - LPCSTR szName; // Pointer to name (in user addr space). - DWORD dwThreadID; // Thread ID (-1=caller thread). - DWORD dwFlags; // Reserved for future use, must be zero. -}; -#pragma pack(pop) - -void setThreadName(std::thread::native_handle_type handle, const char* threadName) -{ - DWORD threadID = ::GetThreadId(handle); - - THREADNAME_INFO info; - info.dwType = 0x1000; - info.szName = threadName; - info.dwThreadID = threadID; - info.dwFlags = 0; - - __try - { - // Limits to how long this name can be. Also copy into ptr to change name. - RaiseException(0x406D1388, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info); - } - __except(EXCEPTION_EXECUTE_HANDLER) - { - } -} -*/ - // TODO: on Win, also need to set the following. Then use Windows Termnial. // SetConsoleOutputCP(CP_UTF8); @@ -569,7 +535,7 @@ void task_system::run(int32_t threadIndex) while (true) { // pop() wait avoids a spinloop. - function f; + myfunction f; // start with ours, but steal from other queues if nothing found // Note that if threadIndex queue is empty and stays empty diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index ebd86e90..d09d79b0 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -30,14 +30,17 @@ using mymutex = std::recursive_mutex; using mylock = std::unique_lock; using mycondition = std::condition_variable_any; +#define mydeque std::deque +#define myfunction std::function + class notification_queue { - deque> _q; + mydeque> _q; bool _done = false; mymutex _mutex; mycondition _ready; public: - bool try_pop(function& x) + bool try_pop(myfunction& x) { mylock lock{_mutex, std::try_to_lock}; if (!lock || _q.empty()) { @@ -48,7 +51,7 @@ class notification_queue { return true; } - bool pop(function& x) + bool pop(myfunction& x) { mylock lock{_mutex}; while (_q.empty() && !_done) { @@ -85,12 +88,12 @@ class notification_queue { { { mylock lock{_mutex}; - // 
TODO: fix this construct, it's saying no matching sctor for eastl::deque>>::value_type + // TODO: fix this construct, it's saying no matching sctor for mydeque>>::value_type #if USE_EASTL KLOGE("TaskSystem", "Fix eastl deque or function"); //_q.emplace_back(forward(f)); #else - _q.emplace_back(forward(f)); + _q.emplace_back(std::forward(f)); #endif } // allow a waiting pop() to awaken @@ -184,7 +187,7 @@ class task_system { // } // otherwise just push to the next indexed queue - _q[i % _count].push(forward(f)); + _q[i % _count].push(std::forward(f)); } }; diff --git a/libkram/kram/float4a.h b/libkram/kram/float4a.h index aa95be0e..f8a21bbf 100644 --- a/libkram/kram/float4a.h +++ b/libkram/kram/float4a.h @@ -4,7 +4,7 @@ #pragma once -#include "KramConfig.h" +//#include "KramConfig.h" // This is only meant to emulate float4 when lib not available // (f.e. win or linux w/o clang) but may move off simd lib to this. So diff --git a/libkram/lodepng/lodepng.cpp b/libkram/lodepng/lodepng.cpp index f7789c49..23bb89c9 100644 --- a/libkram/lodepng/lodepng.cpp +++ b/libkram/lodepng/lodepng.cpp @@ -6325,7 +6325,7 @@ unsigned decompress(vector& out, const unsigned char* in, size_t size_t buffersize = 0; unsigned error = zlib_decompress(&buffer, &buffersize, 0, in, insize, &settings); if(buffer) { - out.insert(out.end(), &buffer[0], &buffer[buffersize]); + out.insert(out.end(), buffer, buffer + buffersize); lodepng_free(buffer); } return error; @@ -6344,7 +6344,7 @@ unsigned compress(vector& out, const unsigned char* in, size_t in size_t buffersize = 0; unsigned error = zlib_compress(&buffer, &buffersize, in, insize, &settings); if(buffer) { - out.insert(out.end(), &buffer[0], &buffer[buffersize]); + out.insert(out.end(), buffer, buffer + buffersize); lodepng_free(buffer); } return error; From b93caa08994fa80c7a6a77ab3566cc79b7794560 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Mon, 8 Aug 2022 01:20:40 -0700 Subject: [PATCH 062/615] kram - update CMake removed iOS and Unix 
builds. I don't have time to maintain these. iOS is already built by Xcode. CMake is purely for Win builds and project creation. Added fastl support --- CMakeLists.txt | 43 +++++--------------------- libkram/CMakeLists.txt | 70 +++++++++++++++--------------------------- 2 files changed, 33 insertions(+), 80 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 064119e7..59baebc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,25 +7,15 @@ endif() #----------------------------------------------------- -set(BUILD_IOS FALSE) set(BUILD_MAC FALSE) set(BUILD_WIN FALSE) -set(BUILD_UNIX FALSE) if (APPLE) - if (CMAKE_SYSTEM_NAME STREQUAL "iOS") - message("build for iOS") - set(BUILD_IOS TRUE) - else() - message("build for macOS") - set(BUILD_MAC TRUE) - endif() + message("build for macOS") + set(BUILD_MAC TRUE) elseif (WIN32) message("build for win x64") set(BUILD_WIN TRUE) -elseif (UNIX AND NOT APPLE) - message("build for unix") - set(BUILD_UNIX TRUE) endif() #----------------------------------------------------- @@ -54,10 +44,7 @@ set(CMAKE_CXX_EXTENSIONS NO) # set(CMAKE_OSX_SYSROOT macos) # this doesn't work # CMAKE_OSX_DEPLOYMENT_TARGET must be set as a CACHE variable, or it will be stripped -if (BUILD_IOS) - set(CMAKE_OSX_DEPLOYMENT_TARGET "14.1" CACHE STRING "Minimum iOS") - set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Architecture iOS") -elseif (BUILD_MAC) +if (BUILD_MAC) set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS") set(CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)" CACHE STRING "Architecture macOS") endif() @@ -67,8 +54,6 @@ set(CMAKE_BUILD_TYPE Release) if (BUILD_MAC) set(CMAKE_DEFAULT_STARTUP_PROJECT "kramc") -elseif (BUILD_IOS) - set(CMAKE_DEFAULT_STARTUP_PROJECT "kramc") elseif (BUILD_WIN) set(CMAKE_DEFAULT_STARTUP_PROJECT "kramc") endif() @@ -80,7 +65,7 @@ endif() # No way to make xcode workspaces, but could do manually. 
set(myTargetWorkspace kramWorkspace) -if (BUILD_MAC OR BUILD_IOS) +if (BUILD_MAC) project(${myTargetWorkspace} LANGUAGES C CXX OBJCXX) elseif (BUILD_WIN) project(${myTargetWorkspace} LANGUAGES C CXX) @@ -108,7 +93,7 @@ endif() # https://discourse.cmake.org/t/specifying-cmake-osx-sysroot-breaks-xcode-projects-but-no-other-choice/2532/8 # use snipet from Alian Martin to validate SDK -if (BUILD_MAC OR BUILD_IOS) +if (BUILD_MAC) if(NOT DEFINED CMAKE_OSX_SYSROOT) message(FATAL_ERROR "Cannot check SDK version if CMAKE_OSX_SYSROOT is not defined." ) @@ -129,18 +114,8 @@ if (BUILD_MAC OR BUILD_IOS) OUTPUT_VARIABLE SDK_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE ) - - if (BUILD_IOS) - message("iOS SDK ${SDK_VERSION}") - message("iOS deploy ${CMAKE_OSX_DEPLOYMENT_TARGET}") - message("iOS arch ${CMAKE_OSX_ARCHITECTURES}") - - if (SDK_VERSION VERSION_LESS XCODE_MIN_SDK_IOS) - message(FATAL_ERROR "This project requires at least iPhoneOS ${XCODE_MIN_SDK_IOS}" - ) - endif() - - elseif (BUILD_MAC) + + if (BUILD_MAC) message("macOS SDK ${SDK_VERSION}") message("macOS deploy ${CMAKE_OSX_DEPLOYMENT_TARGET}") message("macOS arch ${CMAKE_OSX_ARCHITECTURES}") @@ -157,9 +132,7 @@ endif() # was considering platform-specific builds, but mac/win don't conflict set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin) -#if (BUILD_IOS) -# set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin/ios) -#elseif (BUILD_MAC) +#if (BUILD_MAC) # set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin/mac) #elseif (BUILD_WIN) # set(BIN_DIR ${PROJECT_SOURCE_DIR}/bin/win) diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 479589bf..7eed163a 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -2,25 +2,15 @@ #----------------------------------------------------- -set(BUILD_IOS FALSE) set(BUILD_MAC FALSE) set(BUILD_WIN FALSE) -set(BUILD_UNIX FALSE) if (APPLE) - if (CMAKE_SYSTEM_NAME STREQUAL "iOS") - message("build for iOS") - set(BUILD_IOS TRUE) - else() - message("build for macOS") - set(BUILD_MAC TRUE) - endif() + message("build 
for macOS") + set(BUILD_MAC TRUE) elseif (WIN32) message("build for win x64") set(BUILD_WIN TRUE) -elseif (UNIX AND NOT APPLE) - message("build for unix") - set(BUILD_UNIX TRUE) endif() #----------------------------------------------------- @@ -34,6 +24,7 @@ option(BCENC "Compile BCenc Encoder" ON) option(COMP "Compile Compressonator Encoder" ON) option(EASTL "Compile EASTL" OFF) +option(FASTL "Compile FASTL" ON) # convert ON to 1, UGH set(COMPILE_ATE 0) @@ -43,7 +34,7 @@ set(COMPILE_SQUISH 0) set(COMPILE_ASTCENC 0) set(COMPILE_COMP 0) -if (ATE AND (BUILD_MAC OR BUILD_IOS)) +if (ATE AND BUILD_MAC) set(COMPILE_ATE 1) endif() @@ -67,14 +58,22 @@ if (COMP) set(COMPILE_COMP 1) endif() +#----------------------------------------------------- +# stl used -# this isn't an encoder, but replaces stl with eastl +# replaces parts of std/stl with eastl set(COMPILE_EASTL 0) - if (EASTL) set(COMPILE_EASTL 1) endif() +# replace parts of std/stdl with fastl +set(COMPILE_EASTL 0) +if (FASTL) + set(COMPILE_FASTL 1) +endif() + + #----------------------------------------------------- # libkram @@ -133,6 +132,9 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/eastl/*.cpp" "${SOURCE_DIR}/eastl/*.h" + "${SOURCE_DIR}/fastl/*.cpp" + "${SOURCE_DIR}/fastl/*.h" + "${SOURCE_DIR}/lodepng/lodepng.cpp" "${SOURCE_DIR}/lodepng/lodepng.h" @@ -164,9 +166,6 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS if (BUILD_WIN) list(FILTER libSources EXCLUDE REGEX ".*ateencoder.mm$") list(FILTER libSources EXCLUDE REGEX ".*ateencoder.h$") -elseif (BUILD_UNIX) - list(FILTER libSources EXCLUDE REGEX ".*ateencoder.mm$") - list(FILTER libSources EXCLUDE REGEX ".*ateencoder.h$") endif() # remove files not used @@ -194,6 +193,8 @@ target_include_directories(${myTargetLib} PUBLIC # why are these public, must be in public headers "${SOURCE_DIR}/eastl/include/" + + "${SOURCE_DIR}/fastl/" ) target_include_directories(${myTargetLib} PRIVATE @@ -213,11 +214,12 @@ target_include_directories(${myTargetLib} 
PRIVATE # only add sources to the library target_sources(${myTargetLib} PRIVATE ${libSources}) +# note: mac build is all done via Xcode workspace/project now, this cmake build is legacy if (BUILD_MAC) set_target_properties(${myTargetLib} PROPERTIES # Note: match this up with CXX version # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++14" + XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++17" XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" # avx1 (ignored by universal?) @@ -231,31 +233,13 @@ if (BUILD_MAC) XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES ) - target_compile_options(${myTargetLib} PRIVATE -include KramConfig.h -W -Wall) - -elseif (BUILD_IOS) - set_target_properties(${myTargetLib} PROPERTIES - # Note: match this up with CXX version - # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++14" - XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" - - # avx1 - #XCODE_ATTRIBUTE_CLANG_X86_VECTOR_INSTRUCTIONS "avx" - - # turn off exceptions/rtti - XCODE_ATTRIBUTE_GCC_ENABLE_CPP_EXCEPTIONS NO - XCODE_ATTRIBUTE_GCC_ENABLE_CPP_RTTI NO - - # can't believe this isn't on by default in CMAKE - XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES - ) - + # TODO: switch to pch setup (KramConfig.pch) target_compile_options(${myTargetLib} PRIVATE -include KramConfig.h -W -Wall) elseif (BUILD_WIN) # TODO: switch to add_target_definitions + # TODO: turn on C++17 # to turn off exceptions/rtti use /GR and /EHsc replacement string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") @@ -269,12 +253,7 @@ elseif (BUILD_WIN) # fix STL target_compile_definitions(${myTargetLib} PRIVATE "-D_D_HAS_EXCEPTIONS=0 -D_ITERATOR_DEBUG_LEVEL=0") - -elseif (BUILD_UNIX) - # TODO: finish this - - target_compile_options(${myTargetLib} PRIVATE -include KramConfig.h -W -Wall) - + endif() target_compile_definitions(${myTargetLib} PUBLIC @@ -285,4 +264,5 @@ target_compile_definitions(${myTargetLib} PUBLIC "-DCOMPILE_ASTCENC=${COMPILE_ASTCENC}" "-DCOMPILE_COMP=${COMPILE_COMP}" 
"-DCOMPILE_EASTL=${COMPILE_EASTL}" + "-DCOMPILE_FASTL=${COMPILE_FASTL}" ) From e672edfcbb24736693d63110842dcda9bcef5cf4 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Mon, 8 Aug 2022 01:31:12 -0700 Subject: [PATCH 063/615] kram - fix Win build --- libkram/kram/KramConfig.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index 157466eb..64af857a 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -224,6 +224,7 @@ namespace NAMESPACE_STL #include #include // for unique_ptr/shared_ptr #include +#include // for copy_if and back_inserter on Win #else From 9b7172738aaef68ab6beb6eaf5849446de67dcf5 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Mon, 8 Aug 2022 22:47:34 -0700 Subject: [PATCH 064/615] kram - cmake updates --- kramc/CMakeLists.txt | 5 ----- libkram/CMakeLists.txt | 7 ++++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/kramc/CMakeLists.txt b/kramc/CMakeLists.txt index ba3aef5c..a1d1f687 100644 --- a/kramc/CMakeLists.txt +++ b/kramc/CMakeLists.txt @@ -2,10 +2,8 @@ #----------------------------------------------------- -set(BUILD_IOS FALSE) set(BUILD_MAC FALSE) set(BUILD_WIN FALSE) -set(BUILD_UNIX FALSE) if (APPLE) if (CMAKE_SYSTEM_NAME STREQUAL "iOS") @@ -18,9 +16,6 @@ if (APPLE) elseif (WIN32) message("build for win x64") set(BUILD_WIN TRUE) -elseif (UNIX AND NOT APPLE) - message("build for unix") - set(BUILD_UNIX TRUE) endif() #----------------------------------------------------- diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 7eed163a..4c89a961 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -219,7 +219,7 @@ if (BUILD_MAC) set_target_properties(${myTargetLib} PROPERTIES # Note: match this up with CXX version # c++11 min - XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++17" + XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++20" XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++" # avx1 (ignored by universal?) 
@@ -245,8 +245,8 @@ elseif (BUILD_WIN) string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "/EHsc" "/EHs-c-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - # force include - target_compile_options(${myTargetLib} PRIVATE /FIKramConfig.h) + # force include (public) + target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) # all warnings, AVX1, and multiprocess compiles target_compile_options(${myTargetLib} PRIVATE /W3 /arch:AVX /MP) @@ -256,6 +256,7 @@ elseif (BUILD_WIN) endif() +# public target_compile_definitions(${myTargetLib} PUBLIC "-DCOMPILE_ATE=${COMPILE_ATE}" "-DCOMPILE_BCENC=${COMPILE_BCENC}" From 461aec64e93d29acdbb638907fb8c26722009b78 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Thu, 11 Aug 2022 00:40:28 -0700 Subject: [PATCH 065/615] kram - turn on precompiled pch for libkram build for Xcode KramPrefix.pch is only including KramConfig.h for now. Still need to enable in Cmake for Win build. I also ran Aras's build analysis tool on the json output of -ftime-trace. It reported 11s vs 78s of parsing time. That may be spread across the 6+2 cores on my machine. Here are M1 (6/2) timings for PCH vs. not from Xcode build timings. Debug (arm64) 7s vs. 16s Release (arm64+x64) 33s vs. 44s. 
--- build2/kram.xcodeproj/project.pbxproj | 4 ---- 1 file changed, 4 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index e672e339..a90e8726 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -1908,7 +1908,6 @@ CLANG_X86_VECTOR_INSTRUCTIONS = avx; CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; - GCC_PRECOMPILE_PREFIX_HEADER = NO; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; SYSTEM_HEADER_SEARCH_PATHS = ""; @@ -1923,7 +1922,6 @@ CLANG_X86_VECTOR_INSTRUCTIONS = avx; CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; - GCC_PRECOMPILE_PREFIX_HEADER = NO; PRODUCT_NAME = "$(TARGET_NAME)"; SKIP_INSTALL = YES; SYSTEM_HEADER_SEARCH_PATHS = ""; @@ -1938,7 +1936,6 @@ CLANG_X86_VECTOR_INSTRUCTIONS = default; CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; - GCC_PRECOMPILE_PREFIX_HEADER = NO; IPHONEOS_DEPLOYMENT_TARGET = 14.1; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = iphoneos; @@ -1956,7 +1953,6 @@ CLANG_X86_VECTOR_INSTRUCTIONS = default; CODE_SIGN_STYLE = Automatic; EXECUTABLE_PREFIX = lib; - GCC_PRECOMPILE_PREFIX_HEADER = NO; IPHONEOS_DEPLOYMENT_TARGET = 14.1; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = iphoneos; From 8f9dab4353a587f8c75e7ddc42956dfac28cb46e Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Thu, 11 Aug 2022 09:41:15 -0700 Subject: [PATCH 066/615] kram - turn on cmake pch This should speed up Win builds as well. 
--- libkram/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 4c89a961..f0f438a1 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -100,9 +100,9 @@ else() ) endif() -set_target_properties(${myTargetLib} PROPERTIES - # turn off pch - DISABLE_PRECOMPILE_HEADERS ON +# this will be force include (-include, /FI) on GCC/clang/VS +target_precompile_headers(${myTargetLib} PUBLIC + ${SOURCE_DIR}/kram/KramPrefix.pch ) # add_library doesn't establish a project, so still pointing at root CMake @@ -125,6 +125,7 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/heman/hedistance.cpp" "${SOURCE_DIR}/heman/hedistance.h" + "${SOURCE_DIR}/kram/*.pch" "${SOURCE_DIR}/kram/*.cpp" "${SOURCE_DIR}/kram/*.h" From 719421833907b11027b6c01d8420220edf23c8d3 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Thu, 11 Aug 2022 09:44:05 -0700 Subject: [PATCH 067/615] kram - fix cmake pch --- libkram/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index f0f438a1..0f723dbc 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -100,14 +100,14 @@ else() ) endif() +# add_library doesn't establish a project, so still pointing at root CMake +set(SOURCE_DIR ${PROJECT_SOURCE_DIR}/libkram) + # this will be force include (-include, /FI) on GCC/clang/VS target_precompile_headers(${myTargetLib} PUBLIC ${SOURCE_DIR}/kram/KramPrefix.pch ) -# add_library doesn't establish a project, so still pointing at root CMake -set(SOURCE_DIR ${PROJECT_SOURCE_DIR}/libkram) - file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/astc-encoder/*.cpp" "${SOURCE_DIR}/astc-encoder/*.h" From e0bbeaeba3f416f0930d1bc7332329db67808058 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Thu, 11 Aug 2022 09:51:40 -0700 Subject: [PATCH 068/615] kram - keep trying to fix cmake pch Try .h instead of referencing .pch 
file --- libkram/CMakeLists.txt | 5 +++-- libkram/kram/KramPrefix.h | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 libkram/kram/KramPrefix.h diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 0f723dbc..2a9da8d1 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -104,8 +104,9 @@ endif() set(SOURCE_DIR ${PROJECT_SOURCE_DIR}/libkram) # this will be force include (-include, /FI) on GCC/clang/VS -target_precompile_headers(${myTargetLib} PUBLIC - ${SOURCE_DIR}/kram/KramPrefix.pch +# can't seem to ref KramPrefix.pch file, since this goes into cmake_pch.hxx file +target_precompile_headers(${myTargetLib} PRIVATE + ${SOURCE_DIR}/kram/KramPrefix.h ) file(GLOB_RECURSE libSources CONFIGURE_DEPENDS diff --git a/libkram/kram/KramPrefix.h b/libkram/kram/KramPrefix.h new file mode 100644 index 00000000..02374c5e --- /dev/null +++ b/libkram/kram/KramPrefix.h @@ -0,0 +1,9 @@ +// kram - Copyright 2020-2022 by Alec Miller. - MIT License +// The license and copyright notice shall be included +// in all copies or substantial portions of the Software. 
+ +#pragma once + +#include "KramConfig.h" + +//#include "KramLib.h" From 0e35619a68a7f1b9709a16f3ab61b0b5fb602b67 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 12:10:39 -0700 Subject: [PATCH 069/615] kram - more std cleanup, add cba script --- kramv/KramViewerMain.mm | 12 +- libkram/CMakeLists.txt | 25 ++-- .../astc-encoder/astcenc_diagnostic_trace.cpp | 2 +- .../astc-encoder/astcenc_diagnostic_trace.h | 2 +- libkram/astc-encoder/astcenc_internal.h | 9 +- .../astc-encoder/astcenc_vecmathlib_none_4.h | 2 +- libkram/ate/ateencoder.mm | 2 +- libkram/bc7enc/bc7decomp.h | 2 +- libkram/bc7enc/bc7enc.cpp | 2 +- libkram/bc7enc/ert.cpp | 6 +- libkram/bc7enc/ert.h | 10 +- libkram/bc7enc/rdo_bc_encoder.cpp | 22 ++-- libkram/bc7enc/rgbcx.cpp | 2 +- libkram/bc7enc/rgbcx.h | 2 +- libkram/bc7enc/utils.cpp | 19 +-- libkram/bc7enc/utils.h | 30 ++--- libkram/fastl/fstring.h | 118 +++++++++++++----- libkram/fastl/vector.h | 16 ++- libkram/kram/Kram.cpp | 10 +- libkram/kram/KramConfig.h | 33 +++-- libkram/kram/KramLog.cpp | 2 +- libkram/kram/KramZipHelper.cpp | 2 +- libkram/transcoder/basisu_transcoder.cpp | 3 +- libkram/transcoder/basisu_transcoder.h | 4 +- scripts/cba.sh | 15 +++ 25 files changed, 234 insertions(+), 118 deletions(-) create mode 100755 scripts/cba.sh diff --git a/kramv/KramViewerMain.mm b/kramv/KramViewerMain.mm index da8563f6..925a7d02 100644 --- a/kramv/KramViewerMain.mm +++ b/kramv/KramViewerMain.mm @@ -2454,7 +2454,8 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD [self setEyedropperText:""]; isChanged = true; - text = "Loaded " + _showSettings->lastFilename; + text = "Loaded "; + text += _showSettings->lastFilename; } } else if (_showSettings->isFolder) { @@ -2464,7 +2465,8 @@ - (bool)handleEventAction:(const Action*)action isShiftKeyDown:(bool)isShiftKeyD [self setEyedropperText:""]; isChanged = true; - text = "Loaded " + _showSettings->lastFilename; + text = "Loaded "; + text += 
_showSettings->lastFilename; } } } @@ -2829,10 +2831,12 @@ static void findPossibleNormalMapFromAlbedoFilename(const char* filename, vector const char* ext = strrchr(filename, '.'); - auto dotPos = filenameShort.find_last_of("."); - if (dotPos == string::npos) + const char* dosPosStr = strchr(filenameShort.c_str(), '.'); + if (dosPosStr == nullptr) return; + auto dotPos = dosPosStr - filenameShort.c_str(); + // now chop off the extension filenameShort = filenameShort.substr(0, dotPos); diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 2a9da8d1..95de547a 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -103,12 +103,6 @@ endif() # add_library doesn't establish a project, so still pointing at root CMake set(SOURCE_DIR ${PROJECT_SOURCE_DIR}/libkram) -# this will be force include (-include, /FI) on GCC/clang/VS -# can't seem to ref KramPrefix.pch file, since this goes into cmake_pch.hxx file -target_precompile_headers(${myTargetLib} PRIVATE - ${SOURCE_DIR}/kram/KramPrefix.h -) - file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/astc-encoder/*.cpp" "${SOURCE_DIR}/astc-encoder/*.h" @@ -247,8 +241,9 @@ elseif (BUILD_WIN) string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "/EHsc" "/EHs-c-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + # this is already done by pch # force include (public) - target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) + #target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) # all warnings, AVX1, and multiprocess compiles target_compile_options(${myTargetLib} PRIVATE /W3 /arch:AVX /MP) @@ -258,14 +253,24 @@ elseif (BUILD_WIN) endif() +# This will be force include (-include, /FI) on GCC/clang/VS. +# Can't seem to ref KramPrefix.pch file. 
Goes into cmake_pch.hxx file +target_precompile_headers(${myTargetLib} PRIVATE + ${SOURCE_DIR}/kram/KramPrefix.h +) + # public -target_compile_definitions(${myTargetLib} PUBLIC +target_compile_definitions(${myTargetLib} + PUBLIC + "-DCOMPILE_EASTL=${COMPILE_EASTL}" + "-DCOMPILE_FASTL=${COMPILE_FASTL}" + + PRIVATE "-DCOMPILE_ATE=${COMPILE_ATE}" "-DCOMPILE_BCENC=${COMPILE_BCENC}" "-DCOMPILE_ETCENC=${COMPILE_ETCENC}" "-DCOMPILE_SQUISH=${COMPILE_SQUISH}" "-DCOMPILE_ASTCENC=${COMPILE_ASTCENC}" "-DCOMPILE_COMP=${COMPILE_COMP}" - "-DCOMPILE_EASTL=${COMPILE_EASTL}" - "-DCOMPILE_FASTL=${COMPILE_FASTL}" + ) diff --git a/libkram/astc-encoder/astcenc_diagnostic_trace.cpp b/libkram/astc-encoder/astcenc_diagnostic_trace.cpp index fbf01a97..04afa825 100644 --- a/libkram/astc-encoder/astcenc_diagnostic_trace.cpp +++ b/libkram/astc-encoder/astcenc_diagnostic_trace.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +//#include #include "astcenc_diagnostic_trace.h" diff --git a/libkram/astc-encoder/astcenc_diagnostic_trace.h b/libkram/astc-encoder/astcenc_diagnostic_trace.h index 61489498..96c97c87 100644 --- a/libkram/astc-encoder/astcenc_diagnostic_trace.h +++ b/libkram/astc-encoder/astcenc_diagnostic_trace.h @@ -158,7 +158,7 @@ class TraceLog /** * @brief The stack of nodes (newest at the back). 
*/ - std::vector m_stack; + vector m_stack; private: /** diff --git a/libkram/astc-encoder/astcenc_internal.h b/libkram/astc-encoder/astcenc_internal.h index aa7f6001..a1b7b39f 100644 --- a/libkram/astc-encoder/astcenc_internal.h +++ b/libkram/astc-encoder/astcenc_internal.h @@ -22,15 +22,16 @@ #ifndef ASTCENC_INTERNAL_INCLUDED #define ASTCENC_INTERNAL_INCLUDED -#include +//#include +//#include +//#include +//#include + #include #include #include #include #include -#include -#include -#include #include #include "astcenc.h" diff --git a/libkram/astc-encoder/astcenc_vecmathlib_none_4.h b/libkram/astc-encoder/astcenc_vecmathlib_none_4.h index 5a399ef5..6aad161e 100644 --- a/libkram/astc-encoder/astcenc_vecmathlib_none_4.h +++ b/libkram/astc-encoder/astcenc_vecmathlib_none_4.h @@ -40,7 +40,7 @@ #error "Include astcenc_vecmathlib.h, do not include directly" #endif -#include +//#include #include #include #include diff --git a/libkram/ate/ateencoder.mm b/libkram/ate/ateencoder.mm index f3a43b9b..265110a8 100644 --- a/libkram/ate/ateencoder.mm +++ b/libkram/ate/ateencoder.mm @@ -2,7 +2,7 @@ #if COMPILE_ATE -#include +//#include #include "KTXImage.h" // for MyMTLPixelFormat // this contains ATE encoder (libate.dylib) diff --git a/libkram/bc7enc/bc7decomp.h b/libkram/bc7enc/bc7decomp.h index 49dc9341..37822fef 100644 --- a/libkram/bc7enc/bc7decomp.h +++ b/libkram/bc7enc/bc7decomp.h @@ -7,7 +7,7 @@ #include #include -#include +//#include #include #include diff --git a/libkram/bc7enc/bc7enc.cpp b/libkram/bc7enc/bc7enc.cpp index 4cbdd552..d7aec202 100644 --- a/libkram/bc7enc/bc7enc.cpp +++ b/libkram/bc7enc/bc7enc.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +//#include // Helpers static inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) value = low; else if (value > high) value = high; return value; } diff --git a/libkram/bc7enc/ert.cpp b/libkram/bc7enc/ert.cpp index c09b9668..6fc2459d 100644 --- a/libkram/bc7enc/ert.cpp +++ 
b/libkram/bc7enc/ert.cpp @@ -282,7 +282,7 @@ namespace ert uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps, const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified, pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data, - std::vector* pBlock_mse_scales) + vector* pBlock_mse_scales) { assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes); assert(total_block_stride_in_bytes >= block_size_to_optimize_in_bytes); @@ -311,8 +311,8 @@ namespace ert const int total_blocks_to_check = std::max(1U, params.m_lookback_window_size / total_block_stride_in_bytes); - std::vector len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); - std::vector second_len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); + vector len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); + vector second_len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); uint32_t total_second_matches = 0; int prev_match_window_ofs_to_favor_cont = -1, prev_match_dist_to_favor = -1; diff --git a/libkram/bc7enc/ert.h b/libkram/bc7enc/ert.h index d387f527..509b1aa3 100644 --- a/libkram/bc7enc/ert.h +++ b/libkram/bc7enc/ert.h @@ -4,14 +4,16 @@ #include #include #include -#include #include #include -#include -#include +//#include +//#include +//#include namespace ert { + using namespace NAMESPACE_STL; + struct color_rgba { uint8_t m_c[4]; }; struct reduce_entropy_params @@ -76,6 +78,6 @@ namespace ert uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps, const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified, pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data, - std::vector* pBlock_mse_scales = nullptr); + vector* pBlock_mse_scales = nullptr); } // namespace ert diff --git a/libkram/bc7enc/rdo_bc_encoder.cpp b/libkram/bc7enc/rdo_bc_encoder.cpp index 
44d39333..8718dfde 100644 --- a/libkram/bc7enc/rdo_bc_encoder.cpp +++ b/libkram/bc7enc/rdo_bc_encoder.cpp @@ -28,7 +28,7 @@ namespace rdo_bc return "?"; } - static std::vector compute_block_mse_scales(const image_u8& source_image, uint32_t blocks_x, uint32_t blocks_y, uint32_t total_blocks, bool rdo_debug_output) + static vector compute_block_mse_scales(const image_u8& source_image, uint32_t blocks_x, uint32_t blocks_y, uint32_t total_blocks, bool rdo_debug_output) { const float ULTRASMOOTH_BLOCK_STD_DEV_THRESHOLD = 2.9f; const float DARK_THRESHOLD = 13.0f; @@ -134,7 +134,7 @@ namespace rdo_bc if (!is_ultrasmooth) continue; - std::vector filled_pixels; + vector filled_pixels; filled_pixels.reserve(256); uint32_t total_set_pixels = ultrasmooth_blocks_vis.flood_fill(bx, by, color_quad_u8(255, 255, 255, 255), color_quad_u8(0, 0, 0, 255), &filled_pixels); @@ -155,7 +155,7 @@ namespace rdo_bc save_png("ultrasmooth_block_mask.png", ultrasmooth_blocks_vis, false); } - std::vector block_mse_scales(total_blocks); + vector block_mse_scales(total_blocks); uint32_t total_ultrasmooth_blocks = 0; for (uint32_t by = 0; by < blocks_y; by++) @@ -674,7 +674,7 @@ namespace rdo_bc printf("rdo_total_threads: %u\n", rdo_total_threads); int blocks_remaining = m_total_blocks, cur_block_index = 0; - std::vector blocks_to_do(rdo_total_threads), first_block_index(rdo_total_threads); + vector blocks_to_do(rdo_total_threads), first_block_index(rdo_total_threads); for (int p = 0; p < rdo_total_threads; p++) { const int num_blocks = (p == (rdo_total_threads - 1)) ? 
blocks_remaining : (m_total_blocks / rdo_total_threads); @@ -699,9 +699,9 @@ namespace rdo_bc ert_p.m_allow_relative_movement = m_params.m_rdo_allow_relative_movement; ert_p.m_skip_zero_mse_blocks = false; - std::vector block_rgb_mse_scales(compute_block_mse_scales(m_source_image, m_blocks_x, m_blocks_y, m_total_blocks, m_params.m_rdo_debug_output)); + vector block_rgb_mse_scales(compute_block_mse_scales(m_source_image, m_blocks_x, m_blocks_y, m_total_blocks, m_params.m_rdo_debug_output)); - std::vector block_pixels(m_total_blocks * 16); + vector block_pixels(m_total_blocks * 16); for (uint32_t by = 0; by < m_blocks_y; by++) for (uint32_t bx = 0; bx < m_blocks_x; bx++) @@ -758,7 +758,7 @@ namespace rdo_bc uint32_t total_modified_local = 0; - std::vector local_block_rgb_mse_scales(num_blocks_to_encode); + vector local_block_rgb_mse_scales(num_blocks_to_encode); for (int i = 0; i < num_blocks_to_encode; i++) local_block_rgb_mse_scales[i] = block_rgb_mse_scales[first_block_to_encode + i]; @@ -817,7 +817,7 @@ namespace rdo_bc ert_p.m_lookback_window_size = std::max(16U, m_params.m_lookback_window_size); - std::vector block_pixels_r(m_total_blocks * 16), block_pixels_g(m_total_blocks * 16); + vector block_pixels_r(m_total_blocks * 16), block_pixels_g(m_total_blocks * 16); for (uint32_t by = 0; by < m_blocks_y; by++) { @@ -1004,7 +1004,7 @@ namespace rdo_bc uint32_t total_modified_local = 0; - std::vector local_block_rgb_mse_scales(num_blocks_to_encode); + vector local_block_rgb_mse_scales(num_blocks_to_encode); for (int i = 0; i < num_blocks_to_encode; i++) local_block_rgb_mse_scales[i] = block_rgb_mse_scales[first_block_to_encode + i]; @@ -1036,7 +1036,7 @@ namespace rdo_bc ert_p.m_lookback_window_size = std::max(16U, m_params.m_lookback_window_size); - std::vector block_pixels_a(m_total_blocks * 16); + vector block_pixels_a(m_total_blocks * 16); for (uint32_t by = 0; by < m_blocks_y; by++) { @@ -1112,7 +1112,7 @@ namespace rdo_bc (ert::color_rgba*)&block_pixels_a[16 * 
first_block_to_encode], ert_alpha_p, total_modified_local_alpha, unpacker_funcs::unpack_bc4_block, &block_unpackers); - std::vector local_block_rgb_mse_scales(num_blocks_to_encode); + vector local_block_rgb_mse_scales(num_blocks_to_encode); for (int i = 0; i < num_blocks_to_encode; i++) local_block_rgb_mse_scales[i] = block_rgb_mse_scales[first_block_to_encode + i]; diff --git a/libkram/bc7enc/rgbcx.cpp b/libkram/bc7enc/rgbcx.cpp index b0c40880..6a718da1 100644 --- a/libkram/bc7enc/rgbcx.cpp +++ b/libkram/bc7enc/rgbcx.cpp @@ -2,7 +2,7 @@ #include "rgbcx.h" #include #include -#include +//#include namespace rgbcx { diff --git a/libkram/bc7enc/rgbcx.h b/libkram/bc7enc/rgbcx.h index cf793921..d5680bc0 100644 --- a/libkram/bc7enc/rgbcx.h +++ b/libkram/bc7enc/rgbcx.h @@ -64,7 +64,7 @@ #include #include -#include +//#include #include #include diff --git a/libkram/bc7enc/utils.cpp b/libkram/bc7enc/utils.cpp index 2b3b04d7..b388d3f9 100644 --- a/libkram/bc7enc/utils.cpp +++ b/libkram/bc7enc/utils.cpp @@ -9,19 +9,20 @@ namespace utils -{ - +{ +using namespace NAMESPACE_STL; + #define FLOOD_PUSH(y, xl, xr, dy) if (((y + (dy)) >= 0) && ((y + (dy)) < (int)m_height)) { stack.push_back(fill_segment(y, xl, xr, dy)); } // See http://www.realtimerendering.com/resources/GraphicsGems/gems/SeedFill.c -uint32_t image_u8::flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector* pSet_pixels) +uint32_t image_u8::flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, vector* pSet_pixels) { uint32_t total_set = 0; if (!flood_fill_is_inside(x, y, b)) return 0; - std::vector stack; + vector stack; stack.reserve(64); FLOOD_PUSH(y, x, x, 1); @@ -173,7 +174,7 @@ bool load_png(const char* pFilename, image_u8& img) { img.clear(); - std::vector pixels; + vector pixels; unsigned int w = 0, h = 0; unsigned int e = lodepng::decode(pixels, w, h, pFilename); if (e != 0) @@ -193,7 +194,7 @@ bool save_png(const char* pFilename, const image_u8& img, bool 
save_alpha) const uint32_t w = img.width(); const uint32_t h = img.height(); - std::vector pixels; + vector pixels; if (save_alpha) { pixels.resize(w * h * sizeof(color_quad_u8)); @@ -285,7 +286,7 @@ void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_wi assert(odd_filter_width && (odd_filter_width & 1)); odd_filter_width |= 1; - std::vector kernel(odd_filter_width * odd_filter_width); + vector kernel(odd_filter_width * odd_filter_width); compute_gaussian_kernel(&kernel[0], odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize); const int dst_width = orig_img.get_width() / width_divisor; @@ -682,7 +683,7 @@ bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void return true; } -void strip_extension(std::string& s) +void strip_extension(string& s) { for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--) { @@ -694,7 +695,7 @@ void strip_extension(std::string& s) } } -void strip_path(std::string& s) +void strip_path(string& s) { for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--) { diff --git a/libkram/bc7enc/utils.h b/libkram/bc7enc/utils.h index 841710c4..2d66b825 100644 --- a/libkram/bc7enc/utils.h +++ b/libkram/bc7enc/utils.h @@ -9,11 +9,11 @@ #include #include #include -#include #include #include -#include -#include +//#include +//#include +//#include #include #include #include @@ -33,6 +33,8 @@ namespace utils { +using namespace NAMESPACE_STL; + extern const uint32_t g_pretty_colors[]; extern const uint32_t g_num_pretty_colors; @@ -1399,13 +1401,13 @@ typedef vec<4, double> vec4D; typedef vec<1, float> vec1F; typedef vec<2, float> vec2F; -typedef std::vector vec2F_array; +typedef vector vec2F_array; typedef vec<3, float> vec3F; -typedef std::vector vec3F_array; +typedef vector vec3F_array; typedef vec<4, float> vec4F; -typedef std::vector vec4F_array; +typedef vector vec4F_array; typedef vec<2, uint32_t> vec2U; typedef vec<3, uint32_t> vec3U; @@ -1744,7 +1746,7 @@ struct color_quad_u8 
return (r == rhs.r) && (g == rhs.g) && (b == rhs.b); } }; -typedef std::vector color_quad_u8_vec; +typedef vector color_quad_u8_vec; inline uint32_t color_distance(bool perceptual, const color_quad_u8& e1, const color_quad_u8& e2, bool alpha) { @@ -1991,7 +1993,7 @@ class image_u8 pixel_coord(uint32_t x, uint32_t y) : m_x((uint16_t)x), m_y((uint16_t)y) { } }; - uint32_t flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector* pSet_pixels = nullptr); + uint32_t flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, vector* pSet_pixels = nullptr); void draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color); @@ -2364,19 +2366,19 @@ struct block8 uint64_t m_vals[1]; }; -typedef std::vector block8_vec; +typedef vector block8_vec; struct block16 { uint64_t m_vals[2]; }; -typedef std::vector block16_vec; +typedef vector block16_vec; //bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header); -void strip_extension(std::string& s); -void strip_path(std::string& s); +void strip_extension(string& s); +void strip_path(string& s); uint32_t hash_hsieh(const uint8_t* pBuf, size_t len); @@ -2590,7 +2592,7 @@ class value_stats if (!m_num) return 0.0f; - std::vector sorted_vals(m_vals); + vector sorted_vals(m_vals); std::sort(sorted_vals.begin(), sorted_vals.end()); return sorted_vals[sorted_vals.size() / 2]; @@ -2605,7 +2607,7 @@ class value_stats double m_min; double m_max; - mutable std::vector m_vals; + mutable vector m_vals; }; uint32_t get_deflate_size(const void* pData, size_t data_size); diff --git a/libkram/fastl/fstring.h b/libkram/fastl/fstring.h index 4d0f4241..87821e9f 100644 --- a/libkram/fastl/fstring.h +++ b/libkram/fastl/fstring.h @@ -7,6 +7,8 @@ namespace fastl { //------------------------------------------------------------------------------------------ + + // TODO: could make these 
macros instead to increase debug build speed template size_t ComputeStrLen(const TChar* str) // strlen { @@ -21,6 +23,7 @@ namespace fastl { for (size_t i = 0; ;++i) { + // This also works for utf8 if (a[i] != b[i]) return a[i] < b[i] ? -1 : 1; if (a[i] == '\0') @@ -42,8 +45,8 @@ namespace fastl static constexpr size_type npos = -1; public: StringImpl(); - StringImpl(const char* input); - StringImpl(const char* input, const size_type length); + StringImpl(const TChar* input); + StringImpl(const TChar* input, size_type length); void clear(); @@ -51,41 +54,92 @@ namespace fastl size_type size() const { return m_data.empty() ? 0 : m_data.size() - 1; } size_type length() const { return size(); } - value_type* begin() { return m_data.begin(); } - const value_type* begin() const { return m_data.begin(); } - value_type* end() { return m_data.end() - 1; } - const value_type* end() const { return m_data.end() - 1; } + TChar* begin() { return m_data.begin(); } + const TChar* begin() const { return m_data.begin(); } + + // this exludes the \0 + TChar* end() { return m_data.end() - 1; } + const TChar* end() const { return m_data.end() - 1; } - const value_type* c_str() const { return m_data.begin(); } + TChar front() const { return *begin(); } + TChar back() const { return *end(); } + + const value_type* c_str() const { return m_data.begin(); } - value_type& operator[](size_type index) { return m_data[index]; } - value_type operator[](size_type index) const { return m_data[index]; } + TChar& operator[](size_type index) { return m_data[index]; } + TChar operator[](size_type index) const { return m_data[index]; } StringImpl& erase(size_type index){ m_data.erase(m_data.begin()+index); return *this; } StringImpl& erase(size_type index, size_type count){ m_data.erase(m_data.begin()+index,m_data.begin()+index+count); return *this; } - void append( const char* str ); + void append(const TChar* str ); - StringImpl operator+(const char c); - StringImpl operator+(const char* str); + 
StringImpl operator+(const TChar c); + StringImpl operator+(const TChar* str); StringImpl operator+(const StringImpl& str); - StringImpl& operator += (const char c) { m_data.insert(m_data.end()-1,c); return *this; } - StringImpl& operator += (const char* str) { Append(str,ComputeStrLen(str)); return *this; } + StringImpl& operator += (TChar c) { m_data.insert(m_data.end()-1,c); return *this; } + StringImpl& operator += (const TChar* str) { Append(str,ComputeStrLen(str)); return *this; } StringImpl& operator += (const StringImpl& str) { Append(str.c_str(), str.size()); return *this; } - bool operator == (const char* str) const { return ComputeStrCmp(c_str(), str) == 0; } - bool operator != (const char* str) const { return ComputeStrCmp(c_str(), str) != 0; } - bool operator < (const char* str) const { return ComputeStrCmp(c_str(), str) < 0; } - bool operator > (const char* str) const { return ComputeStrCmp(c_str(), str) > 0; } + bool operator == (const TChar* str) const { return ComputeStrCmp(c_str(), str) == 0; } + bool operator != (const TChar* str) const { return ComputeStrCmp(c_str(), str) != 0; } + bool operator < (const TChar* str) const { return ComputeStrCmp(c_str(), str) < 0; } + bool operator > (const TChar* str) const { return ComputeStrCmp(c_str(), str) > 0; } bool operator == (const StringImpl& str) const { return *this == str.c_str(); } bool operator != (const StringImpl& str) const { return *this != str.c_str(); } bool operator < (const StringImpl& str) const { return *this < str.c_str(); } bool operator > (const StringImpl& str) const { return *this > str.c_str(); } + bool find_last_of(TChar c) + { + return strrchr(m_data.data(), c); + } + + StringImpl substr(size_type start, size_type count) + { + return StringImpl(&m_data[start], count); + } + + void pop_back() + { + if (!empty()) + { + // This doesn't work for multibyte chars + m_data.pop_back(); + m_data[m_data.size()-1] = (TChar)0; + } + } + + void insert(size_type index, const TChar* str) + { + 
size_type len = ComputeStrLen(str); + m_data.insert(m_data.begin()+index, str, str+len); + } + + void resize(size_type size, TChar value = 0) + { + size_type oldSize = m_data.size(); + size_type newSize = size+1; + if (newSize == oldSize) + return; + + m_data.resize(newSize); + + // Note: length and strlen with value of 0 unless those chars are filled + if (newSize > oldSize) + { + for (uint32_t i = oldSize-1; i < newSize; ++i) + { + m_data[i] = value; + } + } + m_data[newSize-1] = 0; + } + private: - void Append(const char* str, const size_type appendSize); + void Append(const TChar* str, const size_type appendSize); private: TData m_data; @@ -96,22 +150,27 @@ namespace fastl //------------------------------------------------------------------------------------------ template StringImpl::StringImpl() - { + { + // TODO: this requires a heap allocate for all empty strings + m_data.reserve(1); clear(); } //------------------------------------------------------------------------------------------ template - StringImpl::StringImpl(const char* input) + StringImpl::StringImpl(const TChar* input) { - clear(); - Append(input, ComputeStrLen(input)); + size_t length = ComputeStrLen(input); + m_data.reserve(length + 1); + clear(); + Append(input, length); } //------------------------------------------------------------------------------------------ template - StringImpl::StringImpl(const char* input, const size_type length) - { + StringImpl::StringImpl(const TChar* input, const size_type length) + { + m_data.reserve(length + 1); clear(); Append(input, length); } @@ -120,7 +179,6 @@ namespace fastl template inline void StringImpl::clear() { - // TODO: this requires an allocate in all ctors // need small string optimization m_data.resize(1); m_data[0] = '\0'; @@ -128,14 +186,14 @@ namespace fastl //------------------------------------------------------------------------------------------ template - void StringImpl::append( const char* str ) + void StringImpl::append( const 
TChar* str ) { Append(str, ComputeStrLen(str)); } //------------------------------------------------------------------------------------------ template - StringImpl StringImpl::operator+(const char c) + StringImpl StringImpl::operator+(TChar c) { StringImpl ret; ret.reserve(m_data.size() + 1); @@ -147,7 +205,7 @@ namespace fastl } //------------------------------------------------------------------------------------------ template - StringImpl StringImpl::operator+(const char* str) + StringImpl StringImpl::operator+(const TChar* str) { StringImpl ret; size_t len = ComputeStrLen(str); @@ -173,7 +231,7 @@ namespace fastl //------------------------------------------------------------------------------------------ template - void StringImpl::Append(const char* str, const size_type appendSize) + void StringImpl::Append(const TChar* str, const size_type appendSize) { size_type writeIndex = size(); m_data.resize(m_data.size()+appendSize); diff --git a/libkram/fastl/vector.h b/libkram/fastl/vector.h index dbc5f4a0..1144e093 100644 --- a/libkram/fastl/vector.h +++ b/libkram/fastl/vector.h @@ -87,15 +87,15 @@ namespace fastl void push_back(const value_type& value); iterator insert(iterator it, const value_type& value); - void insert(iterator it, const value_type* begin , const value_type* end) + void insert(iterator it, const value_type* beg, const value_type* en) { // TODO: fix this isn't fast - while (begin != end) + while (beg != en) { - insert(it, *begin); + insert(it, *beg); ++it; - ++begin; + ++beg; } } @@ -116,6 +116,14 @@ namespace fastl // TODO: no-op for now, but should release memory void shrink_to_fit() { } + void swap(vector& rhs) + { + if (this == &rhs) return; + std::swap(m_data, rhs.m_data); + std::swap(m_size, rhs.m_size); + std::swap(m_capacity, rhs.m_capacity); + } + private: void Destroy(); diff --git a/libkram/kram/Kram.cpp b/libkram/kram/Kram.cpp index f810ec66..e0861795 100644 --- a/libkram/kram/Kram.cpp +++ b/libkram/kram/Kram.cpp @@ -1226,8 +1226,9 
@@ string formatInputAndOutput(int32_t testNumber, const char* srcFilename, MyMTLPi // replace png with ktx dst = srcFilename; - size_t extSeparator = dst.rfind('.'); - assert(extSeparator != string::npos); + const char* extSeparatorStr = strchr(dst.c_str(), '.'); + assert(extSeparatorStr != nullptr); + size_t extSeparator = extSeparatorStr - dst.c_str(); dst.erase(extSeparator); dst.append(".ktx"); // TODO: test ktx2 too @@ -3593,9 +3594,10 @@ TexContentType findContentTypeFromFilename(const char* filename) { string filenameShort = filename; - auto dotPos = filenameShort.find_last_of("."); - if (dotPos == string::npos) + const char* dotPosStr = strrchr(filenameShort.c_str(), '.'); + if (dotPosStr == nullptr) return TexContentTypeUnknown; + auto dotPos = dotPosStr - filenameShort.c_str(); // now chop off the extension filenameShort = filenameShort.substr(0, dotPos); diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index 64af857a..b4b6fead 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -210,22 +210,37 @@ #include "../fastl/unordered_set.h" // still too many holes in this (rfind, insert, back, pop_back, find_last_of, substr) -//#include "../fastl/fstring.h" -#include -namespace NAMESPACE_STL -{ - using string = std::string; -} + +#include "../fastl/fstring.h" +//#include +//namespace NAMESPACE_STL +//{ +// using string = std::string; +//} + +// what is causing string to instantiate? 
+//namespace std +//{ +//class basic_string +//{ +// int32_t b; +//}; +//} // std - for missing functionality -#include -#include #include #include #include // for unique_ptr/shared_ptr -#include +//#include #include // for copy_if and back_inserter on Win +// threads +#include +#include +#include +#include +#include + #else /* diff --git a/libkram/kram/KramLog.cpp b/libkram/kram/KramLog.cpp index 762e522a..ff597edf 100644 --- a/libkram/kram/KramLog.cpp +++ b/libkram/kram/KramLog.cpp @@ -85,7 +85,7 @@ int32_t append_vsprintf(string& str, const char* format, va_list args) // resize and format again into string str.resize(existingLen + len, 0); - vsnprintf((char*)str.data() + existingLen, len + 1, format, args); + vsnprintf((char*)str.c_str() + existingLen, len + 1, format, args); } return len; diff --git a/libkram/kram/KramZipHelper.cpp b/libkram/kram/KramZipHelper.cpp index 61fa7215..22215684 100644 --- a/libkram/kram/KramZipHelper.cpp +++ b/libkram/kram/KramZipHelper.cpp @@ -9,7 +9,7 @@ //#include //#include -//#include // for copy_if on Win +// // for copy_if on Win#include //#include // for copy_if on Win //#include diff --git a/libkram/transcoder/basisu_transcoder.cpp b/libkram/transcoder/basisu_transcoder.cpp index 081add30..fed078d8 100644 --- a/libkram/transcoder/basisu_transcoder.cpp +++ b/libkram/transcoder/basisu_transcoder.cpp @@ -183,6 +183,7 @@ namespace basisu namespace basist { + using namespace NAMESPACE_STL; #if BASISD_ENABLE_DEBUG_FLAGS static uint32_t g_debug_flags = 0; @@ -17069,7 +17070,7 @@ namespace basist return get_etc1s_image_descs()[etc1s_image_index].m_image_flags; } - const basisu::uint8_vec* ktx2_transcoder::find_key(const std::string& key_name) const + const basisu::uint8_vec* ktx2_transcoder::find_key(const string& key_name) const { for (uint32_t i = 0; i < m_key_values.size(); i++) if (strcmp((const char *)m_key_values[i].m_key.data(), key_name.c_str()) == 0) diff --git a/libkram/transcoder/basisu_transcoder.h 
b/libkram/transcoder/basisu_transcoder.h index bf3aed3d..c8d6ec1a 100644 --- a/libkram/transcoder/basisu_transcoder.h +++ b/libkram/transcoder/basisu_transcoder.h @@ -39,6 +39,8 @@ namespace basist { + using namespace NAMESPACE_STL; + // High-level composite texture formats supported by the transcoder. // Each of these texture formats directly correspond to OpenGL/D3D/Vulkan etc. texture formats. // Notes: @@ -857,7 +859,7 @@ namespace basist // The order of key values fields in this array exactly matches the order they were stored in the file. The keys are supposed to be sorted by their Unicode code points. const key_value_vec& get_key_values() const { return m_key_values; } - const basisu::uint8_vec *find_key(const std::string& key_name) const; + const basisu::uint8_vec *find_key(const string& key_name) const; // Low-level ETC1S specific accessors diff --git a/scripts/cba.sh b/scripts/cba.sh new file mode 100755 index 00000000..4537ff36 --- /dev/null +++ b/scripts/cba.sh @@ -0,0 +1,15 @@ +#!/bin/zsh + +# so can use aliases, why isn't this automatic? +source ~/.zshrc + +# Note this part will change depending on build type, etc. TODO: see if can obtain from xcodebuild +# or could force projects to use fixed output folder +# erbkczkopelnfhennypqjfnicqai + +ClangBuildAnalyzer --all ~/Library/Developer/Xcode/DerivedData/kram-erbkczkopelnfhennypqjfnicqai/Build/Intermediates.noindex/kram.build/Debug/kram.build/Objects-normal/arm64 ClangBuildAnalysisPre.dat + +ClangBuildAnalyzer --analyze ClangBuildAnalysisPre.dat > ClangBuildAnalysis.txt + +subl ClangBuildAnalysis.txt + From 24848691216f8d3c22a424ed787fad0c77ccb6a0 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 13:20:21 -0700 Subject: [PATCH 070/615] kram - reduce basic_string usage in headers Wanted to use fastl/string everywhere. But buried in <mutex>, <condition_variable>, <thread> is <system_error>, and in that is use of <string>. Also <random> includes <string> too.
So these slow the build even if precompiled, and instantiate 5 versions of basic_string in char, char8_t, char16_t, char32_t, and wchar_t flavors. Ugh, no wonder C++ is so slow to build these days. This added up to 2s of time across the build. Removed these headers from KramConfig.h. Added comments warning about use in other files. --- build2/kram.xcodeproj/project.pbxproj | 12 ++++++------ libkram/astc-encoder/astcenc_internal.h | 6 ++++-- libkram/bc7enc/rdo_bc_encoder.cpp | 4 ++-- libkram/bc7enc/utils.h | 3 +++ libkram/kram/KramConfig.h | 16 ++++++++++------ libkram/kram/TaskSystem.h | 4 ++++ libkram/transcoder/basisu.h | 2 ++ 7 files changed, 31 insertions(+), 16 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index a90e8726..17ae8512 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -237,7 +237,6 @@ 706EFF7626D34740001C950E /* assert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5826D3473F001C950E /* assert.cpp */; }; 706EFF7726D34740001C950E /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5926D3473F001C950E /* string.cpp */; }; 706EFF7826D34740001C950E /* string.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5926D3473F001C950E /* string.cpp */; }; - 706EFF7926D34740001C950E /* allocator_eastl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5A26D3473F001C950E /* allocator_eastl.cpp */; }; 706EFF7A26D34740001C950E /* allocator_eastl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5A26D3473F001C950E /* allocator_eastl.cpp */; }; 706EFF7B26D34740001C950E /* numeric_limits.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5B26D3473F001C950E /* numeric_limits.cpp */; }; 706EFF7C26D34740001C950E /* numeric_limits.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 706EFD5B26D3473F001C950E /* numeric_limits.cpp */; }; @@ -1622,7 +1621,6 @@ 70871DFF27DDDBCD00D0B9E1 /*
astcenc_pick_best_endpoint_format.cpp in Sources */, 70871E0927DDDBCD00D0B9E1 /* astcenc_ideal_endpoints_and_weights.cpp in Sources */, 70871DCF27DDDBCD00D0B9E1 /* astcenc_symbolic_physical.cpp in Sources */, - 706EFF7926D34740001C950E /* allocator_eastl.cpp in Sources */, 706EEFC026D1595E001C950E /* maths.cpp in Sources */, 706EEFC126D1595E001C950E /* singlecolourfit.cpp in Sources */, 706EEFC226D1595E001C950E /* zstd.cpp in Sources */, @@ -1807,10 +1805,11 @@ "-DCOMPILE_ETCENC=1", "-DCOMPILE_SQUISH=1", "-DCOMPILE_BCENC=1", - "-DCOMPILE_EASTL=0", - "-DCOMPILE_FASTL=1", "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=1", + "-DCOMPILE_EASTL=0", + "-DCOMPILE_FASTL=1", + "-ftime-trace", ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; @@ -1889,10 +1888,11 @@ "-DCOMPILE_ETCENC=1", "-DCOMPILE_SQUISH=1", "-DCOMPILE_BCENC=1", - "-DCOMPILE_EASTL=0", - "-DCOMPILE_FASTL=1", "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=1", + "-DCOMPILE_EASTL=0", + "-DCOMPILE_FASTL=1", + "-ftime-trace", ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; diff --git a/libkram/astc-encoder/astcenc_internal.h b/libkram/astc-encoder/astcenc_internal.h index a1b7b39f..12fa58b4 100644 --- a/libkram/astc-encoder/astcenc_internal.h +++ b/libkram/astc-encoder/astcenc_internal.h @@ -23,9 +23,11 @@ #define ASTCENC_INTERNAL_INCLUDED //#include -//#include //#include -//#include + +// these pull in string from system_error which is slow to instantiate on macOS +#include +#include #include #include diff --git a/libkram/bc7enc/rdo_bc_encoder.cpp b/libkram/bc7enc/rdo_bc_encoder.cpp index 8718dfde..7b1ab29c 100644 --- a/libkram/bc7enc/rdo_bc_encoder.cpp +++ b/libkram/bc7enc/rdo_bc_encoder.cpp @@ -10,10 +10,10 @@ #pragma warning (disable: 4127) // conditional expression is constant #endif -using namespace utils; - namespace rdo_bc { + using namespace utils; + static const char* get_dxgi_format_string(DXGI_FORMAT fmt) { switch (fmt) diff --git a/libkram/bc7enc/utils.h b/libkram/bc7enc/utils.h index 
2d66b825..e07a0c20 100644 --- a/libkram/bc7enc/utils.h +++ b/libkram/bc7enc/utils.h @@ -14,7 +14,10 @@ //#include //#include //#include + +// on macOS, random pulls in std::string w/5x impls instanciated #include + #include #include //#include "dds_defs.h" diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index b4b6fead..508e892f 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -237,9 +237,12 @@ // threads #include #include -#include -#include -#include + +// On macOS, mutex, codition_variable, thread pull in system_error which pulls in std::string +// when then instantiates 5 versions of basic_string into all files +//#include +//#include +//#include #else @@ -275,9 +278,10 @@ import std.regex; #endif -#if COMPILE_BASIS -#include "basisu_transcoder.h" -#endif +// Get this out of config, it pulls in random and other std::fields +//#if COMPILE_BASIS +//#include "basisu_transcoder.h" +//#endif // includes that are usable across all files #include "KramLog.h" diff --git a/libkram/kram/TaskSystem.h b/libkram/kram/TaskSystem.h index d09d79b0..af21d2b5 100644 --- a/libkram/kram/TaskSystem.h +++ b/libkram/kram/TaskSystem.h @@ -12,9 +12,13 @@ //#include //#include //#include + +// TODO: get these three out of header, they pull in basic_string via system_errror header +// but this file isn't included in many places. 
#include #include #include + //#include diff --git a/libkram/transcoder/basisu.h b/libkram/transcoder/basisu.h index 4557c17c..c95a83f8 100644 --- a/libkram/transcoder/basisu.h +++ b/libkram/transcoder/basisu.h @@ -64,6 +64,8 @@ //#include #include #include + +// on macOS, random pulls in std::string w/5x impls instanciated #include #include "basisu_containers.h" From 059f6213cb97dd3666a6a6a07d6ba546d68f335c Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 15:32:34 -0700 Subject: [PATCH 071/615] kram - add ASTCENC_USE_THREADS flag to chop ParallelManager code use of mutex/condition_variable/atomic kram isn't using the threading of astcenc, and runs one thread or process with one thread per texture. This was just pulling in basic_string via system_error, and causing a slower build. --- libkram/astc-encoder/astcenc_internal.h | 98 +++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 7 deletions(-) diff --git a/libkram/astc-encoder/astcenc_internal.h b/libkram/astc-encoder/astcenc_internal.h index 12fa58b4..fea15879 100644 --- a/libkram/astc-encoder/astcenc_internal.h +++ b/libkram/astc-encoder/astcenc_internal.h @@ -25,11 +25,14 @@ //#include //#include +#define ASTCENC_USE_THREADS 0 +#if ASTCENC_USE_THREADS // these pull in string from system_error which is slow to instantiate on macOS #include #include - #include +#endif + #include #include #include @@ -213,21 +216,27 @@ static_assert((WEIGHTS_MAX_BLOCK_MODES % ASTCENC_SIMD_WIDTH) == 0, class ParallelManager { private: + +#if ASTCENC_USE_THREADS /** @brief Lock used for critical section and condition synchronization. */ std::mutex m_lock; + /** @brief Contition variable for tracking stage processing completion. */ + std::condition_variable m_complete; + + /** @brief Number of tasks started, but not necessarily finished. */ + std::atomic m_start_count; +#else + + unsigned int m_start_count; +#endif + /** @brief True if the stage init() step has been executed. 
*/ bool m_init_done; /** @brief True if the stage term() step has been executed. */ bool m_term_done; - /** @brief Contition variable for tracking stage processing completion. */ - std::condition_variable m_complete; - - /** @brief Number of tasks started, but not necessarily finished. */ - std::atomic m_start_count; - /** @brief Number of tasks finished. */ unsigned int m_done_count; @@ -256,6 +265,79 @@ class ParallelManager m_task_count = 0; } +#if !ASTCENC_USE_THREADS + void init(std::function init_func) + { + if (!m_init_done) + { + m_task_count = init_func(); + m_init_done = true; + } + } + + void init(unsigned int task_count) + { + if (!m_init_done) + { + m_task_count = task_count; + m_init_done = true; + } + } + + unsigned int get_task_assignment(unsigned int granule, unsigned int& count) + { + unsigned int base = m_start_count + granule; + if (base >= m_task_count) + { + count = 0; + return 0; + } + + count = astc::min(m_task_count - base, granule); + return base; + } + + void complete_task_assignment(unsigned int count) + { + // Note: m_done_count cannot use an atomic without the mutex; this has a race between the + // update here and the wait() for other threads + m_done_count += count; + +// if (m_done_count == m_task_count) +// { +// lck.unlock(); +// m_complete.notify_all(); +// } + } + + /** + * @brief Wait for stage processing to complete. + */ + void wait() + { + // no wait + } + + /** + * @brief Trigger the pipeline stage term step. + * + * This can be called from multi-threaded code. The first thread to hit this will process the + * thread termintion. Caller must have called @c wait() prior to calling this function to ensure + * that processing is complete. + * + * @param term_func Callable which executes the stage termination. + */ + void term(std::function term_func) + { + if (!m_term_done) + { + term_func(); + m_term_done = true; + } + } + +#else + /** * @brief Trigger the pipeline stage init step. 
* @@ -364,6 +446,8 @@ class ParallelManager m_term_done = true; } } + +#endif }; /* ============================================================================ From e424897b5e5baaecacb8ead777978e98dc11f335 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 15:33:04 -0700 Subject: [PATCH 072/615] kram - also chop out the headers on astc_internal.h --- libkram/astc-encoder/astcenc_internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libkram/astc-encoder/astcenc_internal.h b/libkram/astc-encoder/astcenc_internal.h index fea15879..bc129492 100644 --- a/libkram/astc-encoder/astcenc_internal.h +++ b/libkram/astc-encoder/astcenc_internal.h @@ -27,10 +27,10 @@ #define ASTCENC_USE_THREADS 0 #if ASTCENC_USE_THREADS -// these pull in string from system_error which is slow to instantiate on macOS -#include -#include -#include + // these pull in string from system_error which is slow to instantiate on macOS + #include + #include + #include #endif #include From 806803b673f8e3a411d9b6a43095af786b7ac8a7 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 15:37:56 -0700 Subject: [PATCH 073/615] kram - turn off basis_transcoder.cpp/h, remove -ftime-trace I haven't had a chance to hook this up, and it was just taking 1s of compile time. 
--- build2/kram.xcodeproj/project.pbxproj | 6 ++---- libkram/transcoder/basisu_transcoder.cpp | 4 ++++ libkram/transcoder/basisu_transcoder.h | 4 ++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index 17ae8512..b1f08968 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -1806,10 +1806,9 @@ "-DCOMPILE_SQUISH=1", "-DCOMPILE_BCENC=1", "-DCOMPILE_COMP=1", - "-DCOMPILE_BASIS=1", + "-DCOMPILE_BASIS=0", "-DCOMPILE_EASTL=0", "-DCOMPILE_FASTL=1", - "-ftime-trace", ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; @@ -1889,10 +1888,9 @@ "-DCOMPILE_SQUISH=1", "-DCOMPILE_BCENC=1", "-DCOMPILE_COMP=1", - "-DCOMPILE_BASIS=1", + "-DCOMPILE_BASIS=0", "-DCOMPILE_EASTL=0", "-DCOMPILE_FASTL=1", - "-ftime-trace", ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; diff --git a/libkram/transcoder/basisu_transcoder.cpp b/libkram/transcoder/basisu_transcoder.cpp index fed078d8..6579060f 100644 --- a/libkram/transcoder/basisu_transcoder.cpp +++ b/libkram/transcoder/basisu_transcoder.cpp @@ -13,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#if COMPILE_BASIS + #include "basisu_transcoder.h" #include #include "basisu_containers_impl.h" @@ -17615,3 +17617,5 @@ namespace basist } } // namespace basist + +#endif diff --git a/libkram/transcoder/basisu_transcoder.h b/libkram/transcoder/basisu_transcoder.h index c8d6ec1a..9cf29a63 100644 --- a/libkram/transcoder/basisu_transcoder.h +++ b/libkram/transcoder/basisu_transcoder.h @@ -13,6 +13,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +#if COMPILE_BASIS + #pragma once // By default KTX2 support is enabled to simplify compilation. 
This implies the need for the Zstandard library (which we distribute as a single source file in the "zstd" directory) by default. @@ -941,3 +944,4 @@ namespace basist } // namespace basisu +#endif From 1652ebf8e50bebe864b3d1b4e6b0205da3214ca8 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 16:40:24 -0700 Subject: [PATCH 074/615] kram - turn off FASTL for now The map/set and unordered_map/set behavior likely isn't shippable. It shifts the address of the values because they are insertion sorted into an array. All that copying is costly too. This is likely why a linked-list is used by std, but that has poor allocation locality and allocates one value at a time unless a chunk allocator is specified. --- libkram/CMakeLists.txt | 6 ++++-- libkram/bc7enc/rdo_bc_encoder.cpp | 2 +- libkram/fastl/map.h | 16 ++++++++++++++-- libkram/fastl/set.h | 6 ++++++ libkram/fastl/vector.h | 23 ++++++++++++++++++----- 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 95de547a..e56ae0f8 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -24,7 +24,7 @@ option(BCENC "Compile BCenc Encoder" ON) option(COMP "Compile Compressonator Encoder" ON) option(EASTL "Compile EASTL" OFF) -option(FASTL "Compile FASTL" ON) +option(FASTL "Compile FASTL" OFF) # convert ON to 1, UGH set(COMPILE_ATE 0) @@ -245,7 +245,7 @@ elseif (BUILD_WIN) # force include (public) #target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) - # all warnings, AVX1, and multiprocess compiles + # all warnings, AVX, and multiprocess compiles target_compile_options(${myTargetLib} PRIVATE /W3 /arch:AVX /MP) # fix STL @@ -253,6 +253,8 @@ elseif (BUILD_WIN) endif() +# TODO: Missing dead-strip on Release builds for macOS/Win. Needed to minimize app size. + # This will be force include (-include, /FI) on GCC/clang/VS. # Can't seem to ref KramPrefix.pch file. 
Goes into cmake_pch.hxx file target_precompile_headers(${myTargetLib} PRIVATE diff --git a/libkram/bc7enc/rdo_bc_encoder.cpp b/libkram/bc7enc/rdo_bc_encoder.cpp index 7b1ab29c..8f51f8e4 100644 --- a/libkram/bc7enc/rdo_bc_encoder.cpp +++ b/libkram/bc7enc/rdo_bc_encoder.cpp @@ -57,7 +57,7 @@ namespace rdo_bc float yl = max_std_dev / ULTRASMOOTH_BLOCK_STD_DEV_THRESHOLD; - yl = clamp(yl, 0.0f, 1.0f); + yl = std::clamp(yl, 0.0f, 1.0f); yl *= yl; float y_avg = y_stats.get_mean(); diff --git a/libkram/fastl/map.h b/libkram/fastl/map.h index 3c49f73f..aeee6bfd 100644 --- a/libkram/fastl/map.h +++ b/libkram/fastl/map.h @@ -58,7 +58,11 @@ namespace fastl { iterator entryIt = fastl::lower_bound(begin(), end(), key, [=](value_type& value, const TKey& key) {return value.first < key; }); if (entryIt == end() || entryIt->first != key) - { + { + // TODO: this is expensive to insertion sort into a vector + // This causes all elements above to have to be copied and they don't have constant addresses. + // Also keys for unordered_map/set only provide == and hash, and not < + entryIt = m_data.emplace(entryIt,key,TValue()); } @@ -70,7 +74,11 @@ namespace fastl { iterator entryIt = fastl::lower_bound(begin(), end(), inputValue, [=](value_type& a, const value_type& b) {return a.first < b.first; }); if (entryIt == end() || entryIt->first != inputValue.first) - { + { + // TODO: this is expensive to insertion sort into a vector + // This causes all elements above to have to be copied and they don't have constant addresses. + // Also keys for unordered_map/set only provide == and hash, and not < + entryIt = m_data.emplace(entryIt,move(inputValue)); return pair(entryIt,true); } @@ -83,6 +91,10 @@ namespace fastl iterator found = find(key); if (found != end()) { + // TODO: this is expensive to erase an element from a vector + // This causes all elements above to have to be copied and they don't have constant addresses. 
+ // Also keys for unordered_map/set only provide == and hash, and not < + erase(found); } return size(); diff --git a/libkram/fastl/set.h b/libkram/fastl/set.h index e334c466..d964050f 100644 --- a/libkram/fastl/set.h +++ b/libkram/fastl/set.h @@ -57,6 +57,9 @@ namespace fastl iterator entryIt = fastl::lower_bound( begin(), end(), inputValue, [=]( value_type& a, const value_type& b ) {return a < b; } ); if( entryIt == end() || *entryIt != inputValue ) { + // TODO: this isn't fast to emplace into a vector, all elements above shift + // and addresses are no longer constant on elements + entryIt = m_data.emplace( entryIt, args... ); return pair( entryIt, true ); } @@ -69,6 +72,9 @@ namespace fastl iterator found = find( key ); if( found != end() ) { + // TODO: this isn't fast to erase from a vector, all elements above shift + // and addresses are no longer constant on elements + erase( found ); } return size(); diff --git a/libkram/fastl/vector.h b/libkram/fastl/vector.h index 1144e093..b07b041f 100644 --- a/libkram/fastl/vector.h +++ b/libkram/fastl/vector.h @@ -13,9 +13,12 @@ namespace fastl { //------------------------------------------------------------------------------------------ //Consider moving this around if needed somewhere else - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; + template + struct remove_reference { typedef T type; }; + template + struct remove_reference { typedef T type; }; + template + struct remove_reference { typedef T type; }; // This is ambigous if included //template typename remove_reference::type&& move(T&& arg) { return static_cast::type&&>(arg); } @@ -78,7 +81,10 @@ namespace fastl const_iterator begin() const { return m_data; } iterator end() { return m_data+m_size; } const_iterator end() const { return m_data+m_size; } + + // TOOD: need front reference back() { return m_data[m_size-1]; } + bool empty() const { 
return m_size == 0u; } void reserve(const size_type size); @@ -86,10 +92,15 @@ namespace fastl void clear(); void push_back(const value_type& value); + iterator insert(iterator it, const value_type& value); void insert(iterator it, const value_type* beg, const value_type* en) { - // TODO: fix this isn't fast + size_type len = en - beg; + reserve(size() + len); + + // TODO: fix this isn't fast, since it has to shift all elements above + // the iterator. Do that once. while (beg != en) { insert(it, *beg); @@ -113,7 +124,7 @@ namespace fastl const value_type* data() const { return m_data; } value_type* data() { return m_data; } - // TODO: no-op for now, but should release memory + // TODO: no-op for now, but should copy and release memory void shrink_to_fit() { } void swap(vector& rhs) @@ -129,6 +140,8 @@ namespace fastl private: value_type* m_data; + + // TODO: could map size_type to int32_t size_type m_size; size_type m_capacity; }; From ad33819c3ca55f7732a41caee62a6913b95e6a36 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 16:40:54 -0700 Subject: [PATCH 075/615] kram - update Xcode projects to turn off FASTL --- build2/kram.xcodeproj/project.pbxproj | 4 ++-- build2/kramv.xcodeproj/project.pbxproj | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index b1f08968..3028966c 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -1808,7 +1808,7 @@ "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=0", "-DCOMPILE_EASTL=0", - "-DCOMPILE_FASTL=1", + "-DCOMPILE_FASTL=0", ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; @@ -1890,7 +1890,7 @@ "-DCOMPILE_COMP=1", "-DCOMPILE_BASIS=0", "-DCOMPILE_EASTL=0", - "-DCOMPILE_FASTL=1", + "-DCOMPILE_FASTL=0", ); PRESERVE_DEAD_CODE_INITS_AND_TERMS = NO; SDKROOT = macosx; diff --git a/build2/kramv.xcodeproj/project.pbxproj b/build2/kramv.xcodeproj/project.pbxproj index 
4e79c2bf..9bf1ff07 100644 --- a/build2/kramv.xcodeproj/project.pbxproj +++ b/build2/kramv.xcodeproj/project.pbxproj @@ -562,7 +562,7 @@ MTL_LANGUAGE_REVISION = UseDeploymentTarget; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ( - "-DCOMPILE_FASTL=1", + "-DCOMPILE_FASTL=0", "-DCOMPILE_EASTL=0", "-include", KramConfig.h, @@ -628,7 +628,7 @@ MTL_FAST_MATH = YES; MTL_LANGUAGE_REVISION = UseDeploymentTarget; OTHER_CFLAGS = ( - "-DCOMPILE_FASTL=1", + "-DCOMPILE_FASTL=0", "-DCOMPILE_EASTL=0", "-include", KramConfig.h, From 8adeece0358b70577d33d1e455364c15967f022c Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 16:41:13 -0700 Subject: [PATCH 076/615] kram - one more file --- kram-thumb/KramThumbnailProvider.mm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kram-thumb/KramThumbnailProvider.mm b/kram-thumb/KramThumbnailProvider.mm index b8dc82f3..b190181b 100644 --- a/kram-thumb/KramThumbnailProvider.mm +++ b/kram-thumb/KramThumbnailProvider.mm @@ -98,11 +98,11 @@ - (void)provideThumbnailForFileRequest:(QLFileThumbnailRequest *)request complet if (imageAspect >= 1.0f) { requestWidth = contextSize.width; - requestHeight = clamp((contextSize.width / imageAspect), 1.0, contextSize.height); + requestHeight = std::clamp((contextSize.width / imageAspect), 1.0, contextSize.height); } else { - requestWidth = clamp((contextSize.height * imageAspect), 1.0, contextSize.width); + requestWidth = std::clamp((contextSize.height * imageAspect), 1.0, contextSize.width); requestHeight = contextSize.height; } From 1916f8bef17494673de60f4eb0bab14ceb7a0be7 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 17:22:13 -0700 Subject: [PATCH 077/615] kram - fix build with FASTL off in CMake --- libkram/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index e56ae0f8..17c806a6 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -68,7 +68,7 @@ if (EASTL) 
endif() # replace parts of std/stdl with fastl -set(COMPILE_EASTL 0) +set(COMPILE_FASTL 0) if (FASTL) set(COMPILE_FASTL 1) endif() From a110756db21a97a9f3605b7bf8311ad31742c376 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 14 Aug 2022 17:44:25 -0700 Subject: [PATCH 078/615] kram - more cmake fixes Add dead-strip, and add back /FI for app projects --- libkram/CMakeLists.txt | 18 +++++++++++++----- libkram/kram/KramConfig.h | 16 ++++++---------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index 17c806a6..f6a77594 100644 --- a/libkram/CMakeLists.txt +++ b/libkram/CMakeLists.txt @@ -229,8 +229,12 @@ if (BUILD_MAC) XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES ) + target_compile_options(${myTargetLib} PRIVATE -W -Wall) + # TODO: switch to pch setup (KramConfig.pch) - target_compile_options(${myTargetLib} PRIVATE -include KramConfig.h -W -Wall) + # this is already done by pch for libkram, but other projects need the force include inherited + # force include (public) + target_compile_options(${myTargetLib} PUBLIC -include KramConfig.h) elseif (BUILD_WIN) @@ -241,10 +245,10 @@ elseif (BUILD_WIN) string(REGEX REPLACE "/GR" "/GR-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "/EHsc" "/EHs-c-" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - # this is already done by pch + # this is already done by pch for libkram, but other projects need the force include inherited # force include (public) - #target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) - + target_compile_options(${myTargetLib} PUBLIC /FIKramConfig.h) + # all warnings, AVX, and multiprocess compiles target_compile_options(${myTargetLib} PRIVATE /W3 /arch:AVX /MP) @@ -253,7 +257,11 @@ elseif (BUILD_WIN) endif() -# TODO: Missing dead-strip on Release builds for macOS/Win. Needed to minimize app size. +# turn on dead-code stripping in release. Don't set this in debug. +# does this make sense on lib, or just on apps ? 
+if (CMAKE_BUILD_TYPE STREQUAL "RELEASE") + target_compile_options(${myTargetLib} PUBLIC -dead_strip) +endif() # This will be force include (-include, /FI) on GCC/clang/VS. # Can't seem to ref KramPrefix.pch file. Goes into cmake_pch.hxx file diff --git a/libkram/kram/KramConfig.h b/libkram/kram/KramConfig.h index 508e892f..61a50788 100644 --- a/libkram/kram/KramConfig.h +++ b/libkram/kram/KramConfig.h @@ -204,6 +204,8 @@ // these are all vector based #include "../fastl/falgorithm.h" #include "../fastl/vector.h" + +// These don't really work. They are constantly shifting the key-value pairs on add/revmoe #include "../fastl/map.h" #include "../fastl/set.h" #include "../fastl/unordered_map.h" @@ -212,21 +214,14 @@ // still too many holes in this (rfind, insert, back, pop_back, find_last_of, substr) #include "../fastl/fstring.h" + +// This was to fallback on sso of basic_string //#include //namespace NAMESPACE_STL //{ // using string = std::string; //} -// what is causing string to instantiate? -//namespace std -//{ -//class basic_string -//{ -// int32_t b; -//}; -//} - // std - for missing functionality #include #include @@ -257,6 +252,8 @@ import std.filesystem; import std.regex; */ +#define NAMESPACE_STL std + // all std #include // for max #include @@ -274,7 +271,6 @@ import std.regex; #include #include -#define NAMESPACE_STL std #endif From 70369467216fad2e4b2ac501672370713dc68d65 Mon Sep 17 00:00:00 2001 From: Alec Miller Date: Sun, 18 Sep 2022 17:33:11 -0700 Subject: [PATCH 079/615] kram - add fmt support This is a precursor to std::format, but has nice ways to bury the code implementation similar to how I handle vsprintf. It pulls in enough inline functions and template mechanisms, that I have it split off into KramFmt.h. There are similar log calls, and sprintf/append_sprintf calls there. Still need to actually append to the existing string while writing instead of allocating another temp dynamic string. 
--- build2/kram.xcodeproj/project.pbxproj | 106 + libkram/CMakeLists.txt | 4 + libkram/fmt/LICENSE.rst | 27 + libkram/fmt/args.h | 234 ++ libkram/fmt/chrono.h | 2069 ++++++++++++ libkram/fmt/color.h | 651 ++++ libkram/fmt/compile.h | 611 ++++ libkram/fmt/core.h | 3338 +++++++++++++++++++ libkram/fmt/fmt.cpp | 100 + libkram/fmt/format-inl.h | 1754 ++++++++++ libkram/fmt/format.cpp | 44 + libkram/fmt/format.h | 4310 +++++++++++++++++++++++++ libkram/fmt/os.cpp | 373 +++ libkram/fmt/os.h | 478 +++ libkram/fmt/ostream.h | 237 ++ libkram/fmt/printf.h | 640 ++++ libkram/fmt/ranges.h | 722 +++++ libkram/fmt/std.h | 240 ++ libkram/fmt/xchar.h | 248 ++ libkram/kram/KramFmt.h | 69 + libkram/kram/KramLog.cpp | 107 +- libkram/kram/KramLog.h | 5 +- 22 files changed, 16346 insertions(+), 21 deletions(-) create mode 100644 libkram/fmt/LICENSE.rst create mode 100644 libkram/fmt/args.h create mode 100644 libkram/fmt/chrono.h create mode 100644 libkram/fmt/color.h create mode 100644 libkram/fmt/compile.h create mode 100644 libkram/fmt/core.h create mode 100644 libkram/fmt/fmt.cpp create mode 100644 libkram/fmt/format-inl.h create mode 100644 libkram/fmt/format.cpp create mode 100644 libkram/fmt/format.h create mode 100644 libkram/fmt/os.cpp create mode 100644 libkram/fmt/os.h create mode 100644 libkram/fmt/ostream.h create mode 100644 libkram/fmt/printf.h create mode 100644 libkram/fmt/ranges.h create mode 100644 libkram/fmt/std.h create mode 100644 libkram/fmt/xchar.h create mode 100644 libkram/kram/KramFmt.h diff --git a/build2/kram.xcodeproj/project.pbxproj b/build2/kram.xcodeproj/project.pbxproj index 3028966c..8f8efb1f 100644 --- a/build2/kram.xcodeproj/project.pbxproj +++ b/build2/kram.xcodeproj/project.pbxproj @@ -356,6 +356,38 @@ 708A6AA12708CE4700BA5410 /* bc6h_definitions.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A902708CE4700BA5410 /* bc6h_definitions.h */; }; 708A6AA42708CE4700BA5410 /* bc6h_utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 
708A6A922708CE4700BA5410 /* bc6h_utils.h */; }; 708A6AA52708CE4700BA5410 /* bc6h_utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 708A6A922708CE4700BA5410 /* bc6h_utils.h */; }; + 709B8D2D28D7BCAD0081BD1F /* ostream.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1C28D7BCAD0081BD1F /* ostream.h */; }; + 709B8D2E28D7BCAD0081BD1F /* ostream.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1C28D7BCAD0081BD1F /* ostream.h */; }; + 709B8D2F28D7BCAD0081BD1F /* format-inl.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1D28D7BCAD0081BD1F /* format-inl.h */; }; + 709B8D3028D7BCAD0081BD1F /* format-inl.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1D28D7BCAD0081BD1F /* format-inl.h */; }; + 709B8D3128D7BCAD0081BD1F /* ranges.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1E28D7BCAD0081BD1F /* ranges.h */; }; + 709B8D3228D7BCAD0081BD1F /* ranges.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1E28D7BCAD0081BD1F /* ranges.h */; }; + 709B8D3328D7BCAD0081BD1F /* xchar.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1F28D7BCAD0081BD1F /* xchar.h */; }; + 709B8D3428D7BCAD0081BD1F /* xchar.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D1F28D7BCAD0081BD1F /* xchar.h */; }; + 709B8D3528D7BCAD0081BD1F /* core.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2028D7BCAD0081BD1F /* core.h */; }; + 709B8D3628D7BCAD0081BD1F /* core.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2028D7BCAD0081BD1F /* core.h */; }; + 709B8D3728D7BCAD0081BD1F /* os.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2128D7BCAD0081BD1F /* os.cpp */; }; + 709B8D3828D7BCAD0081BD1F /* os.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2128D7BCAD0081BD1F /* os.cpp */; }; + 709B8D3928D7BCAD0081BD1F /* format.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2228D7BCAD0081BD1F /* format.cpp */; }; + 709B8D3A28D7BCAD0081BD1F /* format.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 709B8D2228D7BCAD0081BD1F /* 
format.cpp */; }; + 709B8D3D28D7BCAD0081BD1F /* chrono.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2428D7BCAD0081BD1F /* chrono.h */; }; + 709B8D3E28D7BCAD0081BD1F /* chrono.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2428D7BCAD0081BD1F /* chrono.h */; }; + 709B8D3F28D7BCAD0081BD1F /* os.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2528D7BCAD0081BD1F /* os.h */; }; + 709B8D4028D7BCAD0081BD1F /* os.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2528D7BCAD0081BD1F /* os.h */; }; + 709B8D4128D7BCAD0081BD1F /* color.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2628D7BCAD0081BD1F /* color.h */; }; + 709B8D4228D7BCAD0081BD1F /* color.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2628D7BCAD0081BD1F /* color.h */; }; + 709B8D4328D7BCAD0081BD1F /* args.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2728D7BCAD0081BD1F /* args.h */; }; + 709B8D4428D7BCAD0081BD1F /* args.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2728D7BCAD0081BD1F /* args.h */; }; + 709B8D4528D7BCAD0081BD1F /* printf.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2828D7BCAD0081BD1F /* printf.h */; }; + 709B8D4628D7BCAD0081BD1F /* printf.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2828D7BCAD0081BD1F /* printf.h */; }; + 709B8D4728D7BCAD0081BD1F /* compile.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2928D7BCAD0081BD1F /* compile.h */; }; + 709B8D4828D7BCAD0081BD1F /* compile.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2928D7BCAD0081BD1F /* compile.h */; }; + 709B8D4928D7BCAD0081BD1F /* format.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2B28D7BCAD0081BD1F /* format.h */; }; + 709B8D4A28D7BCAD0081BD1F /* format.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2B28D7BCAD0081BD1F /* format.h */; }; + 709B8D4B28D7BCAD0081BD1F /* std.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D2C28D7BCAD0081BD1F /* std.h */; }; + 709B8D4C28D7BCAD0081BD1F /* std.h in Headers */ = {isa = 
PBXBuildFile; fileRef = 709B8D2C28D7BCAD0081BD1F /* std.h */; }; + 709B8D4F28D7C15F0081BD1F /* KramFmt.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D4D28D7C15F0081BD1F /* KramFmt.h */; }; + 709B8D5028D7C15F0081BD1F /* KramFmt.h in Headers */ = {isa = PBXBuildFile; fileRef = 709B8D4D28D7C15F0081BD1F /* KramFmt.h */; }; 70A7BD3027092A1200DBCCF7 /* hdr_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */; }; 70A7BD3127092A1200DBCCF7 /* hdr_encode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */; }; 70A7BD3227092A1200DBCCF7 /* hdr_encode.h in Headers */ = {isa = PBXBuildFile; fileRef = 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */; }; @@ -689,6 +721,23 @@ 708A6A8E2708CE4700BA5410 /* bc6h_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc6h_encode.h; sourceTree = ""; }; 708A6A902708CE4700BA5410 /* bc6h_definitions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc6h_definitions.h; sourceTree = ""; }; 708A6A922708CE4700BA5410 /* bc6h_utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bc6h_utils.h; sourceTree = ""; }; + 709B8D1C28D7BCAD0081BD1F /* ostream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ostream.h; sourceTree = ""; }; + 709B8D1D28D7BCAD0081BD1F /* format-inl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "format-inl.h"; sourceTree = ""; }; + 709B8D1E28D7BCAD0081BD1F /* ranges.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ranges.h; sourceTree = ""; }; + 709B8D1F28D7BCAD0081BD1F /* xchar.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = xchar.h; sourceTree = ""; }; + 709B8D2028D7BCAD0081BD1F /* core.h */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = core.h; sourceTree = ""; }; + 709B8D2128D7BCAD0081BD1F /* os.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = os.cpp; sourceTree = ""; }; + 709B8D2228D7BCAD0081BD1F /* format.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = format.cpp; sourceTree = ""; }; + 709B8D2428D7BCAD0081BD1F /* chrono.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chrono.h; sourceTree = ""; }; + 709B8D2528D7BCAD0081BD1F /* os.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = os.h; sourceTree = ""; }; + 709B8D2628D7BCAD0081BD1F /* color.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = color.h; sourceTree = ""; }; + 709B8D2728D7BCAD0081BD1F /* args.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = args.h; sourceTree = ""; }; + 709B8D2828D7BCAD0081BD1F /* printf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = printf.h; sourceTree = ""; }; + 709B8D2928D7BCAD0081BD1F /* compile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = compile.h; sourceTree = ""; }; + 709B8D2A28D7BCAD0081BD1F /* LICENSE.rst */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = LICENSE.rst; sourceTree = ""; }; + 709B8D2B28D7BCAD0081BD1F /* format.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = format.h; sourceTree = ""; }; + 709B8D2C28D7BCAD0081BD1F /* std.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = std.h; sourceTree = ""; }; + 709B8D4D28D7C15F0081BD1F /* KramFmt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramFmt.h; sourceTree = ""; }; 
70A7BD2E27092A1200DBCCF7 /* hdr_encode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hdr_encode.cpp; sourceTree = ""; }; 70A7BD2F27092A1200DBCCF7 /* hdr_encode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hdr_encode.h; sourceTree = ""; }; 70C6398C289FB234006E7422 /* KramPrefix.pch */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KramPrefix.pch; sourceTree = ""; }; @@ -755,6 +804,7 @@ 708A6A882708CE4700BA5410 /* compressonator */, 706EFC3E26D3473F001C950E /* eastl */, 704738AF289F6AEE00C77A9F /* fastl */, + 709B8D1B28D7BCAD0081BD1F /* fmt */, 706EEDA926D1583E001C950E /* etc2comp */, 706EEDC926D1583E001C950E /* bc7enc */, 706EEDD226D1583E001C950E /* astc-encoder */, @@ -946,6 +996,7 @@ 706EEE2326D1583F001C950E /* KramConfig.h */, 706EEE3126D1583F001C950E /* KramImageInfo.h */, 706EEE2526D1583F001C950E /* KramImageInfo.cpp */, + 709B8D4D28D7C15F0081BD1F /* KramFmt.h */, 706EEE2726D1583F001C950E /* KramLib.h */, 706EEE2426D1583F001C950E /* KramLog.h */, 706EEE2826D1583F001C950E /* KramLog.cpp */, @@ -1264,6 +1315,29 @@ path = bc6h; sourceTree = ""; }; + 709B8D1B28D7BCAD0081BD1F /* fmt */ = { + isa = PBXGroup; + children = ( + 709B8D1C28D7BCAD0081BD1F /* ostream.h */, + 709B8D2B28D7BCAD0081BD1F /* format.h */, + 709B8D1D28D7BCAD0081BD1F /* format-inl.h */, + 709B8D2228D7BCAD0081BD1F /* format.cpp */, + 709B8D1E28D7BCAD0081BD1F /* ranges.h */, + 709B8D1F28D7BCAD0081BD1F /* xchar.h */, + 709B8D2028D7BCAD0081BD1F /* core.h */, + 709B8D2428D7BCAD0081BD1F /* chrono.h */, + 709B8D2528D7BCAD0081BD1F /* os.h */, + 709B8D2128D7BCAD0081BD1F /* os.cpp */, + 709B8D2628D7BCAD0081BD1F /* color.h */, + 709B8D2728D7BCAD0081BD1F /* args.h */, + 709B8D2828D7BCAD0081BD1F /* printf.h */, + 709B8D2928D7BCAD0081BD1F /* compile.h */, + 709B8D2A28D7BCAD0081BD1F /* LICENSE.rst */, + 709B8D2C28D7BCAD0081BD1F /* std.h */, + ); + path = fmt; + sourceTree = 
""; + }; /* End PBXGroup section */ /* Begin PBXHeadersBuildPhase section */ @@ -1274,6 +1348,7 @@ 706EEFD126D15984001C950E /* EtcErrorMetric.h in Headers */, 706EEFD226D15984001C950E /* EtcColor.h in Headers */, 70C6398D289FB234006E7422 /* KramPrefix.pch in Headers */, + 709B8D3D28D7BCAD0081BD1F /* chrono.h in Headers */, 706EEFD326D15984001C950E /* EtcDifferentialTrys.h in Headers */, 706EEFD426D15984001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */, 706EEFD526D15984001C950E /* EtcConfig.h in Headers */, @@ -1290,7 +1365,9 @@ 706EEFDD26D15984001C950E /* Etc.h in Headers */, 707789D72881BA81008A51BC /* bc7enc.h in Headers */, 706EEFDE26D15984001C950E /* EtcImage.h in Headers */, + 709B8D4B28D7BCAD0081BD1F /* std.h in Headers */, 70CDB65027A1382700A546C1 /* KramDDSHelper.h in Headers */, + 709B8D4328D7BCAD0081BD1F /* args.h in Headers */, 708A6A9C2708CE4700BA5410 /* bc6h_encode.h in Headers */, 706EEFDF26D15984001C950E /* EtcBlock4x4Encoding_ETC1.h in Headers */, 706EEFE026D15984001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */, @@ -1299,9 +1376,11 @@ 706EEFF226D15984001C950E /* ateencoder.h in Headers */, 706EEFF326D15984001C950E /* basisu_transcoder.h in Headers */, 70A7BD3227092A1200DBCCF7 /* hdr_encode.h in Headers */, + 709B8D4728D7BCAD0081BD1F /* compile.h in Headers */, 708A6AA02708CE4700BA5410 /* bc6h_definitions.h in Headers */, 706EEFF426D15984001C950E /* basisu_containers.h in Headers */, 70871DD527DDDBCD00D0B9E1 /* astcenc.h in Headers */, + 709B8D4528D7BCAD0081BD1F /* printf.h in Headers */, 706EEFF526D15985001C950E /* basisu_containers_impl.h in Headers */, 707789EB2881BA81008A51BC /* utils.h in Headers */, 706EEFF626D15985001C950E /* basisu_transcoder_internal.h in Headers */, @@ -1323,6 +1402,7 @@ 706EF00026D15985001C950E /* KramSDFMipper.h in Headers */, 706EF00126D15985001C950E /* sse2neon.h in Headers */, 70871DF127DDDBCD00D0B9E1 /* astcenc_mathlib.h in Headers */, + 709B8D3128D7BCAD0081BD1F /* ranges.h in Headers */, 
706EF00226D15985001C950E /* KramConfig.h in Headers */, 706EF00326D15985001C950E /* KramLog.h in Headers */, 706EF00426D15985001C950E /* KramLib.h in Headers */, @@ -1330,6 +1410,7 @@ 706EF00626D15985001C950E /* KramImage.h in Headers */, 706EF00726D15985001C950E /* win_mmap.h in Headers */, 70871DDD27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, + 709B8D4F28D7C15F0081BD1F /* KramFmt.h in Headers */, 707789D92881BA81008A51BC /* bc7decomp.h in Headers */, 706EF00826D15985001C950E /* Kram.h in Headers */, 704738C8289F6AEE00C77A9F /* vector.h in Headers */, @@ -1344,13 +1425,16 @@ 706EF00C26D15985001C950E /* KramMmapHelper.h in Headers */, 706EF00D26D15985001C950E /* float4a.h in Headers */, 706EF00E26D15985001C950E /* KramFileHelper.h in Headers */, + 709B8D3F28D7BCAD0081BD1F /* os.h in Headers */, 706EF00F26D15985001C950E /* KramMipper.h in Headers */, 706EF01026D15985001C950E /* TaskSystem.h in Headers */, 706EF01126D15985001C950E /* squish.h in Headers */, 706EF01226D15985001C950E /* clusterfit.h in Headers */, + 709B8D3528D7BCAD0081BD1F /* core.h in Headers */, 706EF01326D15985001C950E /* colourfit.h in Headers */, 70871DFD27DDDBCD00D0B9E1 /* astcenc_vecmathlib.h in Headers */, 706EF01426D15985001C950E /* alpha.h in Headers */, + 709B8D4128D7BCAD0081BD1F /* color.h in Headers */, 708A6A982708CE4700BA5410 /* bc6h_decode.h in Headers */, 706EF01526D15985001C950E /* singlecolourfit.h in Headers */, 706EF01626D15985001C950E /* maths.h in Headers */, @@ -1362,9 +1446,13 @@ 706EF01926D15985001C950E /* rangefit.h in Headers */, 706EF01A26D15985001C950E /* zstd.h in Headers */, 70871DF327DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */, + 709B8D2F28D7BCAD0081BD1F /* format-inl.h in Headers */, 704738CC289F6AEE00C77A9F /* fstring.h in Headers */, + 709B8D2D28D7BCAD0081BD1F /* ostream.h in Headers */, 706EF01B26D15985001C950E /* lodepng.h in Headers */, + 709B8D4928D7BCAD0081BD1F /* format.h in Headers */, 706EF01C26D15985001C950E /* tmpfileplus.h in 
Headers */, + 709B8D3328D7BCAD0081BD1F /* xchar.h in Headers */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -1375,6 +1463,7 @@ 706EF14B26D166C5001C950E /* EtcErrorMetric.h in Headers */, 706EF14C26D166C5001C950E /* EtcColor.h in Headers */, 70C6398E289FB234006E7422 /* KramPrefix.pch in Headers */, + 709B8D3E28D7BCAD0081BD1F /* chrono.h in Headers */, 706EF14D26D166C5001C950E /* EtcDifferentialTrys.h in Headers */, 706EF14E26D166C5001C950E /* EtcBlock4x4Encoding_RGB8.h in Headers */, 706EF14F26D166C5001C950E /* EtcConfig.h in Headers */, @@ -1391,7 +1480,9 @@ 706EF15726D166C5001C950E /* Etc.h in Headers */, 707789D82881BA81008A51BC /* bc7enc.h in Headers */, 706EF15826D166C5001C950E /* EtcImage.h in Headers */, + 709B8D4C28D7BCAD0081BD1F /* std.h in Headers */, 70CDB65127A1382700A546C1 /* KramDDSHelper.h in Headers */, + 709B8D4428D7BCAD0081BD1F /* args.h in Headers */, 708A6A9D2708CE4700BA5410 /* bc6h_encode.h in Headers */, 706EF15926D166C5001C950E /* EtcBlock4x4Encoding_ETC1.h in Headers */, 706EF15A26D166C5001C950E /* EtcBlock4x4Encoding_RGBA8.h in Headers */, @@ -1400,9 +1491,11 @@ 706EF16C26D166C5001C950E /* ateencoder.h in Headers */, 706EF16D26D166C5001C950E /* basisu_transcoder.h in Headers */, 70A7BD3327092A1200DBCCF7 /* hdr_encode.h in Headers */, + 709B8D4828D7BCAD0081BD1F /* compile.h in Headers */, 708A6AA12708CE4700BA5410 /* bc6h_definitions.h in Headers */, 706EF16E26D166C5001C950E /* basisu_containers.h in Headers */, 70871DD627DDDBCD00D0B9E1 /* astcenc.h in Headers */, + 709B8D4628D7BCAD0081BD1F /* printf.h in Headers */, 706EF16F26D166C5001C950E /* basisu_containers_impl.h in Headers */, 707789EC2881BA81008A51BC /* utils.h in Headers */, 706EF17026D166C5001C950E /* basisu_transcoder_internal.h in Headers */, @@ -1424,6 +1517,7 @@ 706EF17A26D166C5001C950E /* KramSDFMipper.h in Headers */, 706EF17B26D166C5001C950E /* sse2neon.h in Headers */, 70871DF227DDDBCD00D0B9E1 /* astcenc_mathlib.h in Headers */, + 709B8D3228D7BCAD0081BD1F /* ranges.h in 
Headers */, 706EF17C26D166C5001C950E /* KramConfig.h in Headers */, 706EF17D26D166C5001C950E /* KramLog.h in Headers */, 706EF17E26D166C5001C950E /* KramLib.h in Headers */, @@ -1431,6 +1525,7 @@ 706EF18026D166C5001C950E /* KramImage.h in Headers */, 706EF18126D166C5001C950E /* win_mmap.h in Headers */, 70871DDE27DDDBCD00D0B9E1 /* astcenc_vecmathlib_sse_4.h in Headers */, + 709B8D5028D7C15F0081BD1F /* KramFmt.h in Headers */, 707789DA2881BA81008A51BC /* bc7decomp.h in Headers */, 706EF18226D166C5001C950E /* Kram.h in Headers */, 704738C9289F6AEE00C77A9F /* vector.h in Headers */, @@ -1445,13 +1540,16 @@ 706EF18626D166C5001C950E /* KramMmapHelper.h in Headers */, 706EF18726D166C5001C950E /* float4a.h in Headers */, 706EF18826D166C5001C950E /* KramFileHelper.h in Headers */, + 709B8D4028D7BCAD0081BD1F /* os.h in Headers */, 706EF18926D166C5001C950E /* KramMipper.h in Headers */, 706EF18A26D166C5001C950E /* TaskSystem.h in Headers */, 706EF18B26D166C5001C950E /* squish.h in Headers */, 706EF18C26D166C5001C950E /* clusterfit.h in Headers */, + 709B8D3628D7BCAD0081BD1F /* core.h in Headers */, 706EF18D26D166C5001C950E /* colourfit.h in Headers */, 70871DFE27DDDBCD00D0B9E1 /* astcenc_vecmathlib.h in Headers */, 706EF18E26D166C5001C950E /* alpha.h in Headers */, + 709B8D4228D7BCAD0081BD1F /* color.h in Headers */, 708A6A992708CE4700BA5410 /* bc6h_decode.h in Headers */, 706EF18F26D166C5001C950E /* singlecolourfit.h in Headers */, 706EF19026D166C5001C950E /* maths.h in Headers */, @@ -1463,9 +1561,13 @@ 706EF19326D166C5001C950E /* rangefit.h in Headers */, 706EF19426D166C5001C950E /* zstd.h in Headers */, 70871DF427DDDBCD00D0B9E1 /* astcenc_internal.h in Headers */, + 709B8D3028D7BCAD0081BD1F /* format-inl.h in Headers */, 704738CD289F6AEE00C77A9F /* fstring.h in Headers */, + 709B8D2E28D7BCAD0081BD1F /* ostream.h in Headers */, 706EF19526D166C5001C950E /* lodepng.h in Headers */, + 709B8D4A28D7BCAD0081BD1F /* format.h in Headers */, 706EF19626D166C5001C950E /* 
tmpfileplus.h in Headers */, + 709B8D3428D7BCAD0081BD1F /* xchar.h in Headers */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -1603,6 +1705,7 @@ 706EEFB226D1595D001C950E /* KramLog.cpp in Sources */, 706EEFB326D1595D001C950E /* KramSDFMipper.cpp in Sources */, 706EEFB426D1595D001C950E /* KramMmapHelper.cpp in Sources */, + 709B8D3928D7BCAD0081BD1F /* format.cpp in Sources */, 70871DCB27DDDBCD00D0B9E1 /* astcenc_image.cpp in Sources */, 706EEFB526D1595D001C950E /* float4a.cpp in Sources */, 706EFF7326D34740001C950E /* thread_support.cpp in Sources */, @@ -1610,6 +1713,7 @@ 706EEFB726D1595D001C950E /* squish.cpp in Sources */, 706EEFB826D1595D001C950E /* colourset.cpp in Sources */, 70871DD327DDDBCD00D0B9E1 /* astcenc_partition_tables.cpp in Sources */, + 709B8D3728D7BCAD0081BD1F /* os.cpp in Sources */, 706EFF8126D34740001C950E /* hashtable.cpp in Sources */, 70871DEF27DDDBCD00D0B9E1 /* astcenc_weight_align.cpp in Sources */, 70871DD927DDDBCD00D0B9E1 /* astcenc_compute_variance.cpp in Sources */, @@ -1641,6 +1745,7 @@ 70871E0427DDDBCD00D0B9E1 /* astcenc_color_unquantize.cpp in Sources */, 70871DD227DDDBCD00D0B9E1 /* astcenc_averages_and_directions.cpp in Sources */, 70871DE027DDDBCD00D0B9E1 /* astcenc_mathlib_softfloat.cpp in Sources */, + 709B8D3828D7BCAD0081BD1F /* os.cpp in Sources */, 706EFC2426D1C39B001C950E /* ateencoder.mm in Sources */, 707789EE2881BA81008A51BC /* bc7decomp_ref.cpp in Sources */, 706EF19826D166C5001C950E /* EtcBlock4x4Encoding_RGB8.cpp in Sources */, @@ -1720,6 +1825,7 @@ 706EF1DA26D166C5001C950E /* lodepng.cpp in Sources */, 707789E22881BA81008A51BC /* utils.cpp in Sources */, 706EF1DB26D166C5001C950E /* tmpfileplus.cpp in Sources */, + 709B8D3A28D7BCAD0081BD1F /* format.cpp in Sources */, 70871E0227DDDBCD00D0B9E1 /* astcenc_weight_quant_xfer_tables.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/libkram/CMakeLists.txt b/libkram/CMakeLists.txt index f6a77594..52b2b1ea 100644 --- a/libkram/CMakeLists.txt +++ 
b/libkram/CMakeLists.txt @@ -117,6 +117,9 @@ file(GLOB_RECURSE libSources CONFIGURE_DEPENDS "${SOURCE_DIR}/etc2comp/*.cpp" "${SOURCE_DIR}/etc2comp/*.h" + "${SOURCE_DIR}/fmt/*.cpp" + "${SOURCE_DIR}/fmt/*.h" + "${SOURCE_DIR}/heman/hedistance.cpp" "${SOURCE_DIR}/heman/hedistance.h" @@ -199,6 +202,7 @@ target_include_directories(${myTargetLib} PRIVATE "${SOURCE_DIR}/bc7enc/" "${SOURCE_DIR}/compressonator/bc6h/" "${SOURCE_DIR}/etc2comp/" + "${SOURCE_DIR}/fmt/" "${SOURCE_DIR}/heman/" "${SOURCE_DIR}/lodepng" "${SOURCE_DIR}/miniz/" diff --git a/libkram/fmt/LICENSE.rst b/libkram/fmt/LICENSE.rst new file mode 100644 index 00000000..f0ec3db4 --- /dev/null +++ b/libkram/fmt/LICENSE.rst @@ -0,0 +1,27 @@ +Copyright (c) 2012 - present, Victor Zverovich + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +--- Optional exception to the license --- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into a machine-executable object form of such +source code, you may redistribute such embedded portions in such object form +without including the above copyright and permission notices. diff --git a/libkram/fmt/args.h b/libkram/fmt/args.h new file mode 100644 index 00000000..a3966d14 --- /dev/null +++ b/libkram/fmt/args.h @@ -0,0 +1,234 @@ +// Formatting library for C++ - dynamic format arguments +// +// Copyright (c) 2012 - present, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. + +#ifndef FMT_ARGS_H_ +#define FMT_ARGS_H_ + +#include // std::reference_wrapper +#include // std::unique_ptr +#include + +#include "core.h" + +FMT_BEGIN_NAMESPACE + +namespace detail { + +template struct is_reference_wrapper : std::false_type {}; +template +struct is_reference_wrapper> : std::true_type {}; + +template const T& unwrap(const T& v) { return v; } +template const T& unwrap(const std::reference_wrapper& v) { + return static_cast(v); +} + +class dynamic_arg_list { + // Workaround for clang's -Wweak-vtables. Unlike for regular classes, for + // templates it doesn't complain about inability to deduce single translation + // unit for placing vtable. So storage_node_base is made a fake template. 
+ template struct node { + virtual ~node() = default; + std::unique_ptr> next; + }; + + template struct typed_node : node<> { + T value; + + template + FMT_CONSTEXPR typed_node(const Arg& arg) : value(arg) {} + + template + FMT_CONSTEXPR typed_node(const basic_string_view& arg) + : value(arg.data(), arg.size()) {} + }; + + std::unique_ptr> head_; + + public: + template const T& push(const Arg& arg) { + auto new_node = std::unique_ptr>(new typed_node(arg)); + auto& value = new_node->value; + new_node->next = std::move(head_); + head_ = std::move(new_node); + return value; + } +}; +} // namespace detail + +/** + \rst + A dynamic version of `fmt::format_arg_store`. + It's equipped with a storage to potentially temporary objects which lifetimes + could be shorter than the format arguments object. + + It can be implicitly converted into `~fmt::basic_format_args` for passing + into type-erased formatting functions such as `~fmt::vformat`. + \endrst + */ +template +class dynamic_format_arg_store +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 + // Workaround a GCC template argument substitution bug. + : public basic_format_args +#endif +{ + private: + using char_type = typename Context::char_type; + + template struct need_copy { + static constexpr detail::type mapped_type = + detail::mapped_type_constant::value; + + enum { + value = !(detail::is_reference_wrapper::value || + std::is_same>::value || + std::is_same>::value || + (mapped_type != detail::type::cstring_type && + mapped_type != detail::type::string_type && + mapped_type != detail::type::custom_type)) + }; + }; + + template + using stored_type = conditional_t< + std::is_convertible>::value && + !detail::is_reference_wrapper::value, + std::basic_string, T>; + + // Storage of basic_format_arg must be contiguous. + std::vector> data_; + std::vector> named_info_; + + // Storage of arguments not fitting into basic_format_arg must grow + // without relocation because items in data_ refer to it. 
+ detail::dynamic_arg_list dynamic_args_; + + friend class basic_format_args; + + unsigned long long get_types() const { + return detail::is_unpacked_bit | data_.size() | + (named_info_.empty() + ? 0ULL + : static_cast(detail::has_named_args_bit)); + } + + const basic_format_arg* data() const { + return named_info_.empty() ? data_.data() : data_.data() + 1; + } + + template void emplace_arg(const T& arg) { + data_.emplace_back(detail::make_arg(arg)); + } + + template + void emplace_arg(const detail::named_arg& arg) { + if (named_info_.empty()) { + constexpr const detail::named_arg_info* zero_ptr{nullptr}; + data_.insert(data_.begin(), {zero_ptr, 0}); + } + data_.emplace_back(detail::make_arg(detail::unwrap(arg.value))); + auto pop_one = [](std::vector>* data) { + data->pop_back(); + }; + std::unique_ptr>, decltype(pop_one)> + guard{&data_, pop_one}; + named_info_.push_back({arg.name, static_cast(data_.size() - 2u)}); + data_[0].value_.named_args = {named_info_.data(), named_info_.size()}; + guard.release(); + } + + public: + constexpr dynamic_format_arg_store() = default; + + /** + \rst + Adds an argument into the dynamic store for later passing to a formatting + function. + + Note that custom types and string types (but not string views) are copied + into the store dynamically allocating memory if necessary. + + **Example**:: + + fmt::dynamic_format_arg_store store; + store.push_back(42); + store.push_back("abc"); + store.push_back(1.5f); + std::string result = fmt::vformat("{} and {} and {}", store); + \endrst + */ + template void push_back(const T& arg) { + if (detail::const_check(need_copy::value)) + emplace_arg(dynamic_args_.push>(arg)); + else + emplace_arg(detail::unwrap(arg)); + } + + /** + \rst + Adds a reference to the argument into the dynamic store for later passing to + a formatting function. 
+ + **Example**:: + + fmt::dynamic_format_arg_store store; + char band[] = "Rolling Stones"; + store.push_back(std::cref(band)); + band[9] = 'c'; // Changing str affects the output. + std::string result = fmt::vformat("{}", store); + // result == "Rolling Scones" + \endrst + */ + template void push_back(std::reference_wrapper arg) { + static_assert( + need_copy::value, + "objects of built-in types and string views are always copied"); + emplace_arg(arg.get()); + } + + /** + Adds named argument into the dynamic store for later passing to a formatting + function. ``std::reference_wrapper`` is supported to avoid copying of the + argument. The name is always copied into the store. + */ + template + void push_back(const detail::named_arg& arg) { + const char_type* arg_name = + dynamic_args_.push>(arg.name).c_str(); + if (detail::const_check(need_copy::value)) { + emplace_arg( + fmt::arg(arg_name, dynamic_args_.push>(arg.value))); + } else { + emplace_arg(fmt::arg(arg_name, arg.value)); + } + } + + /** Erase all elements from the store */ + void clear() { + data_.clear(); + named_info_.clear(); + dynamic_args_ = detail::dynamic_arg_list(); + } + + /** + \rst + Reserves space to store at least *new_cap* arguments including + *new_cap_named* named arguments. + \endrst + */ + void reserve(size_t new_cap, size_t new_cap_named) { + FMT_ASSERT(new_cap >= new_cap_named, + "Set of arguments includes set of named arguments"); + data_.reserve(new_cap); + named_info_.reserve(new_cap_named); + } +}; + +FMT_END_NAMESPACE + +#endif // FMT_ARGS_H_ diff --git a/libkram/fmt/chrono.h b/libkram/fmt/chrono.h new file mode 100644 index 00000000..ed7f5f16 --- /dev/null +++ b/libkram/fmt/chrono.h @@ -0,0 +1,2069 @@ +// Formatting library for C++ - chrono support +// +// Copyright (c) 2012 - present, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. 
+ +#ifndef FMT_CHRONO_H_ +#define FMT_CHRONO_H_ + +#include +#include +#include // std::isfinite +#include // std::memcpy +#include +#include +#include +#include +#include + +#include "format.h" + +FMT_BEGIN_NAMESPACE + +// Enable tzset. +#ifndef FMT_USE_TZSET +// UWP doesn't provide _tzset. +# if FMT_HAS_INCLUDE("winapifamily.h") +# include +# endif +# if defined(_WIN32) && (!defined(WINAPI_FAMILY) || \ + (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP)) +# define FMT_USE_TZSET 1 +# else +# define FMT_USE_TZSET 0 +# endif +#endif + +// Enable safe chrono durations, unless explicitly disabled. +#ifndef FMT_SAFE_DURATION_CAST +# define FMT_SAFE_DURATION_CAST 1 +#endif +#if FMT_SAFE_DURATION_CAST + +// For conversion between std::chrono::durations without undefined +// behaviour or erroneous results. +// This is a stripped down version of duration_cast, for inclusion in fmt. +// See https://github.com/pauldreik/safe_duration_cast +// +// Copyright Paul Dreik 2019 +namespace safe_duration_cast { + +template ::value && + std::numeric_limits::is_signed == + std::numeric_limits::is_signed)> +FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) { + ec = 0; + using F = std::numeric_limits; + using T = std::numeric_limits; + static_assert(F::is_integer, "From must be integral"); + static_assert(T::is_integer, "To must be integral"); + + // A and B are both signed, or both unsigned. + if (detail::const_check(F::digits <= T::digits)) { + // From fits in To without any problem. + } else { + // From does not always fit in To, resort to a dynamic check. + if (from < (T::min)() || from > (T::max)()) { + // outside range. + ec = 1; + return {}; + } + } + return static_cast(from); +} + +/** + * converts From to To, without loss. If the dynamic value of from + * can't be converted to To without loss, ec is set. 
+ */ +template ::value && + std::numeric_limits::is_signed != + std::numeric_limits::is_signed)> +FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) { + ec = 0; + using F = std::numeric_limits; + using T = std::numeric_limits; + static_assert(F::is_integer, "From must be integral"); + static_assert(T::is_integer, "To must be integral"); + + if (detail::const_check(F::is_signed && !T::is_signed)) { + // From may be negative, not allowed! + if (fmt::detail::is_negative(from)) { + ec = 1; + return {}; + } + // From is positive. Can it always fit in To? + if (detail::const_check(F::digits > T::digits) && + from > static_cast(detail::max_value())) { + ec = 1; + return {}; + } + } + + if (detail::const_check(!F::is_signed && T::is_signed && + F::digits >= T::digits) && + from > static_cast(detail::max_value())) { + ec = 1; + return {}; + } + return static_cast(from); // Lossless conversion. +} + +template ::value)> +FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) { + ec = 0; + return from; +} // function + +// clang-format off +/** + * converts From to To if possible, otherwise ec is set. + * + * input | output + * ---------------------------------|--------------- + * NaN | NaN + * Inf | Inf + * normal, fits in output | converted (possibly lossy) + * normal, does not fit in output | ec is set + * subnormal | best effort + * -Inf | -Inf + */ +// clang-format on +template ::value)> +FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) { + ec = 0; + using T = std::numeric_limits; + static_assert(std::is_floating_point::value, "From must be floating"); + static_assert(std::is_floating_point::value, "To must be floating"); + + // catch the only happy case + if (std::isfinite(from)) { + if (from >= T::lowest() && from <= (T::max)()) { + return static_cast(from); + } + // not within range. 
+ ec = 1; + return {}; + } + + // nan and inf will be preserved + return static_cast(from); +} // function + +template ::value)> +FMT_CONSTEXPR To safe_float_conversion(const From from, int& ec) { + ec = 0; + static_assert(std::is_floating_point::value, "From must be floating"); + return from; +} + +/** + * safe duration cast between integral durations + */ +template ::value), + FMT_ENABLE_IF(std::is_integral::value)> +To safe_duration_cast(std::chrono::duration from, + int& ec) { + using From = std::chrono::duration; + ec = 0; + // the basic idea is that we need to convert from count() in the from type + // to count() in the To type, by multiplying it with this: + struct Factor + : std::ratio_divide {}; + + static_assert(Factor::num > 0, "num must be positive"); + static_assert(Factor::den > 0, "den must be positive"); + + // the conversion is like this: multiply from.count() with Factor::num + // /Factor::den and convert it to To::rep, all this without + // overflow/underflow. let's start by finding a suitable type that can hold + // both To, From and Factor::num + using IntermediateRep = + typename std::common_type::type; + + // safe conversion to IntermediateRep + IntermediateRep count = + lossless_integral_conversion(from.count(), ec); + if (ec) return {}; + // multiply with Factor::num without overflow or underflow + if (detail::const_check(Factor::num != 1)) { + const auto max1 = detail::max_value() / Factor::num; + if (count > max1) { + ec = 1; + return {}; + } + const auto min1 = + (std::numeric_limits::min)() / Factor::num; + if (!std::is_unsigned::value && count < min1) { + ec = 1; + return {}; + } + count *= Factor::num; + } + + if (detail::const_check(Factor::den != 1)) count /= Factor::den; + auto tocount = lossless_integral_conversion(count, ec); + return ec ? 
To() : To(tocount); +} + +/** + * safe duration_cast between floating point durations + */ +template ::value), + FMT_ENABLE_IF(std::is_floating_point::value)> +To safe_duration_cast(std::chrono::duration from, + int& ec) { + using From = std::chrono::duration; + ec = 0; + if (std::isnan(from.count())) { + // nan in, gives nan out. easy. + return To{std::numeric_limits::quiet_NaN()}; + } + // maybe we should also check if from is denormal, and decide what to do about + // it. + + // +-inf should be preserved. + if (std::isinf(from.count())) { + return To{from.count()}; + } + + // the basic idea is that we need to convert from count() in the from type + // to count() in the To type, by multiplying it with this: + struct Factor + : std::ratio_divide {}; + + static_assert(Factor::num > 0, "num must be positive"); + static_assert(Factor::den > 0, "den must be positive"); + + // the conversion is like this: multiply from.count() with Factor::num + // /Factor::den and convert it to To::rep, all this without + // overflow/underflow. let's start by finding a suitable type that can hold + // both To, From and Factor::num + using IntermediateRep = + typename std::common_type::type; + + // force conversion of From::rep -> IntermediateRep to be safe, + // even if it will never happen be narrowing in this context. + IntermediateRep count = + safe_float_conversion(from.count(), ec); + if (ec) { + return {}; + } + + // multiply with Factor::num without overflow or underflow + if (detail::const_check(Factor::num != 1)) { + constexpr auto max1 = detail::max_value() / + static_cast(Factor::num); + if (count > max1) { + ec = 1; + return {}; + } + constexpr auto min1 = std::numeric_limits::lowest() / + static_cast(Factor::num); + if (count < min1) { + ec = 1; + return {}; + } + count *= static_cast(Factor::num); + } + + // this can't go wrong, right? den>0 is checked earlier. 
+ if (detail::const_check(Factor::den != 1)) { + using common_t = typename std::common_type::type; + count /= static_cast(Factor::den); + } + + // convert to the to type, safely + using ToRep = typename To::rep; + + const ToRep tocount = safe_float_conversion(count, ec); + if (ec) { + return {}; + } + return To{tocount}; +} +} // namespace safe_duration_cast +#endif + +// Prevents expansion of a preceding token as a function-style macro. +// Usage: f FMT_NOMACRO() +#define FMT_NOMACRO + +namespace detail { +template struct null {}; +inline null<> localtime_r FMT_NOMACRO(...) { return null<>(); } +inline null<> localtime_s(...) { return null<>(); } +inline null<> gmtime_r(...) { return null<>(); } +inline null<> gmtime_s(...) { return null<>(); } + +inline const std::locale& get_classic_locale() { + static const auto& locale = std::locale::classic(); + return locale; +} + +template struct codecvt_result { + static constexpr const size_t max_size = 32; + CodeUnit buf[max_size]; + CodeUnit* end; +}; +template +constexpr const size_t codecvt_result::max_size; + +template +void write_codecvt(codecvt_result& out, string_view in_buf, + const std::locale& loc) { +#if FMT_CLANG_VERSION +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated" + auto& f = std::use_facet>(loc); +# pragma clang diagnostic pop +#else + auto& f = std::use_facet>(loc); +#endif + auto mb = std::mbstate_t(); + const char* from_next = nullptr; + auto result = f.in(mb, in_buf.begin(), in_buf.end(), from_next, + std::begin(out.buf), std::end(out.buf), out.end); + if (result != std::codecvt_base::ok) + FMT_THROW(format_error("failed to format time")); +} + +template +auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc) + -> OutputIt { + if (detail::is_utf8() && loc != get_classic_locale()) { + // char16_t and char32_t codecvts are broken in MSVC (linkage errors) and + // gcc-4. 
+#if FMT_MSC_VERSION != 0 || \ + (defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI)) + // The _GLIBCXX_USE_DUAL_ABI macro is always defined in libstdc++ from gcc-5 + // and newer. + using code_unit = wchar_t; +#else + using code_unit = char32_t; +#endif + + using unit_t = codecvt_result; + unit_t unit; + write_codecvt(unit, in, loc); + // In UTF-8 is used one to four one-byte code units. + auto&& buf = basic_memory_buffer(); + for (code_unit* p = unit.buf; p != unit.end; ++p) { + uint32_t c = static_cast(*p); + if (sizeof(code_unit) == 2 && c >= 0xd800 && c <= 0xdfff) { + // surrogate pair + ++p; + if (p == unit.end || (c & 0xfc00) != 0xd800 || + (*p & 0xfc00) != 0xdc00) { + FMT_THROW(format_error("failed to format time")); + } + c = (c << 10) + static_cast(*p) - 0x35fdc00; + } + if (c < 0x80) { + buf.push_back(static_cast(c)); + } else if (c < 0x800) { + buf.push_back(static_cast(0xc0 | (c >> 6))); + buf.push_back(static_cast(0x80 | (c & 0x3f))); + } else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) { + buf.push_back(static_cast(0xe0 | (c >> 12))); + buf.push_back(static_cast(0x80 | ((c & 0xfff) >> 6))); + buf.push_back(static_cast(0x80 | (c & 0x3f))); + } else if (c >= 0x10000 && c <= 0x10ffff) { + buf.push_back(static_cast(0xf0 | (c >> 18))); + buf.push_back(static_cast(0x80 | ((c & 0x3ffff) >> 12))); + buf.push_back(static_cast(0x80 | ((c & 0xfff) >> 6))); + buf.push_back(static_cast(0x80 | (c & 0x3f))); + } else { + FMT_THROW(format_error("failed to format time")); + } + } + return copy_str(buf.data(), buf.data() + buf.size(), out); + } + return copy_str(in.data(), in.data() + in.size(), out); +} + +template ::value)> +auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc) + -> OutputIt { + codecvt_result unit; + write_codecvt(unit, sv, loc); + return copy_str(unit.buf, unit.end, out); +} + +template ::value)> +auto write_tm_str(OutputIt out, string_view sv, const std::locale& loc) + -> OutputIt { + return 
write_encoded_tm_str(out, sv, loc); +} + +template +inline void do_write(buffer& buf, const std::tm& time, + const std::locale& loc, char format, char modifier) { + auto&& format_buf = formatbuf>(buf); + auto&& os = std::basic_ostream(&format_buf); + os.imbue(loc); + using iterator = std::ostreambuf_iterator; + const auto& facet = std::use_facet>(loc); + auto end = facet.put(os, os, Char(' '), &time, format, modifier); + if (end.failed()) FMT_THROW(format_error("failed to format time")); +} + +template ::value)> +auto write(OutputIt out, const std::tm& time, const std::locale& loc, + char format, char modifier = 0) -> OutputIt { + auto&& buf = get_buffer(out); + do_write(buf, time, loc, format, modifier); + return get_iterator(buf, out); +} + +template ::value)> +auto write(OutputIt out, const std::tm& time, const std::locale& loc, + char format, char modifier = 0) -> OutputIt { + auto&& buf = basic_memory_buffer(); + do_write(buf, time, loc, format, modifier); + return write_encoded_tm_str(out, string_view(buf.data(), buf.size()), loc); +} + +} // namespace detail + +FMT_MODULE_EXPORT_BEGIN + +/** + Converts given time since epoch as ``std::time_t`` value into calendar time, + expressed in local time. Unlike ``std::localtime``, this function is + thread-safe on most platforms. 
+ */ +inline std::tm localtime(std::time_t time) { + struct dispatcher { + std::time_t time_; + std::tm tm_; + + dispatcher(std::time_t t) : time_(t) {} + + bool run() { + using namespace fmt::detail; + return handle(localtime_r(&time_, &tm_)); + } + + bool handle(std::tm* tm) { return tm != nullptr; } + + bool handle(detail::null<>) { + using namespace fmt::detail; + return fallback(localtime_s(&tm_, &time_)); + } + + bool fallback(int res) { return res == 0; } + +#if !FMT_MSC_VERSION + bool fallback(detail::null<>) { + using namespace fmt::detail; + std::tm* tm = std::localtime(&time_); + if (tm) tm_ = *tm; + return tm != nullptr; + } +#endif + }; + dispatcher lt(time); + // Too big time values may be unsupported. + if (!lt.run()) FMT_THROW(format_error("time_t value out of range")); + return lt.tm_; +} + +inline std::tm localtime( + std::chrono::time_point time_point) { + return localtime(std::chrono::system_clock::to_time_t(time_point)); +} + +/** + Converts given time since epoch as ``std::time_t`` value into calendar time, + expressed in Coordinated Universal Time (UTC). Unlike ``std::gmtime``, this + function is thread-safe on most platforms. + */ +inline std::tm gmtime(std::time_t time) { + struct dispatcher { + std::time_t time_; + std::tm tm_; + + dispatcher(std::time_t t) : time_(t) {} + + bool run() { + using namespace fmt::detail; + return handle(gmtime_r(&time_, &tm_)); + } + + bool handle(std::tm* tm) { return tm != nullptr; } + + bool handle(detail::null<>) { + using namespace fmt::detail; + return fallback(gmtime_s(&tm_, &time_)); + } + + bool fallback(int res) { return res == 0; } + +#if !FMT_MSC_VERSION + bool fallback(detail::null<>) { + std::tm* tm = std::gmtime(&time_); + if (tm) tm_ = *tm; + return tm != nullptr; + } +#endif + }; + dispatcher gt(time); + // Too big time values may be unsupported. 
+ if (!gt.run()) FMT_THROW(format_error("time_t value out of range")); + return gt.tm_; +} + +inline std::tm gmtime( + std::chrono::time_point time_point) { + return gmtime(std::chrono::system_clock::to_time_t(time_point)); +} + +FMT_BEGIN_DETAIL_NAMESPACE + +// Writes two-digit numbers a, b and c separated by sep to buf. +// The method by Pavel Novikov based on +// https://johnnylee-sde.github.io/Fast-unsigned-integer-to-time-string/. +inline void write_digit2_separated(char* buf, unsigned a, unsigned b, + unsigned c, char sep) { + unsigned long long digits = + a | (b << 24) | (static_cast(c) << 48); + // Convert each value to BCD. + // We have x = a * 10 + b and we want to convert it to BCD y = a * 16 + b. + // The difference is + // y - x = a * 6 + // a can be found from x: + // a = floor(x / 10) + // then + // y = x + a * 6 = x + floor(x / 10) * 6 + // floor(x / 10) is (x * 205) >> 11 (needs 16 bits). + digits += (((digits * 205) >> 11) & 0x000f00000f00000f) * 6; + // Put low nibbles to high bytes and high nibbles to low bytes. + digits = ((digits & 0x00f00000f00000f0) >> 4) | + ((digits & 0x000f00000f00000f) << 8); + auto usep = static_cast(sep); + // Add ASCII '0' to each digit byte and insert separators. 
+ digits |= 0x3030003030003030 | (usep << 16) | (usep << 40); + + constexpr const size_t len = 8; + if (const_check(is_big_endian())) { + char tmp[len]; + std::memcpy(tmp, &digits, len); + std::reverse_copy(tmp, tmp + len, buf); + } else { + std::memcpy(buf, &digits, len); + } +} + +template FMT_CONSTEXPR inline const char* get_units() { + if (std::is_same::value) return "as"; + if (std::is_same::value) return "fs"; + if (std::is_same::value) return "ps"; + if (std::is_same::value) return "ns"; + if (std::is_same::value) return "µs"; + if (std::is_same::value) return "ms"; + if (std::is_same::value) return "cs"; + if (std::is_same::value) return "ds"; + if (std::is_same>::value) return "s"; + if (std::is_same::value) return "das"; + if (std::is_same::value) return "hs"; + if (std::is_same::value) return "ks"; + if (std::is_same::value) return "Ms"; + if (std::is_same::value) return "Gs"; + if (std::is_same::value) return "Ts"; + if (std::is_same::value) return "Ps"; + if (std::is_same::value) return "Es"; + if (std::is_same>::value) return "m"; + if (std::is_same>::value) return "h"; + return nullptr; +} + +enum class numeric_system { + standard, + // Alternative numeric system, e.g. 十二 instead of 12 in ja_JP locale. + alternative +}; + +// Parses a put_time-like format string and invokes handler actions. 
+template +FMT_CONSTEXPR const Char* parse_chrono_format(const Char* begin, + const Char* end, + Handler&& handler) { + auto ptr = begin; + while (ptr != end) { + auto c = *ptr; + if (c == '}') break; + if (c != '%') { + ++ptr; + continue; + } + if (begin != ptr) handler.on_text(begin, ptr); + ++ptr; // consume '%' + if (ptr == end) FMT_THROW(format_error("invalid format")); + c = *ptr++; + switch (c) { + case '%': + handler.on_text(ptr - 1, ptr); + break; + case 'n': { + const Char newline[] = {'\n'}; + handler.on_text(newline, newline + 1); + break; + } + case 't': { + const Char tab[] = {'\t'}; + handler.on_text(tab, tab + 1); + break; + } + // Year: + case 'Y': + handler.on_year(numeric_system::standard); + break; + case 'y': + handler.on_short_year(numeric_system::standard); + break; + case 'C': + handler.on_century(numeric_system::standard); + break; + case 'G': + handler.on_iso_week_based_year(); + break; + case 'g': + handler.on_iso_week_based_short_year(); + break; + // Day of the week: + case 'a': + handler.on_abbr_weekday(); + break; + case 'A': + handler.on_full_weekday(); + break; + case 'w': + handler.on_dec0_weekday(numeric_system::standard); + break; + case 'u': + handler.on_dec1_weekday(numeric_system::standard); + break; + // Month: + case 'b': + case 'h': + handler.on_abbr_month(); + break; + case 'B': + handler.on_full_month(); + break; + case 'm': + handler.on_dec_month(numeric_system::standard); + break; + // Day of the year/month: + case 'U': + handler.on_dec0_week_of_year(numeric_system::standard); + break; + case 'W': + handler.on_dec1_week_of_year(numeric_system::standard); + break; + case 'V': + handler.on_iso_week_of_year(numeric_system::standard); + break; + case 'j': + handler.on_day_of_year(); + break; + case 'd': + handler.on_day_of_month(numeric_system::standard); + break; + case 'e': + handler.on_day_of_month_space(numeric_system::standard); + break; + // Hour, minute, second: + case 'H': + 
handler.on_24_hour(numeric_system::standard); + break; + case 'I': + handler.on_12_hour(numeric_system::standard); + break; + case 'M': + handler.on_minute(numeric_system::standard); + break; + case 'S': + handler.on_second(numeric_system::standard); + break; + // Other: + case 'c': + handler.on_datetime(numeric_system::standard); + break; + case 'x': + handler.on_loc_date(numeric_system::standard); + break; + case 'X': + handler.on_loc_time(numeric_system::standard); + break; + case 'D': + handler.on_us_date(); + break; + case 'F': + handler.on_iso_date(); + break; + case 'r': + handler.on_12_hour_time(); + break; + case 'R': + handler.on_24_hour_time(); + break; + case 'T': + handler.on_iso_time(); + break; + case 'p': + handler.on_am_pm(); + break; + case 'Q': + handler.on_duration_value(); + break; + case 'q': + handler.on_duration_unit(); + break; + case 'z': + handler.on_utc_offset(); + break; + case 'Z': + handler.on_tz_name(); + break; + // Alternative representation: + case 'E': { + if (ptr == end) FMT_THROW(format_error("invalid format")); + c = *ptr++; + switch (c) { + case 'Y': + handler.on_year(numeric_system::alternative); + break; + case 'y': + handler.on_offset_year(); + break; + case 'C': + handler.on_century(numeric_system::alternative); + break; + case 'c': + handler.on_datetime(numeric_system::alternative); + break; + case 'x': + handler.on_loc_date(numeric_system::alternative); + break; + case 'X': + handler.on_loc_time(numeric_system::alternative); + break; + default: + FMT_THROW(format_error("invalid format")); + } + break; + } + case 'O': + if (ptr == end) FMT_THROW(format_error("invalid format")); + c = *ptr++; + switch (c) { + case 'y': + handler.on_short_year(numeric_system::alternative); + break; + case 'm': + handler.on_dec_month(numeric_system::alternative); + break; + case 'U': + handler.on_dec0_week_of_year(numeric_system::alternative); + break; + case 'W': + handler.on_dec1_week_of_year(numeric_system::alternative); + break; + case 
'V': + handler.on_iso_week_of_year(numeric_system::alternative); + break; + case 'd': + handler.on_day_of_month(numeric_system::alternative); + break; + case 'e': + handler.on_day_of_month_space(numeric_system::alternative); + break; + case 'w': + handler.on_dec0_weekday(numeric_system::alternative); + break; + case 'u': + handler.on_dec1_weekday(numeric_system::alternative); + break; + case 'H': + handler.on_24_hour(numeric_system::alternative); + break; + case 'I': + handler.on_12_hour(numeric_system::alternative); + break; + case 'M': + handler.on_minute(numeric_system::alternative); + break; + case 'S': + handler.on_second(numeric_system::alternative); + break; + default: + FMT_THROW(format_error("invalid format")); + } + break; + default: + FMT_THROW(format_error("invalid format")); + } + begin = ptr; + } + if (begin != ptr) handler.on_text(begin, ptr); + return ptr; +} + +template struct null_chrono_spec_handler { + FMT_CONSTEXPR void unsupported() { + static_cast(this)->unsupported(); + } + FMT_CONSTEXPR void on_year(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_short_year(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_offset_year() { unsupported(); } + FMT_CONSTEXPR void on_century(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_iso_week_based_year() { unsupported(); } + FMT_CONSTEXPR void on_iso_week_based_short_year() { unsupported(); } + FMT_CONSTEXPR void on_abbr_weekday() { unsupported(); } + FMT_CONSTEXPR void on_full_weekday() { unsupported(); } + FMT_CONSTEXPR void on_dec0_weekday(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_dec1_weekday(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_abbr_month() { unsupported(); } + FMT_CONSTEXPR void on_full_month() { unsupported(); } + FMT_CONSTEXPR void on_dec_month(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) { unsupported(); } 
+ FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_day_of_year() { unsupported(); } + FMT_CONSTEXPR void on_day_of_month(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_day_of_month_space(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_24_hour(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_12_hour(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_minute(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_second(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_datetime(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_loc_date(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_loc_time(numeric_system) { unsupported(); } + FMT_CONSTEXPR void on_us_date() { unsupported(); } + FMT_CONSTEXPR void on_iso_date() { unsupported(); } + FMT_CONSTEXPR void on_12_hour_time() { unsupported(); } + FMT_CONSTEXPR void on_24_hour_time() { unsupported(); } + FMT_CONSTEXPR void on_iso_time() { unsupported(); } + FMT_CONSTEXPR void on_am_pm() { unsupported(); } + FMT_CONSTEXPR void on_duration_value() { unsupported(); } + FMT_CONSTEXPR void on_duration_unit() { unsupported(); } + FMT_CONSTEXPR void on_utc_offset() { unsupported(); } + FMT_CONSTEXPR void on_tz_name() { unsupported(); } +}; + +struct tm_format_checker : null_chrono_spec_handler { + FMT_NORETURN void unsupported() { FMT_THROW(format_error("no format")); } + + template + FMT_CONSTEXPR void on_text(const Char*, const Char*) {} + FMT_CONSTEXPR void on_year(numeric_system) {} + FMT_CONSTEXPR void on_short_year(numeric_system) {} + FMT_CONSTEXPR void on_offset_year() {} + FMT_CONSTEXPR void on_century(numeric_system) {} + FMT_CONSTEXPR void on_iso_week_based_year() {} + FMT_CONSTEXPR void on_iso_week_based_short_year() {} + FMT_CONSTEXPR void on_abbr_weekday() {} + FMT_CONSTEXPR void on_full_weekday() {} + FMT_CONSTEXPR void on_dec0_weekday(numeric_system) {} + FMT_CONSTEXPR void 
on_dec1_weekday(numeric_system) {} + FMT_CONSTEXPR void on_abbr_month() {} + FMT_CONSTEXPR void on_full_month() {} + FMT_CONSTEXPR void on_dec_month(numeric_system) {} + FMT_CONSTEXPR void on_dec0_week_of_year(numeric_system) {} + FMT_CONSTEXPR void on_dec1_week_of_year(numeric_system) {} + FMT_CONSTEXPR void on_iso_week_of_year(numeric_system) {} + FMT_CONSTEXPR void on_day_of_year() {} + FMT_CONSTEXPR void on_day_of_month(numeric_system) {} + FMT_CONSTEXPR void on_day_of_month_space(numeric_system) {} + FMT_CONSTEXPR void on_24_hour(numeric_system) {} + FMT_CONSTEXPR void on_12_hour(numeric_system) {} + FMT_CONSTEXPR void on_minute(numeric_system) {} + FMT_CONSTEXPR void on_second(numeric_system) {} + FMT_CONSTEXPR void on_datetime(numeric_system) {} + FMT_CONSTEXPR void on_loc_date(numeric_system) {} + FMT_CONSTEXPR void on_loc_time(numeric_system) {} + FMT_CONSTEXPR void on_us_date() {} + FMT_CONSTEXPR void on_iso_date() {} + FMT_CONSTEXPR void on_12_hour_time() {} + FMT_CONSTEXPR void on_24_hour_time() {} + FMT_CONSTEXPR void on_iso_time() {} + FMT_CONSTEXPR void on_am_pm() {} + FMT_CONSTEXPR void on_utc_offset() {} + FMT_CONSTEXPR void on_tz_name() {} +}; + +inline const char* tm_wday_full_name(int wday) { + static constexpr const char* full_name_list[] = { + "Sunday", "Monday", "Tuesday", "Wednesday", + "Thursday", "Friday", "Saturday"}; + return wday >= 0 && wday <= 6 ? full_name_list[wday] : "?"; +} +inline const char* tm_wday_short_name(int wday) { + static constexpr const char* short_name_list[] = {"Sun", "Mon", "Tue", "Wed", + "Thu", "Fri", "Sat"}; + return wday >= 0 && wday <= 6 ? short_name_list[wday] : "???"; +} + +inline const char* tm_mon_full_name(int mon) { + static constexpr const char* full_name_list[] = { + "January", "February", "March", "April", "May", "June", + "July", "August", "September", "October", "November", "December"}; + return mon >= 0 && mon <= 11 ? 
full_name_list[mon] : "?"; +} +inline const char* tm_mon_short_name(int mon) { + static constexpr const char* short_name_list[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + }; + return mon >= 0 && mon <= 11 ? short_name_list[mon] : "???"; +} + +template +struct has_member_data_tm_gmtoff : std::false_type {}; +template +struct has_member_data_tm_gmtoff> + : std::true_type {}; + +template +struct has_member_data_tm_zone : std::false_type {}; +template +struct has_member_data_tm_zone> + : std::true_type {}; + +#if FMT_USE_TZSET +inline void tzset_once() { + static bool init = []() -> bool { + _tzset(); + return true; + }(); + ignore_unused(init); +} +#endif + +template class tm_writer { + private: + static constexpr int days_per_week = 7; + + const std::locale& loc_; + const bool is_classic_; + OutputIt out_; + const std::tm& tm_; + + auto tm_sec() const noexcept -> int { + FMT_ASSERT(tm_.tm_sec >= 0 && tm_.tm_sec <= 61, ""); + return tm_.tm_sec; + } + auto tm_min() const noexcept -> int { + FMT_ASSERT(tm_.tm_min >= 0 && tm_.tm_min <= 59, ""); + return tm_.tm_min; + } + auto tm_hour() const noexcept -> int { + FMT_ASSERT(tm_.tm_hour >= 0 && tm_.tm_hour <= 23, ""); + return tm_.tm_hour; + } + auto tm_mday() const noexcept -> int { + FMT_ASSERT(tm_.tm_mday >= 1 && tm_.tm_mday <= 31, ""); + return tm_.tm_mday; + } + auto tm_mon() const noexcept -> int { + FMT_ASSERT(tm_.tm_mon >= 0 && tm_.tm_mon <= 11, ""); + return tm_.tm_mon; + } + auto tm_year() const noexcept -> long long { return 1900ll + tm_.tm_year; } + auto tm_wday() const noexcept -> int { + FMT_ASSERT(tm_.tm_wday >= 0 && tm_.tm_wday <= 6, ""); + return tm_.tm_wday; + } + auto tm_yday() const noexcept -> int { + FMT_ASSERT(tm_.tm_yday >= 0 && tm_.tm_yday <= 365, ""); + return tm_.tm_yday; + } + + auto tm_hour12() const noexcept -> int { + const auto h = tm_hour(); + const auto z = h < 12 ? h : h - 12; + return z == 0 ? 
12 : z; + } + + // POSIX and the C Standard are unclear or inconsistent about what %C and %y + // do if the year is negative or exceeds 9999. Use the convention that %C + // concatenated with %y yields the same output as %Y, and that %Y contains at + // least 4 characters, with more only if necessary. + auto split_year_lower(long long year) const noexcept -> int { + auto l = year % 100; + if (l < 0) l = -l; // l in [0, 99] + return static_cast(l); + } + + // Algorithm: + // https://en.wikipedia.org/wiki/ISO_week_date#Calculating_the_week_number_from_a_month_and_day_of_the_month_or_ordinal_date + auto iso_year_weeks(long long curr_year) const noexcept -> int { + const auto prev_year = curr_year - 1; + const auto curr_p = + (curr_year + curr_year / 4 - curr_year / 100 + curr_year / 400) % + days_per_week; + const auto prev_p = + (prev_year + prev_year / 4 - prev_year / 100 + prev_year / 400) % + days_per_week; + return 52 + ((curr_p == 4 || prev_p == 3) ? 1 : 0); + } + auto iso_week_num(int tm_yday, int tm_wday) const noexcept -> int { + return (tm_yday + 11 - (tm_wday == 0 ? days_per_week : tm_wday)) / + days_per_week; + } + auto tm_iso_week_year() const noexcept -> long long { + const auto year = tm_year(); + const auto w = iso_week_num(tm_yday(), tm_wday()); + if (w < 1) return year - 1; + if (w > iso_year_weeks(year)) return year + 1; + return year; + } + auto tm_iso_week_of_year() const noexcept -> int { + const auto year = tm_year(); + const auto w = iso_week_num(tm_yday(), tm_wday()); + if (w < 1) return iso_year_weeks(year - 1); + if (w > iso_year_weeks(year)) return 1; + return w; + } + + void write1(int value) { + *out_++ = static_cast('0' + to_unsigned(value) % 10); + } + void write2(int value) { + const char* d = digits2(to_unsigned(value) % 100); + *out_++ = *d++; + *out_++ = *d; + } + + void write_year_extended(long long year) { + // At least 4 characters. 
+ int width = 4; + if (year < 0) { + *out_++ = '-'; + year = 0 - year; + --width; + } + uint32_or_64_or_128_t n = to_unsigned(year); + const int num_digits = count_digits(n); + if (width > num_digits) out_ = std::fill_n(out_, width - num_digits, '0'); + out_ = format_decimal(out_, n, num_digits).end; + } + void write_year(long long year) { + if (year >= 0 && year < 10000) { + write2(static_cast(year / 100)); + write2(static_cast(year % 100)); + } else { + write_year_extended(year); + } + } + + void write_utc_offset(long offset) { + if (offset < 0) { + *out_++ = '-'; + offset = -offset; + } else { + *out_++ = '+'; + } + offset /= 60; + write2(static_cast(offset / 60)); + write2(static_cast(offset % 60)); + } + template ::value)> + void format_utc_offset_impl(const T& tm) { + write_utc_offset(tm.tm_gmtoff); + } + template ::value)> + void format_utc_offset_impl(const T& tm) { +#if defined(_WIN32) && defined(_UCRT) +# if FMT_USE_TZSET + tzset_once(); +# endif + long offset = 0; + _get_timezone(&offset); + if (tm.tm_isdst) { + long dstbias = 0; + _get_dstbias(&dstbias); + offset += dstbias; + } + write_utc_offset(-offset); +#else + ignore_unused(tm); + format_localized('z'); +#endif + } + + template ::value)> + void format_tz_name_impl(const T& tm) { + if (is_classic_) + out_ = write_tm_str(out_, tm.tm_zone, loc_); + else + format_localized('Z'); + } + template ::value)> + void format_tz_name_impl(const T&) { + format_localized('Z'); + } + + void format_localized(char format, char modifier = 0) { + out_ = write(out_, tm_, loc_, format, modifier); + } + + public: + tm_writer(const std::locale& loc, OutputIt out, const std::tm& tm) + : loc_(loc), + is_classic_(loc_ == get_classic_locale()), + out_(out), + tm_(tm) {} + + OutputIt out() const { return out_; } + + FMT_CONSTEXPR void on_text(const Char* begin, const Char* end) { + out_ = copy_str(begin, end, out_); + } + + void on_abbr_weekday() { + if (is_classic_) + out_ = write(out_, tm_wday_short_name(tm_wday())); + else 
+ format_localized('a'); + } + void on_full_weekday() { + if (is_classic_) + out_ = write(out_, tm_wday_full_name(tm_wday())); + else + format_localized('A'); + } + void on_dec0_weekday(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) return write1(tm_wday()); + format_localized('w', 'O'); + } + void on_dec1_weekday(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) { + auto wday = tm_wday(); + write1(wday == 0 ? days_per_week : wday); + } else { + format_localized('u', 'O'); + } + } + + void on_abbr_month() { + if (is_classic_) + out_ = write(out_, tm_mon_short_name(tm_mon())); + else + format_localized('b'); + } + void on_full_month() { + if (is_classic_) + out_ = write(out_, tm_mon_full_name(tm_mon())); + else + format_localized('B'); + } + + void on_datetime(numeric_system ns) { + if (is_classic_) { + on_abbr_weekday(); + *out_++ = ' '; + on_abbr_month(); + *out_++ = ' '; + on_day_of_month_space(numeric_system::standard); + *out_++ = ' '; + on_iso_time(); + *out_++ = ' '; + on_year(numeric_system::standard); + } else { + format_localized('c', ns == numeric_system::standard ? '\0' : 'E'); + } + } + void on_loc_date(numeric_system ns) { + if (is_classic_) + on_us_date(); + else + format_localized('x', ns == numeric_system::standard ? '\0' : 'E'); + } + void on_loc_time(numeric_system ns) { + if (is_classic_) + on_iso_time(); + else + format_localized('X', ns == numeric_system::standard ? 
'\0' : 'E'); + } + void on_us_date() { + char buf[8]; + write_digit2_separated(buf, to_unsigned(tm_mon() + 1), + to_unsigned(tm_mday()), + to_unsigned(split_year_lower(tm_year())), '/'); + out_ = copy_str(std::begin(buf), std::end(buf), out_); + } + void on_iso_date() { + auto year = tm_year(); + char buf[10]; + size_t offset = 0; + if (year >= 0 && year < 10000) { + copy2(buf, digits2(static_cast(year / 100))); + } else { + offset = 4; + write_year_extended(year); + year = 0; + } + write_digit2_separated(buf + 2, static_cast(year % 100), + to_unsigned(tm_mon() + 1), to_unsigned(tm_mday()), + '-'); + out_ = copy_str(std::begin(buf) + offset, std::end(buf), out_); + } + + void on_utc_offset() { format_utc_offset_impl(tm_); } + void on_tz_name() { format_tz_name_impl(tm_); } + + void on_year(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) + return write_year(tm_year()); + format_localized('Y', 'E'); + } + void on_short_year(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) + return write2(split_year_lower(tm_year())); + format_localized('y', 'O'); + } + void on_offset_year() { + if (is_classic_) return write2(split_year_lower(tm_year())); + format_localized('y', 'E'); + } + + void on_century(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) { + auto year = tm_year(); + auto upper = year / 100; + if (year >= -99 && year < 0) { + // Zero upper on negative year. 
+ *out_++ = '-'; + *out_++ = '0'; + } else if (upper >= 0 && upper < 100) { + write2(static_cast(upper)); + } else { + out_ = write(out_, upper); + } + } else { + format_localized('C', 'E'); + } + } + + void on_dec_month(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) + return write2(tm_mon() + 1); + format_localized('m', 'O'); + } + + void on_dec0_week_of_year(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) + return write2((tm_yday() + days_per_week - tm_wday()) / days_per_week); + format_localized('U', 'O'); + } + void on_dec1_week_of_year(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) { + auto wday = tm_wday(); + write2((tm_yday() + days_per_week - + (wday == 0 ? (days_per_week - 1) : (wday - 1))) / + days_per_week); + } else { + format_localized('W', 'O'); + } + } + void on_iso_week_of_year(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) + return write2(tm_iso_week_of_year()); + format_localized('V', 'O'); + } + + void on_iso_week_based_year() { write_year(tm_iso_week_year()); } + void on_iso_week_based_short_year() { + write2(split_year_lower(tm_iso_week_year())); + } + + void on_day_of_year() { + auto yday = tm_yday() + 1; + write1(yday / 100); + write2(yday % 100); + } + void on_day_of_month(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) return write2(tm_mday()); + format_localized('d', 'O'); + } + void on_day_of_month_space(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) { + auto mday = to_unsigned(tm_mday()) % 100; + const char* d2 = digits2(mday); + *out_++ = mday < 10 ? 
' ' : d2[0]; + *out_++ = d2[1]; + } else { + format_localized('e', 'O'); + } + } + + void on_24_hour(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) return write2(tm_hour()); + format_localized('H', 'O'); + } + void on_12_hour(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) + return write2(tm_hour12()); + format_localized('I', 'O'); + } + void on_minute(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) return write2(tm_min()); + format_localized('M', 'O'); + } + void on_second(numeric_system ns) { + if (is_classic_ || ns == numeric_system::standard) return write2(tm_sec()); + format_localized('S', 'O'); + } + + void on_12_hour_time() { + if (is_classic_) { + char buf[8]; + write_digit2_separated(buf, to_unsigned(tm_hour12()), + to_unsigned(tm_min()), to_unsigned(tm_sec()), ':'); + out_ = copy_str(std::begin(buf), std::end(buf), out_); + *out_++ = ' '; + on_am_pm(); + } else { + format_localized('r'); + } + } + void on_24_hour_time() { + write2(tm_hour()); + *out_++ = ':'; + write2(tm_min()); + } + void on_iso_time() { + char buf[8]; + write_digit2_separated(buf, to_unsigned(tm_hour()), to_unsigned(tm_min()), + to_unsigned(tm_sec()), ':'); + out_ = copy_str(std::begin(buf), std::end(buf), out_); + } + + void on_am_pm() { + if (is_classic_) { + *out_++ = tm_hour() < 12 ? 'A' : 'P'; + *out_++ = 'M'; + } else { + format_localized('p'); + } + } + + // These apply to chrono durations but not tm. 
+ void on_duration_value() {} + void on_duration_unit() {} +}; + +struct chrono_format_checker : null_chrono_spec_handler { + FMT_NORETURN void unsupported() { FMT_THROW(format_error("no date")); } + + template + FMT_CONSTEXPR void on_text(const Char*, const Char*) {} + FMT_CONSTEXPR void on_24_hour(numeric_system) {} + FMT_CONSTEXPR void on_12_hour(numeric_system) {} + FMT_CONSTEXPR void on_minute(numeric_system) {} + FMT_CONSTEXPR void on_second(numeric_system) {} + FMT_CONSTEXPR void on_12_hour_time() {} + FMT_CONSTEXPR void on_24_hour_time() {} + FMT_CONSTEXPR void on_iso_time() {} + FMT_CONSTEXPR void on_am_pm() {} + FMT_CONSTEXPR void on_duration_value() {} + FMT_CONSTEXPR void on_duration_unit() {} +}; + +template ::value)> +inline bool isfinite(T) { + return true; +} + +// Converts value to Int and checks that it's in the range [0, upper). +template ::value)> +inline Int to_nonnegative_int(T value, Int upper) { + FMT_ASSERT(std::is_unsigned::value || + (value >= 0 && to_unsigned(value) <= to_unsigned(upper)), + "invalid value"); + (void)upper; + return static_cast(value); +} +template ::value)> +inline Int to_nonnegative_int(T value, Int upper) { + if (value < 0 || value > static_cast(upper)) + FMT_THROW(format_error("invalid value")); + return static_cast(value); +} + +template ::value)> +inline T mod(T x, int y) { + return x % static_cast(y); +} +template ::value)> +inline T mod(T x, int y) { + return std::fmod(x, static_cast(y)); +} + +// If T is an integral type, maps T to its unsigned counterpart, otherwise +// leaves it unchanged (unlike std::make_unsigned). 
+template ::value> +struct make_unsigned_or_unchanged { + using type = T; +}; + +template struct make_unsigned_or_unchanged { + using type = typename std::make_unsigned::type; +}; + +#if FMT_SAFE_DURATION_CAST +// throwing version of safe_duration_cast +template +To fmt_safe_duration_cast(std::chrono::duration from) { + int ec; + To to = safe_duration_cast::safe_duration_cast(from, ec); + if (ec) FMT_THROW(format_error("cannot format duration")); + return to; +} +#endif + +template ::value)> +inline std::chrono::duration get_milliseconds( + std::chrono::duration d) { + // this may overflow and/or the result may not fit in the + // target type. +#if FMT_SAFE_DURATION_CAST + using CommonSecondsType = + typename std::common_type::type; + const auto d_as_common = fmt_safe_duration_cast(d); + const auto d_as_whole_seconds = + fmt_safe_duration_cast(d_as_common); + // this conversion should be nonproblematic + const auto diff = d_as_common - d_as_whole_seconds; + const auto ms = + fmt_safe_duration_cast>(diff); + return ms; +#else + auto s = std::chrono::duration_cast(d); + return std::chrono::duration_cast(d - s); +#endif +} + +// Counts the number of fractional digits in the range [0, 18] according to the +// C++20 spec. If more than 18 fractional digits are required then returns 6 for +// microseconds precision. +template () / 10)> +struct count_fractional_digits { + static constexpr int value = + Num % Den == 0 ? N : count_fractional_digits::value; +}; + +// Base case that doesn't instantiate any more templates +// in order to avoid overflow. +template +struct count_fractional_digits { + static constexpr int value = (Num % Den == 0) ? N : 6; +}; + +constexpr long long pow10(std::uint32_t n) { + return n == 0 ? 
1 : 10 * pow10(n - 1); +} + +template ::is_signed)> +constexpr std::chrono::duration abs( + std::chrono::duration d) { + // We need to compare the duration using the count() method directly + // due to a compiler bug in clang-11 regarding the spaceship operator, + // when -Wzero-as-null-pointer-constant is enabled. + // In clang-12 the bug has been fixed. See + // https://bugs.llvm.org/show_bug.cgi?id=46235 and the reproducible example: + // https://www.godbolt.org/z/Knbb5joYx. + return d.count() >= d.zero().count() ? d : -d; +} + +template ::is_signed)> +constexpr std::chrono::duration abs( + std::chrono::duration d) { + return d; +} + +template ::value)> +OutputIt format_duration_value(OutputIt out, Rep val, int) { + return write(out, val); +} + +template ::value)> +OutputIt format_duration_value(OutputIt out, Rep val, int precision) { + auto specs = basic_format_specs(); + specs.precision = precision; + specs.type = precision >= 0 ? presentation_type::fixed_lower + : presentation_type::general_lower; + return write(out, val, specs); +} + +template +OutputIt copy_unit(string_view unit, OutputIt out, Char) { + return std::copy(unit.begin(), unit.end(), out); +} + +template +OutputIt copy_unit(string_view unit, OutputIt out, wchar_t) { + // This works when wchar_t is UTF-32 because units only contain characters + // that have the same representation in UTF-16 and UTF-32. 
+ utf8_to_utf16 u(unit); + return std::copy(u.c_str(), u.c_str() + u.size(), out); +} + +template +OutputIt format_duration_unit(OutputIt out) { + if (const char* unit = get_units()) + return copy_unit(string_view(unit), out, Char()); + *out++ = '['; + out = write(out, Period::num); + if (const_check(Period::den != 1)) { + *out++ = '/'; + out = write(out, Period::den); + } + *out++ = ']'; + *out++ = 's'; + return out; +} + +class get_locale { + private: + union { + std::locale locale_; + }; + bool has_locale_ = false; + + public: + get_locale(bool localized, locale_ref loc) : has_locale_(localized) { + if (localized) + ::new (&locale_) std::locale(loc.template get()); + } + ~get_locale() { + if (has_locale_) locale_.~locale(); + } + operator const std::locale&() const { + return has_locale_ ? locale_ : get_classic_locale(); + } +}; + +template +struct chrono_formatter { + FormatContext& context; + OutputIt out; + int precision; + bool localized = false; + // rep is unsigned to avoid overflow. + using rep = + conditional_t::value && sizeof(Rep) < sizeof(int), + unsigned, typename make_unsigned_or_unchanged::type>; + rep val; + using seconds = std::chrono::duration; + seconds s; + using milliseconds = std::chrono::duration; + bool negative; + + using char_type = typename FormatContext::char_type; + using tm_writer_type = tm_writer; + + chrono_formatter(FormatContext& ctx, OutputIt o, + std::chrono::duration d) + : context(ctx), + out(o), + val(static_cast(d.count())), + negative(false) { + if (d.count() < 0) { + val = 0 - val; + negative = true; + } + + // this may overflow and/or the result may not fit in the + // target type. +#if FMT_SAFE_DURATION_CAST + // might need checked conversion (rep!=Rep) + auto tmpval = std::chrono::duration(val); + s = fmt_safe_duration_cast(tmpval); +#else + s = std::chrono::duration_cast( + std::chrono::duration(val)); +#endif + } + + // returns true if nan or inf, writes to out. 
+ bool handle_nan_inf() { + if (isfinite(val)) { + return false; + } + if (isnan(val)) { + write_nan(); + return true; + } + // must be +-inf + if (val > 0) { + write_pinf(); + } else { + write_ninf(); + } + return true; + } + + Rep hour() const { return static_cast(mod((s.count() / 3600), 24)); } + + Rep hour12() const { + Rep hour = static_cast(mod((s.count() / 3600), 12)); + return hour <= 0 ? 12 : hour; + } + + Rep minute() const { return static_cast(mod((s.count() / 60), 60)); } + Rep second() const { return static_cast(mod(s.count(), 60)); } + + std::tm time() const { + auto time = std::tm(); + time.tm_hour = to_nonnegative_int(hour(), 24); + time.tm_min = to_nonnegative_int(minute(), 60); + time.tm_sec = to_nonnegative_int(second(), 60); + return time; + } + + void write_sign() { + if (negative) { + *out++ = '-'; + negative = false; + } + } + + void write(Rep value, int width) { + write_sign(); + if (isnan(value)) return write_nan(); + uint32_or_64_or_128_t n = + to_unsigned(to_nonnegative_int(value, max_value())); + int num_digits = detail::count_digits(n); + if (width > num_digits) out = std::fill_n(out, width - num_digits, '0'); + out = format_decimal(out, n, num_digits).end; + } + + template void write_fractional_seconds(Duration d) { + FMT_ASSERT(!std::is_floating_point::value, ""); + constexpr auto num_fractional_digits = + count_fractional_digits::value; + + using subsecond_precision = std::chrono::duration< + typename std::common_type::type, + std::ratio<1, detail::pow10(num_fractional_digits)>>; + if (std::ratio_less::value) { + *out++ = '.'; + auto fractional = + detail::abs(d) - std::chrono::duration_cast(d); + auto subseconds = + std::chrono::treat_as_floating_point< + typename subsecond_precision::rep>::value + ? 
fractional.count() + : std::chrono::duration_cast(fractional) + .count(); + uint32_or_64_or_128_t n = + to_unsigned(to_nonnegative_int(subseconds, max_value())); + int num_digits = detail::count_digits(n); + if (num_fractional_digits > num_digits) + out = std::fill_n(out, num_fractional_digits - num_digits, '0'); + out = format_decimal(out, n, num_digits).end; + } + } + + void write_nan() { std::copy_n("nan", 3, out); } + void write_pinf() { std::copy_n("inf", 3, out); } + void write_ninf() { std::copy_n("-inf", 4, out); } + + template + void format_tm(const tm& time, Callback cb, Args... args) { + if (isnan(val)) return write_nan(); + get_locale loc(localized, context.locale()); + auto w = tm_writer_type(loc, out, time); + (w.*cb)(args...); + out = w.out(); + } + + void on_text(const char_type* begin, const char_type* end) { + std::copy(begin, end, out); + } + + // These are not implemented because durations don't have date information. + void on_abbr_weekday() {} + void on_full_weekday() {} + void on_dec0_weekday(numeric_system) {} + void on_dec1_weekday(numeric_system) {} + void on_abbr_month() {} + void on_full_month() {} + void on_datetime(numeric_system) {} + void on_loc_date(numeric_system) {} + void on_loc_time(numeric_system) {} + void on_us_date() {} + void on_iso_date() {} + void on_utc_offset() {} + void on_tz_name() {} + void on_year(numeric_system) {} + void on_short_year(numeric_system) {} + void on_offset_year() {} + void on_century(numeric_system) {} + void on_iso_week_based_year() {} + void on_iso_week_based_short_year() {} + void on_dec_month(numeric_system) {} + void on_dec0_week_of_year(numeric_system) {} + void on_dec1_week_of_year(numeric_system) {} + void on_iso_week_of_year(numeric_system) {} + void on_day_of_year() {} + void on_day_of_month(numeric_system) {} + void on_day_of_month_space(numeric_system) {} + + void on_24_hour(numeric_system ns) { + if (handle_nan_inf()) return; + + if (ns == numeric_system::standard) return write(hour(), 
2); + auto time = tm(); + time.tm_hour = to_nonnegative_int(hour(), 24); + format_tm(time, &tm_writer_type::on_24_hour, ns); + } + + void on_12_hour(numeric_system ns) { + if (handle_nan_inf()) return; + + if (ns == numeric_system::standard) return write(hour12(), 2); + auto time = tm(); + time.tm_hour = to_nonnegative_int(hour12(), 12); + format_tm(time, &tm_writer_type::on_12_hour, ns); + } + + void on_minute(numeric_system ns) { + if (handle_nan_inf()) return; + + if (ns == numeric_system::standard) return write(minute(), 2); + auto time = tm(); + time.tm_min = to_nonnegative_int(minute(), 60); + format_tm(time, &tm_writer_type::on_minute, ns); + } + + void on_second(numeric_system ns) { + if (handle_nan_inf()) return; + + if (ns == numeric_system::standard) { + if (std::is_floating_point::value) { + constexpr auto num_fractional_digits = + count_fractional_digits::value; + auto buf = memory_buffer(); + format_to(std::back_inserter(buf), runtime("{:.{}f}"), + std::fmod(val * static_cast(Period::num) / + static_cast(Period::den), + static_cast(60)), + num_fractional_digits); + if (negative) *out++ = '-'; + if (buf.size() < 2 || buf[1] == '.') *out++ = '0'; + out = std::copy(buf.begin(), buf.end(), out); + } else { + write(second(), 2); + write_fractional_seconds(std::chrono::duration(val)); + } + return; + } + auto time = tm(); + time.tm_sec = to_nonnegative_int(second(), 60); + format_tm(time, &tm_writer_type::on_second, ns); + } + + void on_12_hour_time() { + if (handle_nan_inf()) return; + format_tm(time(), &tm_writer_type::on_12_hour_time); + } + + void on_24_hour_time() { + if (handle_nan_inf()) { + *out++ = ':'; + handle_nan_inf(); + return; + } + + write(hour(), 2); + *out++ = ':'; + write(minute(), 2); + } + + void on_iso_time() { + on_24_hour_time(); + *out++ = ':'; + if (handle_nan_inf()) return; + on_second(numeric_system::standard); + } + + void on_am_pm() { + if (handle_nan_inf()) return; + format_tm(time(), &tm_writer_type::on_am_pm); + } + + void 
on_duration_value() { + if (handle_nan_inf()) return; + write_sign(); + out = format_duration_value(out, val, precision); + } + + void on_duration_unit() { + out = format_duration_unit(out); + } +}; + +FMT_END_DETAIL_NAMESPACE + +#if defined(__cpp_lib_chrono) && __cpp_lib_chrono >= 201907 +using weekday = std::chrono::weekday; +#else +// A fallback version of weekday. +class weekday { + private: + unsigned char value; + + public: + weekday() = default; + explicit constexpr weekday(unsigned wd) noexcept + : value(static_cast(wd != 7 ? wd : 0)) {} + constexpr unsigned c_encoding() const noexcept { return value; } +}; + +class year_month_day {}; +#endif + +// A rudimentary weekday formatter. +template struct formatter { + private: + bool localized = false; + + public: + FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) + -> decltype(ctx.begin()) { + auto begin = ctx.begin(), end = ctx.end(); + if (begin != end && *begin == 'L') { + ++begin; + localized = true; + } + return begin; + } + + template + auto format(weekday wd, FormatContext& ctx) const -> decltype(ctx.out()) { + auto time = std::tm(); + time.tm_wday = static_cast(wd.c_encoding()); + detail::get_locale loc(localized, ctx.locale()); + auto w = detail::tm_writer(loc, ctx.out(), time); + w.on_abbr_weekday(); + return w.out(); + } +}; + +template +struct formatter, Char> { + private: + basic_format_specs specs; + int precision = -1; + using arg_ref_type = detail::arg_ref; + arg_ref_type width_ref; + arg_ref_type precision_ref; + bool localized = false; + basic_string_view format_str; + using duration = std::chrono::duration; + + struct spec_handler { + formatter& f; + basic_format_parse_context& context; + basic_string_view format_str; + + template FMT_CONSTEXPR arg_ref_type make_arg_ref(Id arg_id) { + context.check_arg_id(arg_id); + return arg_ref_type(arg_id); + } + + FMT_CONSTEXPR arg_ref_type make_arg_ref(basic_string_view arg_id) { + context.check_arg_id(arg_id); + return arg_ref_type(arg_id); + } 
+ + FMT_CONSTEXPR arg_ref_type make_arg_ref(detail::auto_id) { + return arg_ref_type(context.next_arg_id()); + } + + void on_error(const char* msg) { FMT_THROW(format_error(msg)); } + FMT_CONSTEXPR void on_fill(basic_string_view fill) { + f.specs.fill = fill; + } + FMT_CONSTEXPR void on_align(align_t align) { f.specs.align = align; } + FMT_CONSTEXPR void on_width(int width) { f.specs.width = width; } + FMT_CONSTEXPR void on_precision(int _precision) { + f.precision = _precision; + } + FMT_CONSTEXPR void end_precision() {} + + template FMT_CONSTEXPR void on_dynamic_width(Id arg_id) { + f.width_ref = make_arg_ref(arg_id); + } + + template FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) { + f.precision_ref = make_arg_ref(arg_id); + } + }; + + using iterator = typename basic_format_parse_context::iterator; + struct parse_range { + iterator begin; + iterator end; + }; + + FMT_CONSTEXPR parse_range do_parse(basic_format_parse_context& ctx) { + auto begin = ctx.begin(), end = ctx.end(); + if (begin == end || *begin == '}') return {begin, begin}; + spec_handler handler{*this, ctx, format_str}; + begin = detail::parse_align(begin, end, handler); + if (begin == end) return {begin, begin}; + begin = detail::parse_width(begin, end, handler); + if (begin == end) return {begin, begin}; + if (*begin == '.') { + if (std::is_floating_point::value) + begin = detail::parse_precision(begin, end, handler); + else + handler.on_error("precision not allowed for this argument type"); + } + if (begin != end && *begin == 'L') { + ++begin; + localized = true; + } + end = detail::parse_chrono_format(begin, end, + detail::chrono_format_checker()); + return {begin, end}; + } + + public: + FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) + -> decltype(ctx.begin()) { + auto range = do_parse(ctx); + format_str = basic_string_view( + &*range.begin, detail::to_unsigned(range.end - range.begin)); + return range.end; + } + + template + auto format(const duration& d, FormatContext& ctx) 
const + -> decltype(ctx.out()) { + auto specs_copy = specs; + auto precision_copy = precision; + auto begin = format_str.begin(), end = format_str.end(); + // As a possible future optimization, we could avoid extra copying if width + // is not specified. + basic_memory_buffer buf; + auto out = std::back_inserter(buf); + detail::handle_dynamic_spec(specs_copy.width, + width_ref, ctx); + detail::handle_dynamic_spec(precision_copy, + precision_ref, ctx); + if (begin == end || *begin == '}') { + out = detail::format_duration_value(out, d.count(), precision_copy); + detail::format_duration_unit(out); + } else { + detail::chrono_formatter f( + ctx, out, d); + f.precision = precision_copy; + f.localized = localized; + detail::parse_chrono_format(begin, end, f); + } + return detail::write( + ctx.out(), basic_string_view(buf.data(), buf.size()), specs_copy); + } +}; + +template +struct formatter, + Char> : formatter { + FMT_CONSTEXPR formatter() { + basic_string_view default_specs = + detail::string_literal{}; + this->do_parse(default_specs.begin(), default_specs.end()); + } + + template + auto format(std::chrono::time_point val, + FormatContext& ctx) const -> decltype(ctx.out()) { + return formatter::format(localtime(val), ctx); + } +}; + +template struct formatter { + private: + enum class spec { + unknown, + year_month_day, + hh_mm_ss, + }; + spec spec_ = spec::unknown; + basic_string_view specs; + + protected: + template FMT_CONSTEXPR auto do_parse(It begin, It end) -> It { + if (begin != end && *begin == ':') ++begin; + end = detail::parse_chrono_format(begin, end, detail::tm_format_checker()); + // Replace default spec only if the new spec is not empty. + if (end != begin) specs = {begin, detail::to_unsigned(end - begin)}; + return end; + } + + public: + FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) + -> decltype(ctx.begin()) { + auto end = this->do_parse(ctx.begin(), ctx.end()); + // basic_string_view<>::compare isn't constexpr before C++17. 
+ if (specs.size() == 2 && specs[0] == Char('%')) { + if (specs[1] == Char('F')) + spec_ = spec::year_month_day; + else if (specs[1] == Char('T')) + spec_ = spec::hh_mm_ss; + } + return end; + } + + template + auto format(const std::tm& tm, FormatContext& ctx) const + -> decltype(ctx.out()) { + const auto loc_ref = ctx.locale(); + detail::get_locale loc(static_cast(loc_ref), loc_ref); + auto w = detail::tm_writer(loc, ctx.out(), tm); + if (spec_ == spec::year_month_day) + w.on_iso_date(); + else if (spec_ == spec::hh_mm_ss) + w.on_iso_time(); + else + detail::parse_chrono_format(specs.begin(), specs.end(), w); + return w.out(); + } +}; + +FMT_MODULE_EXPORT_END +FMT_END_NAMESPACE + +#endif // FMT_CHRONO_H_ diff --git a/libkram/fmt/color.h b/libkram/fmt/color.h new file mode 100644 index 00000000..06b90ba1 --- /dev/null +++ b/libkram/fmt/color.h @@ -0,0 +1,651 @@ +// Formatting library for C++ - color support +// +// Copyright (c) 2018 - present, Victor Zverovich and fmt contributors +// All rights reserved. +// +// For the license information refer to format.h. 
+ +#ifndef FMT_COLOR_H_ +#define FMT_COLOR_H_ + +#include "format.h" + +FMT_BEGIN_NAMESPACE +FMT_MODULE_EXPORT_BEGIN + +enum class color : uint32_t { + alice_blue = 0xF0F8FF, // rgb(240,248,255) + antique_white = 0xFAEBD7, // rgb(250,235,215) + aqua = 0x00FFFF, // rgb(0,255,255) + aquamarine = 0x7FFFD4, // rgb(127,255,212) + azure = 0xF0FFFF, // rgb(240,255,255) + beige = 0xF5F5DC, // rgb(245,245,220) + bisque = 0xFFE4C4, // rgb(255,228,196) + black = 0x000000, // rgb(0,0,0) + blanched_almond = 0xFFEBCD, // rgb(255,235,205) + blue = 0x0000FF, // rgb(0,0,255) + blue_violet = 0x8A2BE2, // rgb(138,43,226) + brown = 0xA52A2A, // rgb(165,42,42) + burly_wood = 0xDEB887, // rgb(222,184,135) + cadet_blue = 0x5F9EA0, // rgb(95,158,160) + chartreuse = 0x7FFF00, // rgb(127,255,0) + chocolate = 0xD2691E, // rgb(210,105,30) + coral = 0xFF7F50, // rgb(255,127,80) + cornflower_blue = 0x6495ED, // rgb(100,149,237) + cornsilk = 0xFFF8DC, // rgb(255,248,220) + crimson = 0xDC143C, // rgb(220,20,60) + cyan = 0x00FFFF, // rgb(0,255,255) + dark_blue = 0x00008B, // rgb(0,0,139) + dark_cyan = 0x008B8B, // rgb(0,139,139) + dark_golden_rod = 0xB8860B, // rgb(184,134,11) + dark_gray = 0xA9A9A9, // rgb(169,169,169) + dark_green = 0x006400, // rgb(0,100,0) + dark_khaki = 0xBDB76B, // rgb(189,183,107) + dark_magenta = 0x8B008B, // rgb(139,0,139) + dark_olive_green = 0x556B2F, // rgb(85,107,47) + dark_orange = 0xFF8C00, // rgb(255,140,0) + dark_orchid = 0x9932CC, // rgb(153,50,204) + dark_red = 0x8B0000, // rgb(139,0,0) + dark_salmon = 0xE9967A, // rgb(233,150,122) + dark_sea_green = 0x8FBC8F, // rgb(143,188,143) + dark_slate_blue = 0x483D8B, // rgb(72,61,139) + dark_slate_gray = 0x2F4F4F, // rgb(47,79,79) + dark_turquoise = 0x00CED1, // rgb(0,206,209) + dark_violet = 0x9400D3, // rgb(148,0,211) + deep_pink = 0xFF1493, // rgb(255,20,147) + deep_sky_blue = 0x00BFFF, // rgb(0,191,255) + dim_gray = 0x696969, // rgb(105,105,105) + dodger_blue = 0x1E90FF, // rgb(30,144,255) + fire_brick = 0xB22222, 
// rgb(178,34,34) + floral_white = 0xFFFAF0, // rgb(255,250,240) + forest_green = 0x228B22, // rgb(34,139,34) + fuchsia = 0xFF00FF, // rgb(255,0,255) + gainsboro = 0xDCDCDC, // rgb(220,220,220) + ghost_white = 0xF8F8FF, // rgb(248,248,255) + gold = 0xFFD700, // rgb(255,215,0) + golden_rod = 0xDAA520, // rgb(218,165,32) + gray = 0x808080, // rgb(128,128,128) + green = 0x008000, // rgb(0,128,0) + green_yellow = 0xADFF2F, // rgb(173,255,47) + honey_dew = 0xF0FFF0, // rgb(240,255,240) + hot_pink = 0xFF69B4, // rgb(255,105,180) + indian_red = 0xCD5C5C, // rgb(205,92,92) + indigo = 0x4B0082, // rgb(75,0,130) + ivory = 0xFFFFF0, // rgb(255,255,240) + khaki = 0xF0E68C, // rgb(240,230,140) + lavender = 0xE6E6FA, // rgb(230,230,250) + lavender_blush = 0xFFF0F5, // rgb(255,240,245) + lawn_green = 0x7CFC00, // rgb(124,252,0) + lemon_chiffon = 0xFFFACD, // rgb(255,250,205) + light_blue = 0xADD8E6, // rgb(173,216,230) + light_coral = 0xF08080, // rgb(240,128,128) + light_cyan = 0xE0FFFF, // rgb(224,255,255) + light_golden_rod_yellow = 0xFAFAD2, // rgb(250,250,210) + light_gray = 0xD3D3D3, // rgb(211,211,211) + light_green = 0x90EE90, // rgb(144,238,144) + light_pink = 0xFFB6C1, // rgb(255,182,193) + light_salmon = 0xFFA07A, // rgb(255,160,122) + light_sea_green = 0x20B2AA, // rgb(32,178,170) + light_sky_blue = 0x87CEFA, // rgb(135,206,250) + light_slate_gray = 0x778899, // rgb(119,136,153) + light_steel_blue = 0xB0C4DE, // rgb(176,196,222) + light_yellow = 0xFFFFE0, // rgb(255,255,224) + lime = 0x00FF00, // rgb(0,255,0) + lime_green = 0x32CD32, // rgb(50,205,50) + linen = 0xFAF0E6, // rgb(250,240,230) + magenta = 0xFF00FF, // rgb(255,0,255) + maroon = 0x800000, // rgb(128,0,0) + medium_aquamarine = 0x66CDAA, // rgb(102,205,170) + medium_blue = 0x0000CD, // rgb(0,0,205) + medium_orchid = 0xBA55D3, // rgb(186,85,211) + medium_purple = 0x9370DB, // rgb(147,112,219) + medium_sea_green = 0x3CB371, // rgb(60,179,113) + medium_slate_blue = 0x7B68EE, // rgb(123,104,238) + 
medium_spring_green = 0x00FA9A, // rgb(0,250,154) + medium_turquoise = 0x48D1CC, // rgb(72,209,204) + medium_violet_red = 0xC71585, // rgb(199,21,133) + midnight_blue = 0x191970, // rgb(25,25,112) + mint_cream = 0xF5FFFA, // rgb(245,255,250) + misty_rose = 0xFFE4E1, // rgb(255,228,225) + moccasin = 0xFFE4B5, // rgb(255,228,181) + navajo_white = 0xFFDEAD, // rgb(255,222,173) + navy = 0x000080, // rgb(0,0,128) + old_lace = 0xFDF5E6, // rgb(253,245,230) + olive = 0x808000, // rgb(128,128,0) + olive_drab = 0x6B8E23, // rgb(107,142,35) + orange = 0xFFA500, // rgb(255,165,0) + orange_red = 0xFF4500, // rgb(255,69,0) + orchid = 0xDA70D6, // rgb(218,112,214) + pale_golden_rod = 0xEEE8AA, // rgb(238,232,170) + pale_green = 0x98FB98, // rgb(152,251,152) + pale_turquoise = 0xAFEEEE, // rgb(175,238,238) + pale_violet_red = 0xDB7093, // rgb(219,112,147) + papaya_whip = 0xFFEFD5, // rgb(255,239,213) + peach_puff = 0xFFDAB9, // rgb(255,218,185) + peru = 0xCD853F, // rgb(205,133,63) + pink = 0xFFC0CB, // rgb(255,192,203) + plum = 0xDDA0DD, // rgb(221,160,221) + powder_blue = 0xB0E0E6, // rgb(176,224,230) + purple = 0x800080, // rgb(128,0,128) + rebecca_purple = 0x663399, // rgb(102,51,153) + red = 0xFF0000, // rgb(255,0,0) + rosy_brown = 0xBC8F8F, // rgb(188,143,143) + royal_blue = 0x4169E1, // rgb(65,105,225) + saddle_brown = 0x8B4513, // rgb(139,69,19) + salmon = 0xFA8072, // rgb(250,128,114) + sandy_brown = 0xF4A460, // rgb(244,164,96) + sea_green = 0x2E8B57, // rgb(46,139,87) + sea_shell = 0xFFF5EE, // rgb(255,245,238) + sienna = 0xA0522D, // rgb(160,82,45) + silver = 0xC0C0C0, // rgb(192,192,192) + sky_blue = 0x87CEEB, // rgb(135,206,235) + slate_blue = 0x6A5ACD, // rgb(106,90,205) + slate_gray = 0x708090, // rgb(112,128,144) + snow = 0xFFFAFA, // rgb(255,250,250) + spring_green = 0x00FF7F, // rgb(0,255,127) + steel_blue = 0x4682B4, // rgb(70,130,180) + tan = 0xD2B48C, // rgb(210,180,140) + teal = 0x008080, // rgb(0,128,128) + thistle = 0xD8BFD8, // rgb(216,191,216) + tomato 
= 0xFF6347, // rgb(255,99,71) + turquoise = 0x40E0D0, // rgb(64,224,208) + violet = 0xEE82EE, // rgb(238,130,238) + wheat = 0xF5DEB3, // rgb(245,222,179) + white = 0xFFFFFF, // rgb(255,255,255) + white_smoke = 0xF5F5F5, // rgb(245,245,245) + yellow = 0xFFFF00, // rgb(255,255,0) + yellow_green = 0x9ACD32 // rgb(154,205,50) +}; // enum class color + +enum class terminal_color : uint8_t { + black = 30, + red, + green, + yellow, + blue, + magenta, + cyan, + white, + bright_black = 90, + bright_red, + bright_green, + bright_yellow, + bright_blue, + bright_magenta, + bright_cyan, + bright_white +}; + +enum class emphasis : uint8_t { + bold = 1, + faint = 1 << 1, + italic = 1 << 2, + underline = 1 << 3, + blink = 1 << 4, + reverse = 1 << 5, + conceal = 1 << 6, + strikethrough = 1 << 7, +}; + +// rgb is a struct for red, green and blue colors. +// Using the name "rgb" makes some editors show the color in a tooltip. +struct rgb { + FMT_CONSTEXPR rgb() : r(0), g(0), b(0) {} + FMT_CONSTEXPR rgb(uint8_t r_, uint8_t g_, uint8_t b_) : r(r_), g(g_), b(b_) {} + FMT_CONSTEXPR rgb(uint32_t hex) + : r((hex >> 16) & 0xFF), g((hex >> 8) & 0xFF), b(hex & 0xFF) {} + FMT_CONSTEXPR rgb(color hex) + : r((uint32_t(hex) >> 16) & 0xFF), + g((uint32_t(hex) >> 8) & 0xFF), + b(uint32_t(hex) & 0xFF) {} + uint8_t r; + uint8_t g; + uint8_t b; +}; + +FMT_BEGIN_DETAIL_NAMESPACE + +// color is a struct of either a rgb color or a terminal color. 
+struct color_type { + FMT_CONSTEXPR color_type() noexcept : is_rgb(), value{} {} + FMT_CONSTEXPR color_type(color rgb_color) noexcept : is_rgb(true), value{} { + value.rgb_color = static_cast(rgb_color); + } + FMT_CONSTEXPR color_type(rgb rgb_color) noexcept : is_rgb(true), value{} { + value.rgb_color = (static_cast(rgb_color.r) << 16) | + (static_cast(rgb_color.g) << 8) | rgb_color.b; + } + FMT_CONSTEXPR color_type(terminal_color term_color) noexcept + : is_rgb(), value{} { + value.term_color = static_cast(term_color); + } + bool is_rgb; + union color_union { + uint8_t term_color; + uint32_t rgb_color; + } value; +}; + +FMT_END_DETAIL_NAMESPACE + +/** A text style consisting of foreground and background colors and emphasis. */ +class text_style { + public: + FMT_CONSTEXPR text_style(emphasis em = emphasis()) noexcept + : set_foreground_color(), set_background_color(), ems(em) {} + + FMT_CONSTEXPR text_style& operator|=(const text_style& rhs) { + if (!set_foreground_color) { + set_foreground_color = rhs.set_foreground_color; + foreground_color = rhs.foreground_color; + } else if (rhs.set_foreground_color) { + if (!foreground_color.is_rgb || !rhs.foreground_color.is_rgb) + FMT_THROW(format_error("can't OR a terminal color")); + foreground_color.value.rgb_color |= rhs.foreground_color.value.rgb_color; + } + + if (!set_background_color) { + set_background_color = rhs.set_background_color; + background_color = rhs.background_color; + } else if (rhs.set_background_color) { + if (!background_color.is_rgb || !rhs.background_color.is_rgb) + FMT_THROW(format_error("can't OR a terminal color")); + background_color.value.rgb_color |= rhs.background_color.value.rgb_color; + } + + ems = static_cast(static_cast(ems) | + static_cast(rhs.ems)); + return *this; + } + + friend FMT_CONSTEXPR text_style operator|(text_style lhs, + const text_style& rhs) { + return lhs |= rhs; + } + + FMT_CONSTEXPR bool has_foreground() const noexcept { + return set_foreground_color; + } + 
FMT_CONSTEXPR bool has_background() const noexcept { + return set_background_color; + } + FMT_CONSTEXPR bool has_emphasis() const noexcept { + return static_cast(ems) != 0; + } + FMT_CONSTEXPR detail::color_type get_foreground() const noexcept { + FMT_ASSERT(has_foreground(), "no foreground specified for this style"); + return foreground_color; + } + FMT_CONSTEXPR detail::color_type get_background() const noexcept { + FMT_ASSERT(has_background(), "no background specified for this style"); + return background_color; + } + FMT_CONSTEXPR emphasis get_emphasis() const noexcept { + FMT_ASSERT(has_emphasis(), "no emphasis specified for this style"); + return ems; + } + + private: + FMT_CONSTEXPR text_style(bool is_foreground, + detail::color_type text_color) noexcept + : set_foreground_color(), set_background_color(), ems() { + if (is_foreground) { + foreground_color = text_color; + set_foreground_color = true; + } else { + background_color = text_color; + set_background_color = true; + } + } + + friend FMT_CONSTEXPR text_style fg(detail::color_type foreground) noexcept; + + friend FMT_CONSTEXPR text_style bg(detail::color_type background) noexcept; + + detail::color_type foreground_color; + detail::color_type background_color; + bool set_foreground_color; + bool set_background_color; + emphasis ems; +}; + +/** Creates a text style from the foreground (text) color. */ +FMT_CONSTEXPR inline text_style fg(detail::color_type foreground) noexcept { + return text_style(true, foreground); +} + +/** Creates a text style from the background color. 
*/ +FMT_CONSTEXPR inline text_style bg(detail::color_type background) noexcept { + return text_style(false, background); +} + +FMT_CONSTEXPR inline text_style operator|(emphasis lhs, emphasis rhs) noexcept { + return text_style(lhs) | rhs; +} + +FMT_BEGIN_DETAIL_NAMESPACE + +template struct ansi_color_escape { + FMT_CONSTEXPR ansi_color_escape(detail::color_type text_color, + const char* esc) noexcept { + // If we have a terminal color, we need to output another escape code + // sequence. + if (!text_color.is_rgb) { + bool is_background = esc == string_view("\x1b[48;2;"); + uint32_t value = text_color.value.term_color; + // Background ASCII codes are the same as the foreground ones but with + // 10 more. + if (is_background) value += 10u; + + size_t index = 0; + buffer[index++] = static_cast('\x1b'); + buffer[index++] = static_cast('['); + + if (value >= 100u) { + buffer[index++] = static_cast('1'); + value %= 100u; + } + buffer[index++] = static_cast('0' + value / 10u); + buffer[index++] = static_cast('0' + value % 10u); + + buffer[index++] = static_cast('m'); + buffer[index++] = static_cast('\0'); + return; + } + + for (int i = 0; i < 7; i++) { + buffer[i] = static_cast(esc[i]); + } + rgb color(text_color.value.rgb_color); + to_esc(color.r, buffer + 7, ';'); + to_esc(color.g, buffer + 11, ';'); + to_esc(color.b, buffer + 15, 'm'); + buffer[19] = static_cast(0); + } + FMT_CONSTEXPR ansi_color_escape(emphasis em) noexcept { + uint8_t em_codes[num_emphases] = {}; + if (has_emphasis(em, emphasis::bold)) em_codes[0] = 1; + if (has_emphasis(em, emphasis::faint)) em_codes[1] = 2; + if (has_emphasis(em, emphasis::italic)) em_codes[2] = 3; + if (has_emphasis(em, emphasis::underline)) em_codes[3] = 4; + if (has_emphasis(em, emphasis::blink)) em_codes[4] = 5; + if (has_emphasis(em, emphasis::reverse)) em_codes[5] = 7; + if (has_emphasis(em, emphasis::conceal)) em_codes[6] = 8; + if (has_emphasis(em, emphasis::strikethrough)) em_codes[7] = 9; + + size_t index = 0; + for 
(size_t i = 0; i < num_emphases; ++i) { + if (!em_codes[i]) continue; + buffer[index++] = static_cast('\x1b'); + buffer[index++] = static_cast('['); + buffer[index++] = static_cast('0' + em_codes[i]); + buffer[index++] = static_cast('m'); + } + buffer[index++] = static_cast(0); + } + FMT_CONSTEXPR operator const Char*() const noexcept { return buffer; } + + FMT_CONSTEXPR const Char* begin() const noexcept { return buffer; } + FMT_CONSTEXPR_CHAR_TRAITS const Char* end() const noexcept { + return buffer + std::char_traits::length(buffer); + } + + private: + static constexpr size_t num_emphases = 8; + Char buffer[7u + 3u * num_emphases + 1u]; + + static FMT_CONSTEXPR void to_esc(uint8_t c, Char* out, + char delimiter) noexcept { + out[0] = static_cast('0' + c / 100); + out[1] = static_cast('0' + c / 10 % 10); + out[2] = static_cast('0' + c % 10); + out[3] = static_cast(delimiter); + } + static FMT_CONSTEXPR bool has_emphasis(emphasis em, emphasis mask) noexcept { + return static_cast(em) & static_cast(mask); + } +}; + +template +FMT_CONSTEXPR ansi_color_escape make_foreground_color( + detail::color_type foreground) noexcept { + return ansi_color_escape(foreground, "\x1b[38;2;"); +} + +template +FMT_CONSTEXPR ansi_color_escape make_background_color( + detail::color_type background) noexcept { + return ansi_color_escape(background, "\x1b[48;2;"); +} + +template +FMT_CONSTEXPR ansi_color_escape make_emphasis(emphasis em) noexcept { + return ansi_color_escape(em); +} + +template inline void fputs(const Char* chars, FILE* stream) { + int result = std::fputs(chars, stream); + if (result < 0) + FMT_THROW(system_error(errno, FMT_STRING("cannot write to file"))); +} + +template <> inline void fputs(const wchar_t* chars, FILE* stream) { + int result = std::fputws(chars, stream); + if (result < 0) + FMT_THROW(system_error(errno, FMT_STRING("cannot write to file"))); +} + +template inline void reset_color(FILE* stream) { + fputs("\x1b[0m", stream); +} + +template <> inline void 
reset_color(FILE* stream) { + fputs(L"\x1b[0m", stream); +} + +template inline void reset_color(buffer& buffer) { + auto reset_color = string_view("\x1b[0m"); + buffer.append(reset_color.begin(), reset_color.end()); +} + +template struct styled_arg { + const T& value; + text_style style; +}; + +template +void vformat_to(buffer& buf, const text_style& ts, + basic_string_view format_str, + basic_format_args>> args) { + bool has_style = false; + if (ts.has_emphasis()) { + has_style = true; + auto emphasis = detail::make_emphasis(ts.get_emphasis()); + buf.append(emphasis.begin(), emphasis.end()); + } + if (ts.has_foreground()) { + has_style = true; + auto foreground = detail::make_foreground_color(ts.get_foreground()); + buf.append(foreground.begin(), foreground.end()); + } + if (ts.has_background()) { + has_style = true; + auto background = detail::make_background_color(ts.get_background()); + buf.append(background.begin(), background.end()); + } + detail::vformat_to(buf, format_str, args, {}); + if (has_style) detail::reset_color(buf); +} + +FMT_END_DETAIL_NAMESPACE + +template > +void vprint(std::FILE* f, const text_style& ts, const S& format, + basic_format_args>> args) { + basic_memory_buffer buf; + detail::vformat_to(buf, ts, detail::to_string_view(format), args); + if (detail::is_utf8()) { + detail::print(f, basic_string_view(buf.begin(), buf.size())); + } else { + buf.push_back(Char(0)); + detail::fputs(buf.data(), f); + } +} + +/** + \rst + Formats a string and prints it to the specified file stream using ANSI + escape sequences to specify text formatting. + + **Example**:: + + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), + "Elapsed time: {0:.2f} seconds", 1.23); + \endrst + */ +template ::value)> +void print(std::FILE* f, const text_style& ts, const S& format_str, + const Args&... 
args) { + vprint(f, ts, format_str, + fmt::make_format_args>>(args...)); +} + +/** + \rst + Formats a string and prints it to stdout using ANSI escape sequences to + specify text formatting. + + **Example**:: + + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), + "Elapsed time: {0:.2f} seconds", 1.23); + \endrst + */ +template ::value)> +void print(const text_style& ts, const S& format_str, const Args&... args) { + return print(stdout, ts, format_str, args...); +} + +template > +inline std::basic_string vformat( + const text_style& ts, const S& format_str, + basic_format_args>> args) { + basic_memory_buffer buf; + detail::vformat_to(buf, ts, detail::to_string_view(format_str), args); + return fmt::to_string(buf); +} + +/** + \rst + Formats arguments and returns the result as a string using ANSI + escape sequences to specify text formatting. + + **Example**:: + + #include + std::string message = fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "The answer is {}", 42); + \endrst +*/ +template > +inline std::basic_string format(const text_style& ts, const S& format_str, + const Args&... args) { + return fmt::vformat(ts, detail::to_string_view(format_str), + fmt::make_format_args>(args...)); +} + +/** + Formats a string with the given text_style and writes the output to ``out``. + */ +template ::value)> +OutputIt vformat_to( + OutputIt out, const text_style& ts, basic_string_view format_str, + basic_format_args>> args) { + auto&& buf = detail::get_buffer(out); + detail::vformat_to(buf, ts, format_str, args); + return detail::get_iterator(buf, out); +} + +/** + \rst + Formats arguments with the given text_style, writes the result to the output + iterator ``out`` and returns the iterator past the end of the output range. 
+ + **Example**:: + + std::vector out; + fmt::format_to(std::back_inserter(out), + fmt::emphasis::bold | fg(fmt::color::red), "{}", 42); + \endrst +*/ +template >::value&& + detail::is_string::value> +inline auto format_to(OutputIt out, const text_style& ts, const S& format_str, + Args&&... args) -> + typename std::enable_if::type { + return vformat_to(out, ts, detail::to_string_view(format_str), + fmt::make_format_args>>(args...)); +} + +template +struct formatter, Char> : formatter { + template + auto format(const detail::styled_arg& arg, FormatContext& ctx) const + -> decltype(ctx.out()) { + const auto& ts = arg.style; + const auto& value = arg.value; + auto out = ctx.out(); + + bool has_style = false; + if (ts.has_emphasis()) { + has_style = true; + auto emphasis = detail::make_emphasis(ts.get_emphasis()); + out = std::copy(emphasis.begin(), emphasis.end(), out); + } + if (ts.has_foreground()) { + has_style = true; + auto foreground = + detail::make_foreground_color(ts.get_foreground()); + out = std::copy(foreground.begin(), foreground.end(), out); + } + if (ts.has_background()) { + has_style = true; + auto background = + detail::make_background_color(ts.get_background()); + out = std::copy(background.begin(), background.end(), out); + } + out = formatter::format(value, ctx); + if (has_style) { + auto reset_color = string_view("\x1b[0m"); + out = std::copy(reset_color.begin(), reset_color.end(), out); + } + return out; + } +}; + +/** + \rst + Returns an argument that will be formatted using ANSI escape sequences, + to be used in a formatting function. 
+ + **Example**:: + + fmt::print("Elapsed time: {0:.2f} seconds", + fmt::styled(1.23, fmt::fg(fmt::color::green) | + fmt::bg(fmt::color::blue))); + \endrst + */ +template +FMT_CONSTEXPR auto styled(const T& value, text_style ts) + -> detail::styled_arg> { + return detail::styled_arg>{value, ts}; +} + +FMT_MODULE_EXPORT_END +FMT_END_NAMESPACE + +#endif // FMT_COLOR_H_ diff --git a/libkram/fmt/compile.h b/libkram/fmt/compile.h new file mode 100644 index 00000000..933668c4 --- /dev/null +++ b/libkram/fmt/compile.h @@ -0,0 +1,611 @@ +// Formatting library for C++ - experimental format string compilation +// +// Copyright (c) 2012 - present, Victor Zverovich and fmt contributors +// All rights reserved. +// +// For the license information refer to format.h. + +#ifndef FMT_COMPILE_H_ +#define FMT_COMPILE_H_ + +#include "format.h" + +FMT_BEGIN_NAMESPACE +namespace detail { + +template +FMT_CONSTEXPR inline counting_iterator copy_str(InputIt begin, InputIt end, + counting_iterator it) { + return it + (end - begin); +} + +template class truncating_iterator_base { + protected: + OutputIt out_; + size_t limit_; + size_t count_ = 0; + + truncating_iterator_base() : out_(), limit_(0) {} + + truncating_iterator_base(OutputIt out, size_t limit) + : out_(out), limit_(limit) {} + + public: + using iterator_category = std::output_iterator_tag; + using value_type = typename std::iterator_traits::value_type; + using difference_type = std::ptrdiff_t; + using pointer = void; + using reference = void; + FMT_UNCHECKED_ITERATOR(truncating_iterator_base); + + OutputIt base() const { return out_; } + size_t count() const { return count_; } +}; + +// An output iterator that truncates the output and counts the number of objects +// written to it. 
+template ::value_type>::type> +class truncating_iterator; + +template +class truncating_iterator + : public truncating_iterator_base { + mutable typename truncating_iterator_base::value_type blackhole_; + + public: + using value_type = typename truncating_iterator_base::value_type; + + truncating_iterator() = default; + + truncating_iterator(OutputIt out, size_t limit) + : truncating_iterator_base(out, limit) {} + + truncating_iterator& operator++() { + if (this->count_++ < this->limit_) ++this->out_; + return *this; + } + + truncating_iterator operator++(int) { + auto it = *this; + ++*this; + return it; + } + + value_type& operator*() const { + return this->count_ < this->limit_ ? *this->out_ : blackhole_; + } +}; + +template +class truncating_iterator + : public truncating_iterator_base { + public: + truncating_iterator() = default; + + truncating_iterator(OutputIt out, size_t limit) + : truncating_iterator_base(out, limit) {} + + template truncating_iterator& operator=(T val) { + if (this->count_++ < this->limit_) *this->out_++ = val; + return *this; + } + + truncating_iterator& operator++() { return *this; } + truncating_iterator& operator++(int) { return *this; } + truncating_iterator& operator*() { return *this; } +}; + +// A compile-time string which is compiled into fast formatting code. +class compiled_string {}; + +template +struct is_compiled_string : std::is_base_of {}; + +/** + \rst + Converts a string literal *s* into a format string that will be parsed at + compile time and converted into efficient formatting code. Requires C++17 + ``constexpr if`` compiler support. + + **Example**:: + + // Converts 42 into std::string using the most efficient method and no + // runtime format string processing. 
+ std::string s = fmt::format(FMT_COMPILE("{}"), 42); + \endrst + */ +#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction) +# define FMT_COMPILE(s) \ + FMT_STRING_IMPL(s, fmt::detail::compiled_string, explicit) +#else +# define FMT_COMPILE(s) FMT_STRING(s) +#endif + +#if FMT_USE_NONTYPE_TEMPLATE_ARGS +template Str> +struct udl_compiled_string : compiled_string { + using char_type = Char; + explicit constexpr operator basic_string_view() const { + return {Str.data, N - 1}; + } +}; +#endif + +template +const T& first(const T& value, const Tail&...) { + return value; +} + +#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction) +template struct type_list {}; + +// Returns a reference to the argument at index N from [first, rest...]. +template +constexpr const auto& get([[maybe_unused]] const T& first, + [[maybe_unused]] const Args&... rest) { + static_assert(N < 1 + sizeof...(Args), "index is out of bounds"); + if constexpr (N == 0) + return first; + else + return detail::get(rest...); +} + +template +constexpr int get_arg_index_by_name(basic_string_view name, + type_list) { + return get_arg_index_by_name(name); +} + +template struct get_type_impl; + +template struct get_type_impl> { + using type = + remove_cvref_t(std::declval()...))>; +}; + +template +using get_type = typename get_type_impl::type; + +template struct is_compiled_format : std::false_type {}; + +template struct text { + basic_string_view data; + using char_type = Char; + + template + constexpr OutputIt format(OutputIt out, const Args&...) const { + return write(out, data); + } +}; + +template +struct is_compiled_format> : std::true_type {}; + +template +constexpr text make_text(basic_string_view s, size_t pos, + size_t size) { + return {{&s[pos], size}}; +} + +template struct code_unit { + Char value; + using char_type = Char; + + template + constexpr OutputIt format(OutputIt out, const Args&...) 
const { + return write(out, value); + } +}; + +// This ensures that the argument type is convertible to `const T&`. +template +constexpr const T& get_arg_checked(const Args&... args) { + const auto& arg = detail::get(args...); + if constexpr (detail::is_named_arg>()) { + return arg.value; + } else { + return arg; + } +} + +template +struct is_compiled_format> : std::true_type {}; + +// A replacement field that refers to argument N. +template struct field { + using char_type = Char; + + template + constexpr OutputIt format(OutputIt out, const Args&... args) const { + return write(out, get_arg_checked(args...)); + } +}; + +template +struct is_compiled_format> : std::true_type {}; + +// A replacement field that refers to argument with name. +template struct runtime_named_field { + using char_type = Char; + basic_string_view name; + + template + constexpr static bool try_format_argument( + OutputIt& out, + // [[maybe_unused]] due to unused-but-set-parameter warning in GCC 7,8,9 + [[maybe_unused]] basic_string_view arg_name, const T& arg) { + if constexpr (is_named_arg::type>::value) { + if (arg_name == arg.name) { + out = write(out, arg.value); + return true; + } + } + return false; + } + + template + constexpr OutputIt format(OutputIt out, const Args&... args) const { + bool found = (try_format_argument(out, name, args) || ...); + if (!found) { + FMT_THROW(format_error("argument with specified name is not found")); + } + return out; + } +}; + +template +struct is_compiled_format> : std::true_type {}; + +// A replacement field that refers to argument N and has format specifiers. +template struct spec_field { + using char_type = Char; + formatter fmt; + + template + constexpr FMT_INLINE OutputIt format(OutputIt out, + const Args&... 
args) const { + const auto& vargs = + fmt::make_format_args>(args...); + basic_format_context ctx(out, vargs); + return fmt.format(get_arg_checked(args...), ctx); + } +}; + +template +struct is_compiled_format> : std::true_type {}; + +template struct concat { + L lhs; + R rhs; + using char_type = typename L::char_type; + + template + constexpr OutputIt format(OutputIt out, const Args&... args) const { + out = lhs.format(out, args...); + return rhs.format(out, args...); + } +}; + +template +struct is_compiled_format> : std::true_type {}; + +template +constexpr concat make_concat(L lhs, R rhs) { + return {lhs, rhs}; +} + +struct unknown_format {}; + +template +constexpr size_t parse_text(basic_string_view str, size_t pos) { + for (size_t size = str.size(); pos != size; ++pos) { + if (str[pos] == '{' || str[pos] == '}') break; + } + return pos; +} + +template +constexpr auto compile_format_string(S format_str); + +template +constexpr auto parse_tail(T head, S format_str) { + if constexpr (POS != + basic_string_view(format_str).size()) { + constexpr auto tail = compile_format_string(format_str); + if constexpr (std::is_same, + unknown_format>()) + return tail; + else + return make_concat(head, tail); + } else { + return head; + } +} + +template struct parse_specs_result { + formatter fmt; + size_t end; + int next_arg_id; +}; + +constexpr int manual_indexing_id = -1; + +template +constexpr parse_specs_result parse_specs(basic_string_view str, + size_t pos, int next_arg_id) { + str.remove_prefix(pos); + auto ctx = compile_parse_context(str, max_value(), nullptr, {}, + next_arg_id); + auto f = formatter(); + auto end = f.parse(ctx); + return {f, pos + fmt::detail::to_unsigned(end - str.data()), + next_arg_id == 0 ? 
manual_indexing_id : ctx.next_arg_id()}; +} + +template struct arg_id_handler { + arg_ref arg_id; + + constexpr int operator()() { + FMT_ASSERT(false, "handler cannot be used with automatic indexing"); + return 0; + } + constexpr int operator()(int id) { + arg_id = arg_ref(id); + return 0; + } + constexpr int operator()(basic_string_view id) { + arg_id = arg_ref(id); + return 0; + } + + constexpr void on_error(const char* message) { + FMT_THROW(format_error(message)); + } +}; + +template struct parse_arg_id_result { + arg_ref arg_id; + const Char* arg_id_end; +}; + +template +constexpr auto parse_arg_id(const Char* begin, const Char* end) { + auto handler = arg_id_handler{arg_ref{}}; + auto arg_id_end = parse_arg_id(begin, end, handler); + return parse_arg_id_result{handler.arg_id, arg_id_end}; +} + +template struct field_type { + using type = remove_cvref_t; +}; + +template +struct field_type::value>> { + using type = remove_cvref_t; +}; + +template +constexpr auto parse_replacement_field_then_tail(S format_str) { + using char_type = typename S::char_type; + constexpr auto str = basic_string_view(format_str); + constexpr char_type c = END_POS != str.size() ? str[END_POS] : char_type(); + if constexpr (c == '}') { + return parse_tail( + field::type, ARG_INDEX>(), + format_str); + } else if constexpr (c != ':') { + FMT_THROW(format_error("expected ':'")); + } else { + constexpr auto result = parse_specs::type>( + str, END_POS + 1, NEXT_ID == manual_indexing_id ? 0 : NEXT_ID); + if constexpr (result.end >= str.size() || str[result.end] != '}') { + FMT_THROW(format_error("expected '}'")); + return 0; + } else { + return parse_tail( + spec_field::type, ARG_INDEX>{ + result.fmt}, + format_str); + } + } +} + +// Compiles a non-empty format string and returns the compiled representation +// or unknown_format() on unrecognized input. 
+template +constexpr auto compile_format_string(S format_str) { + using char_type = typename S::char_type; + constexpr auto str = basic_string_view(format_str); + if constexpr (str[POS] == '{') { + if constexpr (POS + 1 == str.size()) + FMT_THROW(format_error("unmatched '{' in format string")); + if constexpr (str[POS + 1] == '{') { + return parse_tail(make_text(str, POS, 1), format_str); + } else if constexpr (str[POS + 1] == '}' || str[POS + 1] == ':') { + static_assert(ID != manual_indexing_id, + "cannot switch from manual to automatic argument indexing"); + constexpr auto next_id = + ID != manual_indexing_id ? ID + 1 : manual_indexing_id; + return parse_replacement_field_then_tail, Args, + POS + 1, ID, next_id>( + format_str); + } else { + constexpr auto arg_id_result = + parse_arg_id(str.data() + POS + 1, str.data() + str.size()); + constexpr auto arg_id_end_pos = arg_id_result.arg_id_end - str.data(); + constexpr char_type c = + arg_id_end_pos != str.size() ? str[arg_id_end_pos] : char_type(); + static_assert(c == '}' || c == ':', "missing '}' in format string"); + if constexpr (arg_id_result.arg_id.kind == arg_id_kind::index) { + static_assert( + ID == manual_indexing_id || ID == 0, + "cannot switch from automatic to manual argument indexing"); + constexpr auto arg_index = arg_id_result.arg_id.val.index; + return parse_replacement_field_then_tail, + Args, arg_id_end_pos, + arg_index, manual_indexing_id>( + format_str); + } else if constexpr (arg_id_result.arg_id.kind == arg_id_kind::name) { + constexpr auto arg_index = + get_arg_index_by_name(arg_id_result.arg_id.val.name, Args{}); + if constexpr (arg_index != invalid_arg_index) { + constexpr auto next_id = + ID != manual_indexing_id ? 
ID + 1 : manual_indexing_id; + return parse_replacement_field_then_tail< + decltype(get_type::value), Args, arg_id_end_pos, + arg_index, next_id>(format_str); + } else { + if constexpr (c == '}') { + return parse_tail( + runtime_named_field{arg_id_result.arg_id.val.name}, + format_str); + } else if constexpr (c == ':') { + return unknown_format(); // no type info for specs parsing + } + } + } + } + } else if constexpr (str[POS] == '}') { + if constexpr (POS + 1 == str.size()) + FMT_THROW(format_error("unmatched '}' in format string")); + return parse_tail(make_text(str, POS, 1), format_str); + } else { + constexpr auto end = parse_text(str, POS + 1); + if constexpr (end - POS > 1) { + return parse_tail(make_text(str, POS, end - POS), + format_str); + } else { + return parse_tail(code_unit{str[POS]}, + format_str); + } + } +} + +template ::value)> +constexpr auto compile(S format_str) { + constexpr auto str = basic_string_view(format_str); + if constexpr (str.size() == 0) { + return detail::make_text(str, 0, 0); + } else { + constexpr auto result = + detail::compile_format_string, 0, 0>( + format_str); + return result; + } +} +#endif // defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction) +} // namespace detail + +FMT_MODULE_EXPORT_BEGIN + +#if defined(__cpp_if_constexpr) && defined(__cpp_return_type_deduction) + +template ::value)> +FMT_INLINE std::basic_string format(const CompiledFormat& cf, + const Args&... args) { + auto s = std::basic_string(); + cf.format(std::back_inserter(s), args...); + return s; +} + +template ::value)> +constexpr FMT_INLINE OutputIt format_to(OutputIt out, const CompiledFormat& cf, + const Args&... args) { + return cf.format(out, args...); +} + +template ::value)> +FMT_INLINE std::basic_string format(const S&, + Args&&... 
args) { + if constexpr (std::is_same::value) { + constexpr auto str = basic_string_view(S()); + if constexpr (str.size() == 2 && str[0] == '{' && str[1] == '}') { + const auto& first = detail::first(args...); + if constexpr (detail::is_named_arg< + remove_cvref_t>::value) { + return fmt::to_string(first.value); + } else { + return fmt::to_string(first); + } + } + } + constexpr auto compiled = detail::compile(S()); + if constexpr (std::is_same, + detail::unknown_format>()) { + return fmt::format( + static_cast>(S()), + std::forward(args)...); + } else { + return fmt::format(compiled, std::forward(args)...); + } +} + +template ::value)> +FMT_CONSTEXPR OutputIt format_to(OutputIt out, const S&, Args&&... args) { + constexpr auto compiled = detail::compile(S()); + if constexpr (std::is_same, + detail::unknown_format>()) { + return fmt::format_to( + out, static_cast>(S()), + std::forward(args)...); + } else { + return fmt::format_to(out, compiled, std::forward(args)...); + } +} +#endif + +template ::value)> +format_to_n_result format_to_n(OutputIt out, size_t n, + const S& format_str, Args&&... args) { + auto it = fmt::format_to(detail::truncating_iterator(out, n), + format_str, std::forward(args)...); + return {it.base(), it.count()}; +} + +template ::value)> +FMT_CONSTEXPR20 size_t formatted_size(const S& format_str, + const Args&... args) { + return fmt::format_to(detail::counting_iterator(), format_str, args...) + .count(); +} + +template ::value)> +void print(std::FILE* f, const S& format_str, const Args&... args) { + memory_buffer buffer; + fmt::format_to(std::back_inserter(buffer), format_str, args...); + detail::print(f, {buffer.data(), buffer.size()}); +} + +template ::value)> +void print(const S& format_str, const Args&... 
args) { + print(stdout, format_str, args...); +} + +#if FMT_USE_NONTYPE_TEMPLATE_ARGS +inline namespace literals { +template constexpr auto operator""_cf() { + using char_t = remove_cvref_t; + return detail::udl_compiled_string(); +} +} // namespace literals +#endif + +FMT_MODULE_EXPORT_END +FMT_END_NAMESPACE + +#endif // FMT_COMPILE_H_ diff --git a/libkram/fmt/core.h b/libkram/fmt/core.h new file mode 100644 index 00000000..549f948f --- /dev/null +++ b/libkram/fmt/core.h @@ -0,0 +1,3338 @@ +// Formatting library for C++ - the core API for char/UTF-8 +// +// Copyright (c) 2012 - present, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. + +#ifndef FMT_CORE_H_ +#define FMT_CORE_H_ + +#include // std::byte +#include // std::FILE +#include // std::strlen +#include +#include +#include +#include + +// The fmt library version in the form major * 10000 + minor * 100 + patch. +#define FMT_VERSION 90101 + +#if defined(__clang__) && !defined(__ibmxl__) +# define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__) +#else +# define FMT_CLANG_VERSION 0 +#endif + +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \ + !defined(__NVCOMPILER) +# define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#else +# define FMT_GCC_VERSION 0 +#endif + +#ifndef FMT_GCC_PRAGMA +// Workaround _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884. +# if FMT_GCC_VERSION >= 504 +# define FMT_GCC_PRAGMA(arg) _Pragma(arg) +# else +# define FMT_GCC_PRAGMA(arg) +# endif +#endif + +#ifdef __ICL +# define FMT_ICC_VERSION __ICL +#elif defined(__INTEL_COMPILER) +# define FMT_ICC_VERSION __INTEL_COMPILER +#else +# define FMT_ICC_VERSION 0 +#endif + +#ifdef _MSC_VER +# define FMT_MSC_VERSION _MSC_VER +# define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__)) +#else +# define FMT_MSC_VERSION 0 +# define FMT_MSC_WARNING(...) 
+#endif + +#ifdef _MSVC_LANG +# define FMT_CPLUSPLUS _MSVC_LANG +#else +# define FMT_CPLUSPLUS __cplusplus +#endif + +#ifdef __has_feature +# define FMT_HAS_FEATURE(x) __has_feature(x) +#else +# define FMT_HAS_FEATURE(x) 0 +#endif + +#if defined(__has_include) || FMT_ICC_VERSION >= 1600 || FMT_MSC_VERSION > 1900 +# define FMT_HAS_INCLUDE(x) __has_include(x) +#else +# define FMT_HAS_INCLUDE(x) 0 +#endif + +#ifdef __has_cpp_attribute +# define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define FMT_HAS_CPP_ATTRIBUTE(x) 0 +#endif + +#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \ + (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute)) + +#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \ + (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute)) + +// Check if relaxed C++14 constexpr is supported. +// GCC doesn't allow throw in constexpr until version 6 (bug 67371). +#ifndef FMT_USE_CONSTEXPR +# if (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912 || \ + (FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L)) && \ + !FMT_ICC_VERSION && !defined(__NVCC__) +# define FMT_USE_CONSTEXPR 1 +# else +# define FMT_USE_CONSTEXPR 0 +# endif +#endif +#if FMT_USE_CONSTEXPR +# define FMT_CONSTEXPR constexpr +#else +# define FMT_CONSTEXPR +#endif + +#if ((FMT_CPLUSPLUS >= 202002L) && \ + (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE > 9)) || \ + (FMT_CPLUSPLUS >= 201709L && FMT_GCC_VERSION >= 1002) +# define FMT_CONSTEXPR20 constexpr +#else +# define FMT_CONSTEXPR20 +#endif + +// Check if constexpr std::char_traits<>::{compare,length} are supported. +#if defined(__GLIBCXX__) +# if FMT_CPLUSPLUS >= 201703L && defined(_GLIBCXX_RELEASE) && \ + _GLIBCXX_RELEASE >= 7 // GCC 7+ libstdc++ has _GLIBCXX_RELEASE. 
+# define FMT_CONSTEXPR_CHAR_TRAITS constexpr +# endif +#elif defined(_LIBCPP_VERSION) && FMT_CPLUSPLUS >= 201703L && \ + _LIBCPP_VERSION >= 4000 +# define FMT_CONSTEXPR_CHAR_TRAITS constexpr +#elif FMT_MSC_VERSION >= 1914 && FMT_CPLUSPLUS >= 201703L +# define FMT_CONSTEXPR_CHAR_TRAITS constexpr +#endif +#ifndef FMT_CONSTEXPR_CHAR_TRAITS +# define FMT_CONSTEXPR_CHAR_TRAITS +#endif + +// Check if exceptions are disabled. +#ifndef FMT_EXCEPTIONS +# if (defined(__GNUC__) && !defined(__EXCEPTIONS)) || \ + (FMT_MSC_VERSION && !_HAS_EXCEPTIONS) +# define FMT_EXCEPTIONS 0 +# else +# define FMT_EXCEPTIONS 1 +# endif +#endif + +#ifndef FMT_DEPRECATED +# if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VERSION >= 1900 +# define FMT_DEPRECATED [[deprecated]] +# else +# if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__) +# define FMT_DEPRECATED __attribute__((deprecated)) +# elif FMT_MSC_VERSION +# define FMT_DEPRECATED __declspec(deprecated) +# else +# define FMT_DEPRECATED /* deprecated */ +# endif +# endif +#endif + +// [[noreturn]] is disabled on MSVC and NVCC because of bogus unreachable code +// warnings. 
+#if FMT_EXCEPTIONS && FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION && \ + !defined(__NVCC__) +# define FMT_NORETURN [[noreturn]] +#else +# define FMT_NORETURN +#endif + +#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough) +# define FMT_FALLTHROUGH [[fallthrough]] +#elif defined(__clang__) +# define FMT_FALLTHROUGH [[clang::fallthrough]] +#elif FMT_GCC_VERSION >= 700 && \ + (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520) +# define FMT_FALLTHROUGH [[gnu::fallthrough]] +#else +# define FMT_FALLTHROUGH +#endif + +#ifndef FMT_NODISCARD +# if FMT_HAS_CPP17_ATTRIBUTE(nodiscard) +# define FMT_NODISCARD [[nodiscard]] +# else +# define FMT_NODISCARD +# endif +#endif + +#ifndef FMT_USE_FLOAT +# define FMT_USE_FLOAT 1 +#endif +#ifndef FMT_USE_DOUBLE +# define FMT_USE_DOUBLE 1 +#endif +#ifndef FMT_USE_LONG_DOUBLE +# define FMT_USE_LONG_DOUBLE 1 +#endif + +#ifndef FMT_INLINE +# if FMT_GCC_VERSION || FMT_CLANG_VERSION +# define FMT_INLINE inline __attribute__((always_inline)) +# else +# define FMT_INLINE inline +# endif +#endif + +// An inline std::forward replacement. +#define FMT_FORWARD(...) static_cast(__VA_ARGS__) + +#ifdef _MSC_VER +# define FMT_UNCHECKED_ITERATOR(It) \ + using _Unchecked_type = It // Mark iterator as checked. 
+#else +# define FMT_UNCHECKED_ITERATOR(It) using unchecked_type = It +#endif + +#ifndef FMT_BEGIN_NAMESPACE +# define FMT_BEGIN_NAMESPACE \ + namespace fmt { \ + inline namespace v9 { +# define FMT_END_NAMESPACE \ + } \ + } +#endif + +#ifndef FMT_MODULE_EXPORT +# define FMT_MODULE_EXPORT +# define FMT_MODULE_EXPORT_BEGIN +# define FMT_MODULE_EXPORT_END +# define FMT_BEGIN_DETAIL_NAMESPACE namespace detail { +# define FMT_END_DETAIL_NAMESPACE } +#endif + +#if !defined(FMT_HEADER_ONLY) && defined(_WIN32) +# define FMT_CLASS_API FMT_MSC_WARNING(suppress : 4275) +# ifdef FMT_EXPORT +# define FMT_API __declspec(dllexport) +# elif defined(FMT_SHARED) +# define FMT_API __declspec(dllimport) +# endif +#else +# define FMT_CLASS_API +# if defined(FMT_EXPORT) || defined(FMT_SHARED) +# if defined(__GNUC__) || defined(__clang__) +# define FMT_API __attribute__((visibility("default"))) +# endif +# endif +#endif +#ifndef FMT_API +# define FMT_API +#endif + +// libc++ supports string_view in pre-c++17. +#if FMT_HAS_INCLUDE() && \ + (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION)) +# include +# define FMT_USE_STRING_VIEW +#elif FMT_HAS_INCLUDE("experimental/string_view") && FMT_CPLUSPLUS >= 201402L +# include +# define FMT_USE_EXPERIMENTAL_STRING_VIEW +#endif + +#ifndef FMT_UNICODE +# define FMT_UNICODE !FMT_MSC_VERSION +#endif + +#ifndef FMT_CONSTEVAL +# if ((FMT_GCC_VERSION >= 1000 || FMT_CLANG_VERSION >= 1101) && \ + FMT_CPLUSPLUS >= 202002L && !defined(__apple_build_version__)) || \ + (defined(__cpp_consteval) && \ + (!FMT_MSC_VERSION || _MSC_FULL_VER >= 193030704)) +// consteval is broken in MSVC before VS2022 and Apple clang 13. 
+# define FMT_CONSTEVAL consteval +# define FMT_HAS_CONSTEVAL +# else +# define FMT_CONSTEVAL +# endif +#endif + +#ifndef FMT_USE_NONTYPE_TEMPLATE_ARGS +# if defined(__cpp_nontype_template_args) && \ + ((FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L) || \ + __cpp_nontype_template_args >= 201911L) && \ + !defined(__NVCOMPILER) && !defined(__LCC__) +# define FMT_USE_NONTYPE_TEMPLATE_ARGS 1 +# else +# define FMT_USE_NONTYPE_TEMPLATE_ARGS 0 +# endif +#endif + +// Enable minimal optimizations for more compact code in debug mode. +FMT_GCC_PRAGMA("GCC push_options") +#if !defined(__OPTIMIZE__) && !defined(__NVCOMPILER) && !defined(__LCC__) +FMT_GCC_PRAGMA("GCC optimize(\"Og\")") +#endif + +FMT_BEGIN_NAMESPACE +FMT_MODULE_EXPORT_BEGIN + +// Implementations of enable_if_t and other metafunctions for older systems. +template +using enable_if_t = typename std::enable_if::type; +template +using conditional_t = typename std::conditional::type; +template using bool_constant = std::integral_constant; +template +using remove_reference_t = typename std::remove_reference::type; +template +using remove_const_t = typename std::remove_const::type; +template +using remove_cvref_t = typename std::remove_cv>::type; +template struct type_identity { using type = T; }; +template using type_identity_t = typename type_identity::type; +template +using underlying_t = typename std::underlying_type::type; + +template struct disjunction : std::false_type {}; +template struct disjunction

: P {}; +template +struct disjunction + : conditional_t> {}; + +template struct conjunction : std::true_type {}; +template struct conjunction

: P {}; +template +struct conjunction + : conditional_t, P1> {}; + +struct monostate { + constexpr monostate() {} +}; + +// An enable_if helper to be used in template parameters which results in much +// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed +// to workaround a bug in MSVC 2019 (see #1140 and #1186). +#ifdef FMT_DOC +# define FMT_ENABLE_IF(...) +#else +# define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0 +#endif + +FMT_BEGIN_DETAIL_NAMESPACE + +// Suppresses "unused variable" warnings with the method described in +// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/. +// (void)var does not work on many Intel compilers. +template FMT_CONSTEXPR void ignore_unused(const T&...) {} + +constexpr FMT_INLINE auto is_constant_evaluated( + bool default_value = false) noexcept -> bool { +#ifdef __cpp_lib_is_constant_evaluated + ignore_unused(default_value); + return std::is_constant_evaluated(); +#else + return default_value; +#endif +} + +// Suppresses "conditional expression is constant" warnings. +template constexpr FMT_INLINE auto const_check(T value) -> T { + return value; +} + +FMT_NORETURN FMT_API void assert_fail(const char* file, int line, + const char* message); + +#ifndef FMT_ASSERT +# ifdef NDEBUG +// FMT_ASSERT is not empty to avoid -Wempty-body. +# define FMT_ASSERT(condition, message) \ + ::fmt::detail::ignore_unused((condition), (message)) +# else +# define FMT_ASSERT(condition, message) \ + ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \ + ? (void)0 \ + : ::fmt::detail::assert_fail(__FILE__, __LINE__, (message))) +# endif +#endif + +#if defined(FMT_USE_STRING_VIEW) +template using std_string_view = std::basic_string_view; +#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW) +template +using std_string_view = std::experimental::basic_string_view; +#else +template struct std_string_view {}; +#endif + +#ifdef FMT_USE_INT128 +// Do nothing. 
+#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) && \ + !(FMT_CLANG_VERSION && FMT_MSC_VERSION) +# define FMT_USE_INT128 1 +using int128_opt = __int128_t; // An optional native 128-bit integer. +using uint128_opt = __uint128_t; +template inline auto convert_for_visit(T value) -> T { + return value; +} +#else +# define FMT_USE_INT128 0 +#endif +#if !FMT_USE_INT128 +enum class int128_opt {}; +enum class uint128_opt {}; +// Reduce template instantiations. +template auto convert_for_visit(T) -> monostate { return {}; } +#endif + +// Casts a nonnegative integer to unsigned. +template +FMT_CONSTEXPR auto to_unsigned(Int value) -> + typename std::make_unsigned::type { + FMT_ASSERT(std::is_unsigned::value || value >= 0, "negative value"); + return static_cast::type>(value); +} + +FMT_MSC_WARNING(suppress : 4566) constexpr unsigned char micro[] = "\u00B5"; + +constexpr auto is_utf8() -> bool { + // Avoid buggy sign extensions in MSVC's constant evaluation mode (#2297). + using uchar = unsigned char; + return FMT_UNICODE || (sizeof(micro) == 3 && uchar(micro[0]) == 0xC2 && + uchar(micro[1]) == 0xB5); +} +FMT_END_DETAIL_NAMESPACE + +/** + An implementation of ``std::basic_string_view`` for pre-C++17. It provides a + subset of the API. ``fmt::basic_string_view`` is used for format strings even + if ``std::string_view`` is available to prevent issues when a library is + compiled with a different ``-std`` option than the client code (which is not + recommended). + */ +template class basic_string_view { + private: + const Char* data_; + size_t size_; + + public: + using value_type = Char; + using iterator = const Char*; + + constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {} + + /** Constructs a string reference object from a C string and a size. 
*/ + constexpr basic_string_view(const Char* s, size_t count) noexcept + : data_(s), size_(count) {} + + /** + \rst + Constructs a string reference object from a C string computing + the size with ``std::char_traits::length``. + \endrst + */ + FMT_CONSTEXPR_CHAR_TRAITS + FMT_INLINE + basic_string_view(const Char* s) + : data_(s), + size_(detail::const_check(std::is_same::value && + !detail::is_constant_evaluated(true)) + ? std::strlen(reinterpret_cast(s)) + : std::char_traits::length(s)) {} + + /** Constructs a string reference from a ``std::basic_string`` object. */ + template + FMT_CONSTEXPR basic_string_view( + const std::basic_string& s) noexcept + : data_(s.data()), size_(s.size()) {} + + template >::value)> + FMT_CONSTEXPR basic_string_view(S s) noexcept + : data_(s.data()), size_(s.size()) {} + + /** Returns a pointer to the string data. */ + constexpr auto data() const noexcept -> const Char* { return data_; } + + /** Returns the string size. */ + constexpr auto size() const noexcept -> size_t { return size_; } + + constexpr auto begin() const noexcept -> iterator { return data_; } + constexpr auto end() const noexcept -> iterator { return data_ + size_; } + + constexpr auto operator[](size_t pos) const noexcept -> const Char& { + return data_[pos]; + } + + FMT_CONSTEXPR void remove_prefix(size_t n) noexcept { + data_ += n; + size_ -= n; + } + + FMT_CONSTEXPR_CHAR_TRAITS bool starts_with( + basic_string_view sv) const noexcept { + return size_ >= sv.size_ && + std::char_traits::compare(data_, sv.data_, sv.size_) == 0; + } + FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(Char c) const noexcept { + return size_ >= 1 && std::char_traits::eq(*data_, c); + } + FMT_CONSTEXPR_CHAR_TRAITS bool starts_with(const Char* s) const { + return starts_with(basic_string_view(s)); + } + + // Lexicographically compare this string reference to other. + FMT_CONSTEXPR_CHAR_TRAITS auto compare(basic_string_view other) const -> int { + size_t str_size = size_ < other.size_ ? 
size_ : other.size_; + int result = std::char_traits::compare(data_, other.data_, str_size); + if (result == 0) + result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1); + return result; + } + + FMT_CONSTEXPR_CHAR_TRAITS friend auto operator==(basic_string_view lhs, + basic_string_view rhs) + -> bool { + return lhs.compare(rhs) == 0; + } + friend auto operator!=(basic_string_view lhs, basic_string_view rhs) -> bool { + return lhs.compare(rhs) != 0; + } + friend auto operator<(basic_string_view lhs, basic_string_view rhs) -> bool { + return lhs.compare(rhs) < 0; + } + friend auto operator<=(basic_string_view lhs, basic_string_view rhs) -> bool { + return lhs.compare(rhs) <= 0; + } + friend auto operator>(basic_string_view lhs, basic_string_view rhs) -> bool { + return lhs.compare(rhs) > 0; + } + friend auto operator>=(basic_string_view lhs, basic_string_view rhs) -> bool { + return lhs.compare(rhs) >= 0; + } +}; + +using string_view = basic_string_view; + +/** Specifies if ``T`` is a character type. Can be specialized by users. */ +template struct is_char : std::false_type {}; +template <> struct is_char : std::true_type {}; + +FMT_BEGIN_DETAIL_NAMESPACE + +// A base class for compile-time strings. +struct compile_string {}; + +template +struct is_compile_string : std::is_base_of {}; + +// Returns a string view of `s`. 
+template ::value)> +FMT_INLINE auto to_string_view(const Char* s) -> basic_string_view { + return s; +} +template +inline auto to_string_view(const std::basic_string& s) + -> basic_string_view { + return s; +} +template +constexpr auto to_string_view(basic_string_view s) + -> basic_string_view { + return s; +} +template >::value)> +inline auto to_string_view(std_string_view s) -> basic_string_view { + return s; +} +template ::value)> +constexpr auto to_string_view(const S& s) + -> basic_string_view { + return basic_string_view(s); +} +void to_string_view(...); + +// Specifies whether S is a string type convertible to fmt::basic_string_view. +// It should be a constexpr function but MSVC 2017 fails to compile it in +// enable_if and MSVC 2015 fails to compile it as an alias template. +// ADL invocation of to_string_view is DEPRECATED! +template +struct is_string : std::is_class()))> { +}; + +template struct char_t_impl {}; +template struct char_t_impl::value>> { + using result = decltype(to_string_view(std::declval())); + using type = typename result::value_type; +}; + +enum class type { + none_type, + // Integer types should go first, + int_type, + uint_type, + long_long_type, + ulong_long_type, + int128_type, + uint128_type, + bool_type, + char_type, + last_integer_type = char_type, + // followed by floating-point types. + float_type, + double_type, + long_double_type, + last_numeric_type = long_double_type, + cstring_type, + string_type, + pointer_type, + custom_type +}; + +// Maps core type T to the corresponding type enum constant. 
+template +struct type_constant : std::integral_constant {}; + +#define FMT_TYPE_CONSTANT(Type, constant) \ + template \ + struct type_constant \ + : std::integral_constant {} + +FMT_TYPE_CONSTANT(int, int_type); +FMT_TYPE_CONSTANT(unsigned, uint_type); +FMT_TYPE_CONSTANT(long long, long_long_type); +FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type); +FMT_TYPE_CONSTANT(int128_opt, int128_type); +FMT_TYPE_CONSTANT(uint128_opt, uint128_type); +FMT_TYPE_CONSTANT(bool, bool_type); +FMT_TYPE_CONSTANT(Char, char_type); +FMT_TYPE_CONSTANT(float, float_type); +FMT_TYPE_CONSTANT(double, double_type); +FMT_TYPE_CONSTANT(long double, long_double_type); +FMT_TYPE_CONSTANT(const Char*, cstring_type); +FMT_TYPE_CONSTANT(basic_string_view, string_type); +FMT_TYPE_CONSTANT(const void*, pointer_type); + +constexpr bool is_integral_type(type t) { + return t > type::none_type && t <= type::last_integer_type; +} + +constexpr bool is_arithmetic_type(type t) { + return t > type::none_type && t <= type::last_numeric_type; +} + +FMT_NORETURN FMT_API void throw_format_error(const char* message); + +struct error_handler { + constexpr error_handler() = default; + constexpr error_handler(const error_handler&) = default; + + // This function is intentionally not constexpr to give a compile-time error. + FMT_NORETURN void on_error(const char* message) { + throw_format_error(message); + } +}; +FMT_END_DETAIL_NAMESPACE + +/** String's character type. */ +template using char_t = typename detail::char_t_impl::type; + +/** + \rst + Parsing context consisting of a format string range being parsed and an + argument counter for automatic indexing. + You can use the ``format_parse_context`` type alias for ``char`` instead. 
+ \endrst + */ +template +class basic_format_parse_context : private ErrorHandler { + private: + basic_string_view format_str_; + int next_arg_id_; + + FMT_CONSTEXPR void do_check_arg_id(int id); + + public: + using char_type = Char; + using iterator = typename basic_string_view::iterator; + + explicit constexpr basic_format_parse_context( + basic_string_view format_str, ErrorHandler eh = {}, + int next_arg_id = 0) + : ErrorHandler(eh), format_str_(format_str), next_arg_id_(next_arg_id) {} + + /** + Returns an iterator to the beginning of the format string range being + parsed. + */ + constexpr auto begin() const noexcept -> iterator { + return format_str_.begin(); + } + + /** + Returns an iterator past the end of the format string range being parsed. + */ + constexpr auto end() const noexcept -> iterator { return format_str_.end(); } + + /** Advances the begin iterator to ``it``. */ + FMT_CONSTEXPR void advance_to(iterator it) { + format_str_.remove_prefix(detail::to_unsigned(it - begin())); + } + + /** + Reports an error if using the manual argument indexing; otherwise returns + the next argument index and switches to the automatic indexing. + */ + FMT_CONSTEXPR auto next_arg_id() -> int { + if (next_arg_id_ < 0) { + on_error("cannot switch from manual to automatic argument indexing"); + return 0; + } + int id = next_arg_id_++; + do_check_arg_id(id); + return id; + } + + /** + Reports an error if using the automatic argument indexing; otherwise + switches to the manual indexing. 
+ */ + FMT_CONSTEXPR void check_arg_id(int id) { + if (next_arg_id_ > 0) { + on_error("cannot switch from automatic to manual argument indexing"); + return; + } + next_arg_id_ = -1; + do_check_arg_id(id); + } + FMT_CONSTEXPR void check_arg_id(basic_string_view) {} + FMT_CONSTEXPR void check_dynamic_spec(int arg_id); + + FMT_CONSTEXPR void on_error(const char* message) { + ErrorHandler::on_error(message); + } + + constexpr auto error_handler() const -> ErrorHandler { return *this; } +}; + +using format_parse_context = basic_format_parse_context; + +FMT_BEGIN_DETAIL_NAMESPACE +// A parse context with extra data used only in compile-time checks. +template +class compile_parse_context + : public basic_format_parse_context { + private: + int num_args_; + const type* types_; + using base = basic_format_parse_context; + + public: + explicit FMT_CONSTEXPR compile_parse_context( + basic_string_view format_str, int num_args, const type* types, + ErrorHandler eh = {}, int next_arg_id = 0) + : base(format_str, eh, next_arg_id), num_args_(num_args), types_(types) {} + + constexpr auto num_args() const -> int { return num_args_; } + constexpr auto arg_type(int id) const -> type { return types_[id]; } + + FMT_CONSTEXPR auto next_arg_id() -> int { + int id = base::next_arg_id(); + if (id >= num_args_) this->on_error("argument not found"); + return id; + } + + FMT_CONSTEXPR void check_arg_id(int id) { + base::check_arg_id(id); + if (id >= num_args_) this->on_error("argument not found"); + } + using base::check_arg_id; + + FMT_CONSTEXPR void check_dynamic_spec(int arg_id) { + detail::ignore_unused(arg_id); +#if !defined(__LCC__) + if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id])) + this->on_error("width/precision is not integer"); +#endif + } +}; +FMT_END_DETAIL_NAMESPACE + +template +FMT_CONSTEXPR void +basic_format_parse_context::do_check_arg_id(int id) { + // Argument id is only checked at compile-time during parsing because + // formatting has its own 
validation. + if (detail::is_constant_evaluated() && FMT_GCC_VERSION >= 1200) { + using context = detail::compile_parse_context; + if (id >= static_cast(this)->num_args()) + on_error("argument not found"); + } +} + +template +FMT_CONSTEXPR void +basic_format_parse_context::check_dynamic_spec(int arg_id) { + if (detail::is_constant_evaluated()) { + using context = detail::compile_parse_context; + static_cast(this)->check_dynamic_spec(arg_id); + } +} + +template class basic_format_arg; +template class basic_format_args; +template class dynamic_format_arg_store; + +// A formatter for objects of type T. +template +struct formatter { + // A deleted default constructor indicates a disabled formatter. + formatter() = delete; +}; + +// Specifies if T has an enabled formatter specialization. A type can be +// formattable even if it doesn't have a formatter e.g. via a conversion. +template +using has_formatter = + std::is_constructible>; + +// Checks whether T is a container with contiguous storage. +template struct is_contiguous : std::false_type {}; +template +struct is_contiguous> : std::true_type {}; + +class appender; + +FMT_BEGIN_DETAIL_NAMESPACE + +template +constexpr auto has_const_formatter_impl(T*) + -> decltype(typename Context::template formatter_type().format( + std::declval(), std::declval()), + true) { + return true; +} +template +constexpr auto has_const_formatter_impl(...) -> bool { + return false; +} +template +constexpr auto has_const_formatter() -> bool { + return has_const_formatter_impl(static_cast(nullptr)); +} + +// Extracts a reference to the container from back_insert_iterator. 
+template +inline auto get_container(std::back_insert_iterator it) + -> Container& { + using base = std::back_insert_iterator; + struct accessor : base { + accessor(base b) : base(b) {} + using base::container; + }; + return *accessor(it).container; +} + +template +FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out) + -> OutputIt { + while (begin != end) *out++ = static_cast(*begin++); + return out; +} + +template , U>::value&& is_char::value)> +FMT_CONSTEXPR auto copy_str(T* begin, T* end, U* out) -> U* { + if (is_constant_evaluated()) return copy_str(begin, end, out); + auto size = to_unsigned(end - begin); + memcpy(out, begin, size * sizeof(U)); + return out + size; +} + +/** + \rst + A contiguous memory buffer with an optional growing ability. It is an internal + class and shouldn't be used directly, only via `~fmt::basic_memory_buffer`. + \endrst + */ +template class buffer { + private: + T* ptr_; + size_t size_; + size_t capacity_; + + protected: + // Don't initialize ptr_ since it is not accessed to save a few cycles. + FMT_MSC_WARNING(suppress : 26495) + buffer(size_t sz) noexcept : size_(sz), capacity_(sz) {} + + FMT_CONSTEXPR20 buffer(T* p = nullptr, size_t sz = 0, size_t cap = 0) noexcept + : ptr_(p), size_(sz), capacity_(cap) {} + + FMT_CONSTEXPR20 ~buffer() = default; + buffer(buffer&&) = default; + + /** Sets the buffer data and capacity. */ + FMT_CONSTEXPR void set(T* buf_data, size_t buf_capacity) noexcept { + ptr_ = buf_data; + capacity_ = buf_capacity; + } + + /** Increases the buffer capacity to hold at least *capacity* elements. 
*/ + virtual FMT_CONSTEXPR20 void grow(size_t capacity) = 0; + + public: + using value_type = T; + using const_reference = const T&; + + buffer(const buffer&) = delete; + void operator=(const buffer&) = delete; + + FMT_INLINE auto begin() noexcept -> T* { return ptr_; } + FMT_INLINE auto end() noexcept -> T* { return ptr_ + size_; } + + FMT_INLINE auto begin() const noexcept -> const T* { return ptr_; } + FMT_INLINE auto end() const noexcept -> const T* { return ptr_ + size_; } + + /** Returns the size of this buffer. */ + constexpr auto size() const noexcept -> size_t { return size_; } + + /** Returns the capacity of this buffer. */ + constexpr auto capacity() const noexcept -> size_t { return capacity_; } + + /** Returns a pointer to the buffer data. */ + FMT_CONSTEXPR auto data() noexcept -> T* { return ptr_; } + + /** Returns a pointer to the buffer data. */ + FMT_CONSTEXPR auto data() const noexcept -> const T* { return ptr_; } + + /** Clears this buffer. */ + void clear() { size_ = 0; } + + // Tries resizing the buffer to contain *count* elements. If T is a POD type + // the new elements may not be initialized. + FMT_CONSTEXPR20 void try_resize(size_t count) { + try_reserve(count); + size_ = count <= capacity_ ? count : capacity_; + } + + // Tries increasing the buffer capacity to *new_capacity*. It can increase the + // capacity by a smaller amount than requested but guarantees there is space + // for at least one additional element either by increasing the capacity or by + // flushing the buffer if it is full. + FMT_CONSTEXPR20 void try_reserve(size_t new_capacity) { + if (new_capacity > capacity_) grow(new_capacity); + } + + FMT_CONSTEXPR20 void push_back(const T& value) { + try_reserve(size_ + 1); + ptr_[size_++] = value; + } + + /** Appends data to the end of the buffer. 
*/ + template void append(const U* begin, const U* end); + + template FMT_CONSTEXPR auto operator[](Idx index) -> T& { + return ptr_[index]; + } + template + FMT_CONSTEXPR auto operator[](Idx index) const -> const T& { + return ptr_[index]; + } +}; + +struct buffer_traits { + explicit buffer_traits(size_t) {} + auto count() const -> size_t { return 0; } + auto limit(size_t size) -> size_t { return size; } +}; + +class fixed_buffer_traits { + private: + size_t count_ = 0; + size_t limit_; + + public: + explicit fixed_buffer_traits(size_t limit) : limit_(limit) {} + auto count() const -> size_t { return count_; } + auto limit(size_t size) -> size_t { + size_t n = limit_ > count_ ? limit_ - count_ : 0; + count_ += size; + return size < n ? size : n; + } +}; + +// A buffer that writes to an output iterator when flushed. +template +class iterator_buffer final : public Traits, public buffer { + private: + OutputIt out_; + enum { buffer_size = 256 }; + T data_[buffer_size]; + + protected: + FMT_CONSTEXPR20 void grow(size_t) override { + if (this->size() == buffer_size) flush(); + } + + void flush() { + auto size = this->size(); + this->clear(); + out_ = copy_str(data_, data_ + this->limit(size), out_); + } + + public: + explicit iterator_buffer(OutputIt out, size_t n = buffer_size) + : Traits(n), buffer(data_, 0, buffer_size), out_(out) {} + iterator_buffer(iterator_buffer&& other) + : Traits(other), buffer(data_, 0, buffer_size), out_(other.out_) {} + ~iterator_buffer() { flush(); } + + auto out() -> OutputIt { + flush(); + return out_; + } + auto count() const -> size_t { return Traits::count() + this->size(); } +}; + +template +class iterator_buffer final + : public fixed_buffer_traits, + public buffer { + private: + T* out_; + enum { buffer_size = 256 }; + T data_[buffer_size]; + + protected: + FMT_CONSTEXPR20 void grow(size_t) override { + if (this->size() == this->capacity()) flush(); + } + + void flush() { + size_t n = this->limit(this->size()); + if (this->data() 
== out_) { + out_ += n; + this->set(data_, buffer_size); + } + this->clear(); + } + + public: + explicit iterator_buffer(T* out, size_t n = buffer_size) + : fixed_buffer_traits(n), buffer(out, 0, n), out_(out) {} + iterator_buffer(iterator_buffer&& other) + : fixed_buffer_traits(other), + buffer(std::move(other)), + out_(other.out_) { + if (this->data() != out_) { + this->set(data_, buffer_size); + this->clear(); + } + } + ~iterator_buffer() { flush(); } + + auto out() -> T* { + flush(); + return out_; + } + auto count() const -> size_t { + return fixed_buffer_traits::count() + this->size(); + } +}; + +template class iterator_buffer final : public buffer { + protected: + FMT_CONSTEXPR20 void grow(size_t) override {} + + public: + explicit iterator_buffer(T* out, size_t = 0) : buffer(out, 0, ~size_t()) {} + + auto out() -> T* { return &*this->end(); } +}; + +// A buffer that writes to a container with the contiguous storage. +template +class iterator_buffer, + enable_if_t::value, + typename Container::value_type>> + final : public buffer { + private: + Container& container_; + + protected: + FMT_CONSTEXPR20 void grow(size_t capacity) override { + container_.resize(capacity); + this->set(&container_[0], capacity); + } + + public: + explicit iterator_buffer(Container& c) + : buffer(c.size()), container_(c) {} + explicit iterator_buffer(std::back_insert_iterator out, size_t = 0) + : iterator_buffer(get_container(out)) {} + + auto out() -> std::back_insert_iterator { + return std::back_inserter(container_); + } +}; + +// A buffer that counts the number of code units written discarding the output. 
+template class counting_buffer final : public buffer { + private: + enum { buffer_size = 256 }; + T data_[buffer_size]; + size_t count_ = 0; + + protected: + FMT_CONSTEXPR20 void grow(size_t) override { + if (this->size() != buffer_size) return; + count_ += this->size(); + this->clear(); + } + + public: + counting_buffer() : buffer(data_, 0, buffer_size) {} + + auto count() -> size_t { return count_ + this->size(); } +}; + +template +using buffer_appender = conditional_t::value, appender, + std::back_insert_iterator>>; + +// Maps an output iterator to a buffer. +template +auto get_buffer(OutputIt out) -> iterator_buffer { + return iterator_buffer(out); +} +template , Buf>::value)> +auto get_buffer(std::back_insert_iterator out) -> buffer& { + return get_container(out); +} + +template +FMT_INLINE auto get_iterator(Buf& buf, OutputIt) -> decltype(buf.out()) { + return buf.out(); +} +template +auto get_iterator(buffer&, OutputIt out) -> OutputIt { + return out; +} + +template +struct fallback_formatter { + fallback_formatter() = delete; +}; + +// Specifies if T has an enabled fallback_formatter specialization. +template +using has_fallback_formatter = +#ifdef FMT_DEPRECATED_OSTREAM + std::is_constructible>; +#else + std::false_type; +#endif + +struct view {}; + +template struct named_arg : view { + const Char* name; + const T& value; + named_arg(const Char* n, const T& v) : name(n), value(v) {} +}; + +template struct named_arg_info { + const Char* name; + int id; +}; + +template +struct arg_data { + // args_[0].named_args points to named_args_ to avoid bloating format_args. + // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning. + T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : +1)]; + named_arg_info named_args_[NUM_NAMED_ARGS]; + + template + arg_data(const U&... 
init) : args_{T(named_args_, NUM_NAMED_ARGS), init...} {} + arg_data(const arg_data& other) = delete; + auto args() const -> const T* { return args_ + 1; } + auto named_args() -> named_arg_info* { return named_args_; } +}; + +template +struct arg_data { + // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning. + T args_[NUM_ARGS != 0 ? NUM_ARGS : +1]; + + template + FMT_CONSTEXPR FMT_INLINE arg_data(const U&... init) : args_{init...} {} + FMT_CONSTEXPR FMT_INLINE auto args() const -> const T* { return args_; } + FMT_CONSTEXPR FMT_INLINE auto named_args() -> std::nullptr_t { + return nullptr; + } +}; + +template +inline void init_named_args(named_arg_info*, int, int) {} + +template struct is_named_arg : std::false_type {}; +template struct is_statically_named_arg : std::false_type {}; + +template +struct is_named_arg> : std::true_type {}; + +template ::value)> +void init_named_args(named_arg_info* named_args, int arg_count, + int named_arg_count, const T&, const Tail&... args) { + init_named_args(named_args, arg_count + 1, named_arg_count, args...); +} + +template ::value)> +void init_named_args(named_arg_info* named_args, int arg_count, + int named_arg_count, const T& arg, const Tail&... args) { + named_args[named_arg_count++] = {arg.name, arg_count}; + init_named_args(named_args, arg_count + 1, named_arg_count, args...); +} + +template +FMT_CONSTEXPR FMT_INLINE void init_named_args(std::nullptr_t, int, int, + const Args&...) {} + +template constexpr auto count() -> size_t { return B ? 1 : 0; } +template constexpr auto count() -> size_t { + return (B1 ? 
1 : 0) + count(); +} + +template constexpr auto count_named_args() -> size_t { + return count::value...>(); +} + +template +constexpr auto count_statically_named_args() -> size_t { + return count::value...>(); +} + +struct unformattable {}; +struct unformattable_char : unformattable {}; +struct unformattable_const : unformattable {}; +struct unformattable_pointer : unformattable {}; + +template struct string_value { + const Char* data; + size_t size; +}; + +template struct named_arg_value { + const named_arg_info* data; + size_t size; +}; + +template struct custom_value { + using parse_context = typename Context::parse_context_type; + void* value; + void (*format)(void* arg, parse_context& parse_ctx, Context& ctx); +}; + +// A formatting argument value. +template class value { + public: + using char_type = typename Context::char_type; + + union { + monostate no_value; + int int_value; + unsigned uint_value; + long long long_long_value; + unsigned long long ulong_long_value; + int128_opt int128_value; + uint128_opt uint128_value; + bool bool_value; + char_type char_value; + float float_value; + double double_value; + long double long_double_value; + const void* pointer; + string_value string; + custom_value custom; + named_arg_value named_args; + }; + + constexpr FMT_INLINE value() : no_value() {} + constexpr FMT_INLINE value(int val) : int_value(val) {} + constexpr FMT_INLINE value(unsigned val) : uint_value(val) {} + constexpr FMT_INLINE value(long long val) : long_long_value(val) {} + constexpr FMT_INLINE value(unsigned long long val) : ulong_long_value(val) {} + FMT_INLINE value(int128_opt val) : int128_value(val) {} + FMT_INLINE value(uint128_opt val) : uint128_value(val) {} + constexpr FMT_INLINE value(float val) : float_value(val) {} + constexpr FMT_INLINE value(double val) : double_value(val) {} + FMT_INLINE value(long double val) : long_double_value(val) {} + constexpr FMT_INLINE value(bool val) : bool_value(val) {} + constexpr FMT_INLINE value(char_type 
val) : char_value(val) {} + FMT_CONSTEXPR FMT_INLINE value(const char_type* val) { + string.data = val; + if (is_constant_evaluated()) string.size = {}; + } + FMT_CONSTEXPR FMT_INLINE value(basic_string_view val) { + string.data = val.data(); + string.size = val.size(); + } + FMT_INLINE value(const void* val) : pointer(val) {} + FMT_INLINE value(const named_arg_info* args, size_t size) + : named_args{args, size} {} + + template FMT_CONSTEXPR FMT_INLINE value(T& val) { + using value_type = remove_cvref_t; + custom.value = const_cast(&val); + // Get the formatter type through the context to allow different contexts + // have different extension points, e.g. `formatter` for `format` and + // `printf_formatter` for `printf`. + custom.format = format_custom_arg< + value_type, + conditional_t::value, + typename Context::template formatter_type, + fallback_formatter>>; + } + value(unformattable); + value(unformattable_char); + value(unformattable_const); + value(unformattable_pointer); + + private: + // Formats an argument of a custom type, such as a user-defined class. + template + static void format_custom_arg(void* arg, + typename Context::parse_context_type& parse_ctx, + Context& ctx) { + auto f = Formatter(); + parse_ctx.advance_to(f.parse(parse_ctx)); + using qualified_type = + conditional_t(), const T, T>; + ctx.advance_to(f.format(*static_cast(arg), ctx)); + } +}; + +template +FMT_CONSTEXPR auto make_arg(T&& value) -> basic_format_arg; + +// To minimize the number of types we need to deal with, long is translated +// either to int or to long long depending on its size. +enum { long_short = sizeof(long) == sizeof(int) }; +using long_type = conditional_t; +using ulong_type = conditional_t; + +#ifdef __cpp_lib_byte +inline auto format_as(std::byte b) -> unsigned char { + return static_cast(b); +} +#endif + +template struct has_format_as { + template ::value&& std::is_integral::value)> + static auto check(U*) -> std::true_type; + static auto check(...) 
-> std::false_type; + + enum { value = decltype(check(static_cast(nullptr)))::value }; +}; + +// Maps formatting arguments to core types. +// arg_mapper reports errors by returning unformattable instead of using +// static_assert because it's used in the is_formattable trait. +template struct arg_mapper { + using char_type = typename Context::char_type; + + FMT_CONSTEXPR FMT_INLINE auto map(signed char val) -> int { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(unsigned char val) -> unsigned { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(short val) -> int { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(unsigned short val) -> unsigned { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(int val) -> int { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(unsigned val) -> unsigned { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(long val) -> long_type { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(unsigned long val) -> ulong_type { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(long long val) -> long long { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(unsigned long long val) + -> unsigned long long { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(int128_opt val) -> int128_opt { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(uint128_opt val) -> uint128_opt { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(bool val) -> bool { return val; } + + template ::value || + std::is_same::value)> + FMT_CONSTEXPR FMT_INLINE auto map(T val) -> char_type { + return val; + } + template ::value || +#ifdef __cpp_char8_t + std::is_same::value || +#endif + std::is_same::value || + std::is_same::value) && + !std::is_same::value, + int> = 0> + FMT_CONSTEXPR FMT_INLINE auto map(T) -> unformattable_char { + return {}; + } + + FMT_CONSTEXPR FMT_INLINE auto map(float val) -> float { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(double val) -> double { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(long double val) -> long 
double { + return val; + } + + FMT_CONSTEXPR FMT_INLINE auto map(char_type* val) -> const char_type* { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(const char_type* val) -> const char_type* { + return val; + } + template ::value && !std::is_pointer::value && + std::is_same>::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T& val) + -> basic_string_view { + return to_string_view(val); + } + template ::value && !std::is_pointer::value && + !std::is_same>::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T&) -> unformattable_char { + return {}; + } + template >::value && + !is_string::value && !has_formatter::value && + !has_fallback_formatter::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T& val) + -> basic_string_view { + return basic_string_view(val); + } + template >::value && + !std::is_convertible>::value && + !is_string::value && !has_formatter::value && + !has_fallback_formatter::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T& val) + -> basic_string_view { + return std_string_view(val); + } + + FMT_CONSTEXPR FMT_INLINE auto map(void* val) -> const void* { return val; } + FMT_CONSTEXPR FMT_INLINE auto map(const void* val) -> const void* { + return val; + } + FMT_CONSTEXPR FMT_INLINE auto map(std::nullptr_t val) -> const void* { + return val; + } + + // We use SFINAE instead of a const T* parameter to avoid conflicting with + // the C array overload. 
+ template < + typename T, + FMT_ENABLE_IF( + std::is_pointer::value || std::is_member_pointer::value || + std::is_function::type>::value || + (std::is_convertible::value && + !std::is_convertible::value && + !has_formatter::value))> + FMT_CONSTEXPR auto map(const T&) -> unformattable_pointer { + return {}; + } + + template ::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T (&values)[N]) -> const T (&)[N] { + return values; + } + + template ::value&& std::is_convertible::value && + !has_format_as::value && !has_formatter::value && + !has_fallback_formatter::value)> + FMT_DEPRECATED FMT_CONSTEXPR FMT_INLINE auto map(const T& val) + -> decltype(std::declval().map( + static_cast>(val))) { + return map(static_cast>(val)); + } + + template ::value && + !has_formatter::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T& val) + -> decltype(std::declval().map(format_as(T()))) { + return map(format_as(val)); + } + + template > + struct formattable + : bool_constant() || + !std::is_const>::value || + has_fallback_formatter::value> {}; + +#if (FMT_MSC_VERSION != 0 && FMT_MSC_VERSION < 1910) || \ + FMT_ICC_VERSION != 0 || defined(__NVCC__) + // Workaround a bug in MSVC and Intel (Issue 2746). 
+ template FMT_CONSTEXPR FMT_INLINE auto do_map(T&& val) -> T& { + return val; + } +#else + template ::value)> + FMT_CONSTEXPR FMT_INLINE auto do_map(T&& val) -> T& { + return val; + } + template ::value)> + FMT_CONSTEXPR FMT_INLINE auto do_map(T&&) -> unformattable_const { + return {}; + } +#endif + + template , + FMT_ENABLE_IF(!is_string::value && !is_char::value && + !std::is_array::value && + !std::is_pointer::value && + !has_format_as::value && + (has_formatter::value || + has_fallback_formatter::value))> + FMT_CONSTEXPR FMT_INLINE auto map(T&& val) + -> decltype(this->do_map(std::forward(val))) { + return do_map(std::forward(val)); + } + + template ::value)> + FMT_CONSTEXPR FMT_INLINE auto map(const T& named_arg) + -> decltype(std::declval().map(named_arg.value)) { + return map(named_arg.value); + } + + auto map(...) -> unformattable { return {}; } +}; + +// A type constant after applying arg_mapper. +template +using mapped_type_constant = + type_constant().map(std::declval())), + typename Context::char_type>; + +enum { packed_arg_bits = 4 }; +// Maximum number of arguments with packed types. +enum { max_packed_args = 62 / packed_arg_bits }; +enum : unsigned long long { is_unpacked_bit = 1ULL << 63 }; +enum : unsigned long long { has_named_args_bit = 1ULL << 62 }; + +FMT_END_DETAIL_NAMESPACE + +// An output iterator that appends to a buffer. +// It is used to reduce symbol sizes for the common case. +class appender : public std::back_insert_iterator> { + using base = std::back_insert_iterator>; + + public: + using std::back_insert_iterator>::back_insert_iterator; + appender(base it) noexcept : base(it) {} + FMT_UNCHECKED_ITERATOR(appender); + + auto operator++() noexcept -> appender& { return *this; } + auto operator++(int) noexcept -> appender { return *this; } +}; + +// A formatting argument. It is a trivially copyable/constructible type to +// allow storage in basic_memory_buffer. 
+template class basic_format_arg { + private: + detail::value value_; + detail::type type_; + + template + friend FMT_CONSTEXPR auto detail::make_arg(T&& value) + -> basic_format_arg; + + template + friend FMT_CONSTEXPR auto visit_format_arg(Visitor&& vis, + const basic_format_arg& arg) + -> decltype(vis(0)); + + friend class basic_format_args; + friend class dynamic_format_arg_store; + + using char_type = typename Context::char_type; + + template + friend struct detail::arg_data; + + basic_format_arg(const detail::named_arg_info* args, size_t size) + : value_(args, size) {} + + public: + class handle { + public: + explicit handle(detail::custom_value custom) : custom_(custom) {} + + void format(typename Context::parse_context_type& parse_ctx, + Context& ctx) const { + custom_.format(custom_.value, parse_ctx, ctx); + } + + private: + detail::custom_value custom_; + }; + + constexpr basic_format_arg() : type_(detail::type::none_type) {} + + constexpr explicit operator bool() const noexcept { + return type_ != detail::type::none_type; + } + + auto type() const -> detail::type { return type_; } + + auto is_integral() const -> bool { return detail::is_integral_type(type_); } + auto is_arithmetic() const -> bool { + return detail::is_arithmetic_type(type_); + } +}; + +/** + \rst + Visits an argument dispatching to the appropriate visit method based on + the argument type. For example, if the argument type is ``double`` then + ``vis(value)`` will be called with the value of type ``double``. 
+ \endrst + */ +template +FMT_CONSTEXPR FMT_INLINE auto visit_format_arg( + Visitor&& vis, const basic_format_arg& arg) -> decltype(vis(0)) { + switch (arg.type_) { + case detail::type::none_type: + break; + case detail::type::int_type: + return vis(arg.value_.int_value); + case detail::type::uint_type: + return vis(arg.value_.uint_value); + case detail::type::long_long_type: + return vis(arg.value_.long_long_value); + case detail::type::ulong_long_type: + return vis(arg.value_.ulong_long_value); + case detail::type::int128_type: + return vis(detail::convert_for_visit(arg.value_.int128_value)); + case detail::type::uint128_type: + return vis(detail::convert_for_visit(arg.value_.uint128_value)); + case detail::type::bool_type: + return vis(arg.value_.bool_value); + case detail::type::char_type: + return vis(arg.value_.char_value); + case detail::type::float_type: + return vis(arg.value_.float_value); + case detail::type::double_type: + return vis(arg.value_.double_value); + case detail::type::long_double_type: + return vis(arg.value_.long_double_value); + case detail::type::cstring_type: + return vis(arg.value_.string.data); + case detail::type::string_type: + using sv = basic_string_view; + return vis(sv(arg.value_.string.data, arg.value_.string.size)); + case detail::type::pointer_type: + return vis(arg.value_.pointer); + case detail::type::custom_type: + return vis(typename basic_format_arg::handle(arg.value_.custom)); + } + return vis(monostate()); +} + +FMT_BEGIN_DETAIL_NAMESPACE + +template +auto copy_str(InputIt begin, InputIt end, appender out) -> appender { + get_container(out).append(begin, end); + return out; +} + +template +FMT_CONSTEXPR auto copy_str(R&& rng, OutputIt out) -> OutputIt { + return detail::copy_str(rng.begin(), rng.end(), out); +} + +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500 +// A workaround for gcc 4.8 to make void_t work in a SFINAE context. 
+template struct void_t_impl { using type = void; }; +template +using void_t = typename detail::void_t_impl::type; +#else +template using void_t = void; +#endif + +template +struct is_output_iterator : std::false_type {}; + +template +struct is_output_iterator< + It, T, + void_t::iterator_category, + decltype(*std::declval() = std::declval())>> + : std::true_type {}; + +template +struct is_back_insert_iterator : std::false_type {}; +template +struct is_back_insert_iterator> + : std::true_type {}; + +template +struct is_contiguous_back_insert_iterator : std::false_type {}; +template +struct is_contiguous_back_insert_iterator> + : is_contiguous {}; +template <> +struct is_contiguous_back_insert_iterator : std::true_type {}; + +// A type-erased reference to an std::locale to avoid a heavy include. +class locale_ref { + private: + const void* locale_; // A type-erased pointer to std::locale. + + public: + constexpr FMT_INLINE locale_ref() : locale_(nullptr) {} + template explicit locale_ref(const Locale& loc); + + explicit operator bool() const noexcept { return locale_ != nullptr; } + + template auto get() const -> Locale; +}; + +template constexpr auto encode_types() -> unsigned long long { + return 0; +} + +template +constexpr auto encode_types() -> unsigned long long { + return static_cast(mapped_type_constant::value) | + (encode_types() << packed_arg_bits); +} + +template +FMT_CONSTEXPR FMT_INLINE auto make_value(T&& val) -> value { + const auto& arg = arg_mapper().map(FMT_FORWARD(val)); + + constexpr bool formattable_char = + !std::is_same::value; + static_assert(formattable_char, "Mixing character types is disallowed."); + + constexpr bool formattable_const = + !std::is_same::value; + static_assert(formattable_const, "Cannot format a const argument."); + + // Formatting of arbitrary pointers is disallowed. If you want to output + // a pointer cast it to "void *" or "const void *". 
In particular, this + // forbids formatting of "[const] volatile char *" which is printed as bool + // by iostreams. + constexpr bool formattable_pointer = + !std::is_same::value; + static_assert(formattable_pointer, + "Formatting of non-void pointers is disallowed."); + + constexpr bool formattable = + !std::is_same::value; + static_assert( + formattable, + "Cannot format an argument. To make type T formattable provide a " + "formatter specialization: https://fmt.dev/latest/api.html#udt"); + return {arg}; +} + +template +FMT_CONSTEXPR auto make_arg(T&& value) -> basic_format_arg { + basic_format_arg arg; + arg.type_ = mapped_type_constant::value; + arg.value_ = make_value(value); + return arg; +} + +// The type template parameter is there to avoid an ODR violation when using +// a fallback formatter in one translation unit and an implicit conversion in +// another (not recommended). +template +FMT_CONSTEXPR FMT_INLINE auto make_arg(T&& val) -> value { + return make_value(val); +} + +template +FMT_CONSTEXPR inline auto make_arg(T&& value) -> basic_format_arg { + return make_arg(value); +} +FMT_END_DETAIL_NAMESPACE + +// Formatting context. +template class basic_format_context { + public: + /** The character type for the output. */ + using char_type = Char; + + private: + OutputIt out_; + basic_format_args args_; + detail::locale_ref loc_; + + public: + using iterator = OutputIt; + using format_arg = basic_format_arg; + using parse_context_type = basic_format_parse_context; + template using formatter_type = formatter; + + basic_format_context(basic_format_context&&) = default; + basic_format_context(const basic_format_context&) = delete; + void operator=(const basic_format_context&) = delete; + /** + Constructs a ``basic_format_context`` object. References to the arguments are + stored in the object so make sure they have appropriate lifetimes. 
+ */ + constexpr basic_format_context( + OutputIt out, basic_format_args ctx_args, + detail::locale_ref loc = detail::locale_ref()) + : out_(out), args_(ctx_args), loc_(loc) {} + + constexpr auto arg(int id) const -> format_arg { return args_.get(id); } + FMT_CONSTEXPR auto arg(basic_string_view name) -> format_arg { + return args_.get(name); + } + FMT_CONSTEXPR auto arg_id(basic_string_view name) -> int { + return args_.get_id(name); + } + auto args() const -> const basic_format_args& { + return args_; + } + + FMT_CONSTEXPR auto error_handler() -> detail::error_handler { return {}; } + void on_error(const char* message) { error_handler().on_error(message); } + + // Returns an iterator to the beginning of the output range. + FMT_CONSTEXPR auto out() -> iterator { return out_; } + + // Advances the begin iterator to ``it``. + void advance_to(iterator it) { + if (!detail::is_back_insert_iterator()) out_ = it; + } + + FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; } +}; + +template +using buffer_context = + basic_format_context, Char>; +using format_context = buffer_context; + +// Workaround an alias issue: https://stackoverflow.com/q/62767544/471164. +#define FMT_BUFFER_CONTEXT(Char) \ + basic_format_context, Char> + +template +using is_formattable = bool_constant< + !std::is_base_of>().map( + std::declval()))>::value && + !detail::has_fallback_formatter::value>; + +/** + \rst + An array of references to arguments. It can be implicitly converted into + `~fmt::basic_format_args` for passing into type-erased formatting functions + such as `~fmt::vformat`. + \endrst + */ +template +class format_arg_store +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 + // Workaround a GCC template argument substitution bug. 
+ : public basic_format_args +#endif +{ + private: + static const size_t num_args = sizeof...(Args); + static const size_t num_named_args = detail::count_named_args(); + static const bool is_packed = num_args <= detail::max_packed_args; + + using value_type = conditional_t, + basic_format_arg>; + + detail::arg_data + data_; + + friend class basic_format_args; + + static constexpr unsigned long long desc = + (is_packed ? detail::encode_types() + : detail::is_unpacked_bit | num_args) | + (num_named_args != 0 + ? static_cast(detail::has_named_args_bit) + : 0); + + public: + template + FMT_CONSTEXPR FMT_INLINE format_arg_store(T&&... args) + : +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 + basic_format_args(*this), +#endif + data_{detail::make_arg< + is_packed, Context, + detail::mapped_type_constant, Context>::value>( + FMT_FORWARD(args))...} { + detail::init_named_args(data_.named_args(), 0, 0, args...); + } +}; + +/** + \rst + Constructs a `~fmt::format_arg_store` object that contains references to + arguments and can be implicitly converted to `~fmt::format_args`. `Context` + can be omitted in which case it defaults to `~fmt::context`. + See `~fmt::arg` for lifetime considerations. + \endrst + */ +template +constexpr auto make_format_args(Args&&... args) + -> format_arg_store...> { + return {FMT_FORWARD(args)...}; +} + +/** + \rst + Returns a named argument to be used in a formatting function. + It should only be used in a call to a formatting function or + `dynamic_format_arg_store::push_back`. + + **Example**:: + + fmt::print("Elapsed time: {s:.2f} seconds", fmt::arg("s", 1.23)); + \endrst + */ +template +inline auto arg(const Char* name, const T& arg) -> detail::named_arg { + static_assert(!detail::is_named_arg(), "nested named arguments"); + return {name, arg}; +} + +/** + \rst + A view of a collection of formatting arguments. 
To avoid lifetime issues it + should only be used as a parameter type in type-erased functions such as + ``vformat``:: + + void vlog(string_view format_str, format_args args); // OK + format_args args = make_format_args(42); // Error: dangling reference + \endrst + */ +template class basic_format_args { + public: + using size_type = int; + using format_arg = basic_format_arg; + + private: + // A descriptor that contains information about formatting arguments. + // If the number of arguments is less or equal to max_packed_args then + // argument types are passed in the descriptor. This reduces binary code size + // per formatting function call. + unsigned long long desc_; + union { + // If is_packed() returns true then argument values are stored in values_; + // otherwise they are stored in args_. This is done to improve cache + // locality and reduce compiled code size since storing larger objects + // may require more code (at least on x86-64) even if the same amount of + // data is actually copied to stack. It saves ~10% on the bloat test. + const detail::value* values_; + const format_arg* args_; + }; + + constexpr auto is_packed() const -> bool { + return (desc_ & detail::is_unpacked_bit) == 0; + } + auto has_named_args() const -> bool { + return (desc_ & detail::has_named_args_bit) != 0; + } + + FMT_CONSTEXPR auto type(int index) const -> detail::type { + int shift = index * detail::packed_arg_bits; + unsigned int mask = (1 << detail::packed_arg_bits) - 1; + return static_cast((desc_ >> shift) & mask); + } + + constexpr FMT_INLINE basic_format_args(unsigned long long desc, + const detail::value* values) + : desc_(desc), values_(values) {} + constexpr basic_format_args(unsigned long long desc, const format_arg* args) + : desc_(desc), args_(args) {} + + public: + constexpr basic_format_args() : desc_(0), args_(nullptr) {} + + /** + \rst + Constructs a `basic_format_args` object from `~fmt::format_arg_store`. 
+ \endrst + */ + template + constexpr FMT_INLINE basic_format_args( + const format_arg_store& store) + : basic_format_args(format_arg_store::desc, + store.data_.args()) {} + + /** + \rst + Constructs a `basic_format_args` object from + `~fmt::dynamic_format_arg_store`. + \endrst + */ + constexpr FMT_INLINE basic_format_args( + const dynamic_format_arg_store& store) + : basic_format_args(store.get_types(), store.data()) {} + + /** + \rst + Constructs a `basic_format_args` object from a dynamic set of arguments. + \endrst + */ + constexpr basic_format_args(const format_arg* args, int count) + : basic_format_args(detail::is_unpacked_bit | detail::to_unsigned(count), + args) {} + + /** Returns the argument with the specified id. */ + FMT_CONSTEXPR auto get(int id) const -> format_arg { + format_arg arg; + if (!is_packed()) { + if (id < max_size()) arg = args_[id]; + return arg; + } + if (id >= detail::max_packed_args) return arg; + arg.type_ = type(id); + if (arg.type_ == detail::type::none_type) return arg; + arg.value_ = values_[id]; + return arg; + } + + template + auto get(basic_string_view name) const -> format_arg { + int id = get_id(name); + return id >= 0 ? get(id) : format_arg(); + } + + template + auto get_id(basic_string_view name) const -> int { + if (!has_named_args()) return -1; + const auto& named_args = + (is_packed() ? values_[-1] : args_[-1].value_).named_args; + for (size_t i = 0; i < named_args.size; ++i) { + if (named_args.data[i].name == name) return named_args.data[i].id; + } + return -1; + } + + auto max_size() const -> int { + unsigned long long max_packed = detail::max_packed_args; + return static_cast(is_packed() ? max_packed + : desc_ & ~detail::is_unpacked_bit); + } +}; + +/** An alias to ``basic_format_args``. */ +// A separate type would result in shorter symbols but break ABI compatibility +// between clang and gcc on ARM (#1919). 
+using format_args = basic_format_args; + +// We cannot use enum classes as bit fields because of a gcc bug, so we put them +// in namespaces instead (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414). +// Additionally, if an underlying type is specified, older gcc incorrectly warns +// that the type is too small. Both bugs are fixed in gcc 9.3. +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 903 +# define FMT_ENUM_UNDERLYING_TYPE(type) +#else +# define FMT_ENUM_UNDERLYING_TYPE(type) : type +#endif +namespace align { +enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, left, right, center, + numeric}; +} +using align_t = align::type; +namespace sign { +enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char){none, minus, plus, space}; +} +using sign_t = sign::type; + +FMT_BEGIN_DETAIL_NAMESPACE + +// Workaround an array initialization issue in gcc 4.8. +template struct fill_t { + private: + enum { max_size = 4 }; + Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)}; + unsigned char size_ = 1; + + public: + FMT_CONSTEXPR void operator=(basic_string_view s) { + auto size = s.size(); + if (size > max_size) return throw_format_error("invalid fill"); + for (size_t i = 0; i < size; ++i) data_[i] = s[i]; + size_ = static_cast(size); + } + + constexpr auto size() const -> size_t { return size_; } + constexpr auto data() const -> const Char* { return data_; } + + FMT_CONSTEXPR auto operator[](size_t index) -> Char& { return data_[index]; } + FMT_CONSTEXPR auto operator[](size_t index) const -> const Char& { + return data_[index]; + } +}; +FMT_END_DETAIL_NAMESPACE + +enum class presentation_type : unsigned char { + none, + // Integer types should go first, + dec, // 'd' + oct, // 'o' + hex_lower, // 'x' + hex_upper, // 'X' + bin_lower, // 'b' + bin_upper, // 'B' + hexfloat_lower, // 'a' + hexfloat_upper, // 'A' + exp_lower, // 'e' + exp_upper, // 'E' + fixed_lower, // 'f' + fixed_upper, // 'F' + general_lower, // 'g' + general_upper, // 'G' + chr, // 'c' + string, // 's' 
+ pointer, // 'p' + debug // '?' +}; + +// Format specifiers for built-in and string types. +template struct basic_format_specs { + int width; + int precision; + presentation_type type; + align_t align : 4; + sign_t sign : 3; + bool alt : 1; // Alternate form ('#'). + bool localized : 1; + detail::fill_t fill; + + constexpr basic_format_specs() + : width(0), + precision(-1), + type(presentation_type::none), + align(align::none), + sign(sign::none), + alt(false), + localized(false) {} +}; + +using format_specs = basic_format_specs; + +FMT_BEGIN_DETAIL_NAMESPACE + +enum class arg_id_kind { none, index, name }; + +// An argument reference. +template struct arg_ref { + FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {} + + FMT_CONSTEXPR explicit arg_ref(int index) + : kind(arg_id_kind::index), val(index) {} + FMT_CONSTEXPR explicit arg_ref(basic_string_view name) + : kind(arg_id_kind::name), val(name) {} + + FMT_CONSTEXPR auto operator=(int idx) -> arg_ref& { + kind = arg_id_kind::index; + val.index = idx; + return *this; + } + + arg_id_kind kind; + union value { + FMT_CONSTEXPR value(int id = 0) : index{id} {} + FMT_CONSTEXPR value(basic_string_view n) : name(n) {} + + int index; + basic_string_view name; + } val; +}; + +// Format specifiers with width and precision resolved at formatting rather +// than parsing time to allow re-using the same parsed specifiers with +// different sets of arguments (precompilation of format strings). +template +struct dynamic_format_specs : basic_format_specs { + arg_ref width_ref; + arg_ref precision_ref; +}; + +struct auto_id {}; + +// A format specifier handler that sets fields in basic_format_specs. 
+template class specs_setter { + protected: + basic_format_specs& specs_; + + public: + explicit FMT_CONSTEXPR specs_setter(basic_format_specs& specs) + : specs_(specs) {} + + FMT_CONSTEXPR specs_setter(const specs_setter& other) + : specs_(other.specs_) {} + + FMT_CONSTEXPR void on_align(align_t align) { specs_.align = align; } + FMT_CONSTEXPR void on_fill(basic_string_view fill) { + specs_.fill = fill; + } + FMT_CONSTEXPR void on_sign(sign_t s) { specs_.sign = s; } + FMT_CONSTEXPR void on_hash() { specs_.alt = true; } + FMT_CONSTEXPR void on_localized() { specs_.localized = true; } + + FMT_CONSTEXPR void on_zero() { + if (specs_.align == align::none) specs_.align = align::numeric; + specs_.fill[0] = Char('0'); + } + + FMT_CONSTEXPR void on_width(int width) { specs_.width = width; } + FMT_CONSTEXPR void on_precision(int precision) { + specs_.precision = precision; + } + FMT_CONSTEXPR void end_precision() {} + + FMT_CONSTEXPR void on_type(presentation_type type) { specs_.type = type; } +}; + +// Format spec handler that saves references to arguments representing dynamic +// width and precision to be resolved at formatting time. 
+template +class dynamic_specs_handler + : public specs_setter { + public: + using char_type = typename ParseContext::char_type; + + FMT_CONSTEXPR dynamic_specs_handler(dynamic_format_specs& specs, + ParseContext& ctx) + : specs_setter(specs), specs_(specs), context_(ctx) {} + + FMT_CONSTEXPR dynamic_specs_handler(const dynamic_specs_handler& other) + : specs_setter(other), + specs_(other.specs_), + context_(other.context_) {} + + template FMT_CONSTEXPR void on_dynamic_width(Id arg_id) { + specs_.width_ref = make_arg_ref(arg_id); + } + + template FMT_CONSTEXPR void on_dynamic_precision(Id arg_id) { + specs_.precision_ref = make_arg_ref(arg_id); + } + + FMT_CONSTEXPR void on_error(const char* message) { + context_.on_error(message); + } + + private: + dynamic_format_specs& specs_; + ParseContext& context_; + + using arg_ref_type = arg_ref; + + FMT_CONSTEXPR auto make_arg_ref(int arg_id) -> arg_ref_type { + context_.check_arg_id(arg_id); + context_.check_dynamic_spec(arg_id); + return arg_ref_type(arg_id); + } + + FMT_CONSTEXPR auto make_arg_ref(auto_id) -> arg_ref_type { + int arg_id = context_.next_arg_id(); + context_.check_dynamic_spec(arg_id); + return arg_ref_type(arg_id); + } + + FMT_CONSTEXPR auto make_arg_ref(basic_string_view arg_id) + -> arg_ref_type { + context_.check_arg_id(arg_id); + basic_string_view format_str( + context_.begin(), to_unsigned(context_.end() - context_.begin())); + return arg_ref_type(arg_id); + } +}; + +template constexpr bool is_ascii_letter(Char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +// Converts a character to ASCII. Returns a number > 127 on conversion failure. 
+template ::value)> +constexpr auto to_ascii(Char c) -> Char { + return c; +} +template ::value)> +constexpr auto to_ascii(Char c) -> underlying_t { + return c; +} + +FMT_CONSTEXPR inline auto code_point_length_impl(char c) -> int { + return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4" + [static_cast(c) >> 3]; +} + +template +FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int { + if (const_check(sizeof(Char) != 1)) return 1; + int len = code_point_length_impl(static_cast(*begin)); + + // Compute the pointer to the next character early so that the next + // iteration can start working on the next character. Neither Clang + // nor GCC figure out this reordering on their own. + return len + !len; +} + +// Return the result via the out param to workaround gcc bug 77539. +template +FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr& out) -> bool { + for (out = first; out != last; ++out) { + if (*out == value) return true; + } + return false; +} + +template <> +inline auto find(const char* first, const char* last, char value, + const char*& out) -> bool { + out = static_cast( + std::memchr(first, value, to_unsigned(last - first))); + return out != nullptr; +} + +// Parses the range [begin, end) as an unsigned integer. This function assumes +// that the range is non-empty and the first character is a digit. +template +FMT_CONSTEXPR auto parse_nonnegative_int(const Char*& begin, const Char* end, + int error_value) noexcept -> int { + FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', ""); + unsigned value = 0, prev = 0; + auto p = begin; + do { + prev = value; + value = value * 10 + unsigned(*p - '0'); + ++p; + } while (p != end && '0' <= *p && *p <= '9'); + auto num_digits = p - begin; + begin = p; + if (num_digits <= std::numeric_limits::digits10) + return static_cast(value); + // Check for overflow. 
+ const unsigned max = to_unsigned((std::numeric_limits::max)()); + return num_digits == std::numeric_limits::digits10 + 1 && + prev * 10ull + unsigned(p[-1] - '0') <= max + ? static_cast(value) + : error_value; +} + +// Parses fill and alignment. +template +FMT_CONSTEXPR auto parse_align(const Char* begin, const Char* end, + Handler&& handler) -> const Char* { + FMT_ASSERT(begin != end, ""); + auto align = align::none; + auto p = begin + code_point_length(begin); + if (end - p <= 0) p = begin; + for (;;) { + switch (to_ascii(*p)) { + case '<': + align = align::left; + break; + case '>': + align = align::right; + break; + case '^': + align = align::center; + break; + default: + break; + } + if (align != align::none) { + if (p != begin) { + auto c = *begin; + if (c == '{') + return handler.on_error("invalid fill character '{'"), begin; + if (c == '}') return begin; + handler.on_fill(basic_string_view(begin, to_unsigned(p - begin))); + begin = p + 1; + } else + ++begin; + handler.on_align(align); + break; + } else if (p == begin) { + break; + } + p = begin; + } + return begin; +} + +template FMT_CONSTEXPR bool is_name_start(Char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c; +} + +template +FMT_CONSTEXPR auto do_parse_arg_id(const Char* begin, const Char* end, + IDHandler&& handler) -> const Char* { + FMT_ASSERT(begin != end, ""); + Char c = *begin; + if (c >= '0' && c <= '9') { + int index = 0; + if (c != '0') + index = + parse_nonnegative_int(begin, end, (std::numeric_limits::max)()); + else + ++begin; + if (begin == end || (*begin != '}' && *begin != ':')) + handler.on_error("invalid format string"); + else + handler(index); + return begin; + } + if (!is_name_start(c)) { + handler.on_error("invalid format string"); + return begin; + } + auto it = begin; + do { + ++it; + } while (it != end && (is_name_start(c = *it) || ('0' <= c && c <= '9'))); + handler(basic_string_view(begin, to_unsigned(it - begin))); + return it; +} + +template 
+FMT_CONSTEXPR FMT_INLINE auto parse_arg_id(const Char* begin, const Char* end, + IDHandler&& handler) -> const Char* { + Char c = *begin; + if (c != '}' && c != ':') return do_parse_arg_id(begin, end, handler); + handler(); + return begin; +} + +template +FMT_CONSTEXPR auto parse_width(const Char* begin, const Char* end, + Handler&& handler) -> const Char* { + using detail::auto_id; + struct width_adapter { + Handler& handler; + + FMT_CONSTEXPR void operator()() { handler.on_dynamic_width(auto_id()); } + FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_width(id); } + FMT_CONSTEXPR void operator()(basic_string_view id) { + handler.on_dynamic_width(id); + } + FMT_CONSTEXPR void on_error(const char* message) { + if (message) handler.on_error(message); + } + }; + + FMT_ASSERT(begin != end, ""); + if ('0' <= *begin && *begin <= '9') { + int width = parse_nonnegative_int(begin, end, -1); + if (width != -1) + handler.on_width(width); + else + handler.on_error("number is too big"); + } else if (*begin == '{') { + ++begin; + if (begin != end) begin = parse_arg_id(begin, end, width_adapter{handler}); + if (begin == end || *begin != '}') + return handler.on_error("invalid format string"), begin; + ++begin; + } + return begin; +} + +template +FMT_CONSTEXPR auto parse_precision(const Char* begin, const Char* end, + Handler&& handler) -> const Char* { + using detail::auto_id; + struct precision_adapter { + Handler& handler; + + FMT_CONSTEXPR void operator()() { handler.on_dynamic_precision(auto_id()); } + FMT_CONSTEXPR void operator()(int id) { handler.on_dynamic_precision(id); } + FMT_CONSTEXPR void operator()(basic_string_view id) { + handler.on_dynamic_precision(id); + } + FMT_CONSTEXPR void on_error(const char* message) { + if (message) handler.on_error(message); + } + }; + + ++begin; + auto c = begin != end ? 
*begin : Char(); + if ('0' <= c && c <= '9') { + auto precision = parse_nonnegative_int(begin, end, -1); + if (precision != -1) + handler.on_precision(precision); + else + handler.on_error("number is too big"); + } else if (c == '{') { + ++begin; + if (begin != end) + begin = parse_arg_id(begin, end, precision_adapter{handler}); + if (begin == end || *begin++ != '}') + return handler.on_error("invalid format string"), begin; + } else { + return handler.on_error("missing precision specifier"), begin; + } + handler.end_precision(); + return begin; +} + +template +FMT_CONSTEXPR auto parse_presentation_type(Char type) -> presentation_type { + switch (to_ascii(type)) { + case 'd': + return presentation_type::dec; + case 'o': + return presentation_type::oct; + case 'x': + return presentation_type::hex_lower; + case 'X': + return presentation_type::hex_upper; + case 'b': + return presentation_type::bin_lower; + case 'B': + return presentation_type::bin_upper; + case 'a': + return presentation_type::hexfloat_lower; + case 'A': + return presentation_type::hexfloat_upper; + case 'e': + return presentation_type::exp_lower; + case 'E': + return presentation_type::exp_upper; + case 'f': + return presentation_type::fixed_lower; + case 'F': + return presentation_type::fixed_upper; + case 'g': + return presentation_type::general_lower; + case 'G': + return presentation_type::general_upper; + case 'c': + return presentation_type::chr; + case 's': + return presentation_type::string; + case 'p': + return presentation_type::pointer; + case '?': + return presentation_type::debug; + default: + return presentation_type::none; + } +} + +// Parses standard format specifiers and sends notifications about parsed +// components to handler. 
+template +FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(const Char* begin, + const Char* end, + SpecHandler&& handler) + -> const Char* { + if (1 < end - begin && begin[1] == '}' && is_ascii_letter(*begin) && + *begin != 'L') { + presentation_type type = parse_presentation_type(*begin++); + if (type == presentation_type::none) + handler.on_error("invalid type specifier"); + handler.on_type(type); + return begin; + } + + if (begin == end) return begin; + + begin = parse_align(begin, end, handler); + if (begin == end) return begin; + + // Parse sign. + switch (to_ascii(*begin)) { + case '+': + handler.on_sign(sign::plus); + ++begin; + break; + case '-': + handler.on_sign(sign::minus); + ++begin; + break; + case ' ': + handler.on_sign(sign::space); + ++begin; + break; + default: + break; + } + if (begin == end) return begin; + + if (*begin == '#') { + handler.on_hash(); + if (++begin == end) return begin; + } + + // Parse zero flag. + if (*begin == '0') { + handler.on_zero(); + if (++begin == end) return begin; + } + + begin = parse_width(begin, end, handler); + if (begin == end) return begin; + + // Parse precision. + if (*begin == '.') { + begin = parse_precision(begin, end, handler); + if (begin == end) return begin; + } + + if (*begin == 'L') { + handler.on_localized(); + ++begin; + } + + // Parse type. 
+ if (begin != end && *begin != '}') { + presentation_type type = parse_presentation_type(*begin++); + if (type == presentation_type::none) + handler.on_error("invalid type specifier"); + handler.on_type(type); + } + return begin; +} + +template +FMT_CONSTEXPR auto parse_replacement_field(const Char* begin, const Char* end, + Handler&& handler) -> const Char* { + struct id_adapter { + Handler& handler; + int arg_id; + + FMT_CONSTEXPR void operator()() { arg_id = handler.on_arg_id(); } + FMT_CONSTEXPR void operator()(int id) { arg_id = handler.on_arg_id(id); } + FMT_CONSTEXPR void operator()(basic_string_view id) { + arg_id = handler.on_arg_id(id); + } + FMT_CONSTEXPR void on_error(const char* message) { + if (message) handler.on_error(message); + } + }; + + ++begin; + if (begin == end) return handler.on_error("invalid format string"), end; + if (*begin == '}') { + handler.on_replacement_field(handler.on_arg_id(), begin); + } else if (*begin == '{') { + handler.on_text(begin, begin + 1); + } else { + auto adapter = id_adapter{handler, 0}; + begin = parse_arg_id(begin, end, adapter); + Char c = begin != end ? *begin : Char(); + if (c == '}') { + handler.on_replacement_field(adapter.arg_id, begin); + } else if (c == ':') { + begin = handler.on_format_specs(adapter.arg_id, begin + 1, end); + if (begin == end || *begin != '}') + return handler.on_error("unknown format specifier"), end; + } else { + return handler.on_error("missing '}' in format string"), end; + } + } + return begin + 1; +} + +template +FMT_CONSTEXPR FMT_INLINE void parse_format_string( + basic_string_view format_str, Handler&& handler) { + // Workaround a name-lookup bug in MSVC's modules implementation. + using detail::find; + + auto begin = format_str.data(); + auto end = begin + format_str.size(); + if (end - begin < 32) { + // Use a simple loop instead of memchr for small strings. 
+ const Char* p = begin; + while (p != end) { + auto c = *p++; + if (c == '{') { + handler.on_text(begin, p - 1); + begin = p = parse_replacement_field(p - 1, end, handler); + } else if (c == '}') { + if (p == end || *p != '}') + return handler.on_error("unmatched '}' in format string"); + handler.on_text(begin, p); + begin = ++p; + } + } + handler.on_text(begin, end); + return; + } + struct writer { + FMT_CONSTEXPR void operator()(const Char* from, const Char* to) { + if (from == to) return; + for (;;) { + const Char* p = nullptr; + if (!find(from, to, Char('}'), p)) + return handler_.on_text(from, to); + ++p; + if (p == to || *p != '}') + return handler_.on_error("unmatched '}' in format string"); + handler_.on_text(from, p); + from = p + 1; + } + } + Handler& handler_; + } write = {handler}; + while (begin != end) { + // Doing two passes with memchr (one for '{' and another for '}') is up to + // 2.5x faster than the naive one-pass implementation on big format strings. + const Char* p = begin; + if (*begin != '{' && !find(begin + 1, end, Char('{'), p)) + return write(begin, end); + write(begin, p); + begin = parse_replacement_field(p, end, handler); + } +} + +template ::value> struct strip_named_arg { + using type = T; +}; +template struct strip_named_arg { + using type = remove_cvref_t; +}; + +template +FMT_CONSTEXPR auto parse_format_specs(ParseContext& ctx) + -> decltype(ctx.begin()) { + using char_type = typename ParseContext::char_type; + using context = buffer_context; + using stripped_type = typename strip_named_arg::type; + using mapped_type = conditional_t< + mapped_type_constant::value != type::custom_type, + decltype(arg_mapper().map(std::declval())), + stripped_type>; + auto f = conditional_t::value, + formatter, + fallback_formatter>(); + return f.parse(ctx); +} + +template +FMT_CONSTEXPR void check_int_type_spec(presentation_type type, + ErrorHandler&& eh) { + if (type > presentation_type::bin_upper && type != presentation_type::chr) + 
eh.on_error("invalid type specifier"); +} + +// Checks char specs and returns true if the type spec is char (and not int). +template +FMT_CONSTEXPR auto check_char_specs(const basic_format_specs& specs, + ErrorHandler&& eh = {}) -> bool { + if (specs.type != presentation_type::none && + specs.type != presentation_type::chr && + specs.type != presentation_type::debug) { + check_int_type_spec(specs.type, eh); + return false; + } + if (specs.align == align::numeric || specs.sign != sign::none || specs.alt) + eh.on_error("invalid format specifier for char"); + return true; +} + +// A floating-point presentation format. +enum class float_format : unsigned char { + general, // General: exponent notation or fixed point based on magnitude. + exp, // Exponent notation with the default precision of 6, e.g. 1.2e-3. + fixed, // Fixed point with the default precision of 6, e.g. 0.0012. + hex +}; + +struct float_specs { + int precision; + float_format format : 8; + sign_t sign : 8; + bool upper : 1; + bool locale : 1; + bool binary32 : 1; + bool showpoint : 1; +}; + +template +FMT_CONSTEXPR auto parse_float_type_spec(const basic_format_specs& specs, + ErrorHandler&& eh = {}) + -> float_specs { + auto result = float_specs(); + result.showpoint = specs.alt; + result.locale = specs.localized; + switch (specs.type) { + case presentation_type::none: + result.format = float_format::general; + break; + case presentation_type::general_upper: + result.upper = true; + FMT_FALLTHROUGH; + case presentation_type::general_lower: + result.format = float_format::general; + break; + case presentation_type::exp_upper: + result.upper = true; + FMT_FALLTHROUGH; + case presentation_type::exp_lower: + result.format = float_format::exp; + result.showpoint |= specs.precision != 0; + break; + case presentation_type::fixed_upper: + result.upper = true; + FMT_FALLTHROUGH; + case presentation_type::fixed_lower: + result.format = float_format::fixed; + result.showpoint |= specs.precision != 0; + break; + 
case presentation_type::hexfloat_upper: + result.upper = true; + FMT_FALLTHROUGH; + case presentation_type::hexfloat_lower: + result.format = float_format::hex; + break; + default: + eh.on_error("invalid type specifier"); + break; + } + return result; +} + +template +FMT_CONSTEXPR auto check_cstring_type_spec(presentation_type type, + ErrorHandler&& eh = {}) -> bool { + if (type == presentation_type::none || type == presentation_type::string || + type == presentation_type::debug) + return true; + if (type != presentation_type::pointer) eh.on_error("invalid type specifier"); + return false; +} + +template +FMT_CONSTEXPR void check_string_type_spec(presentation_type type, + ErrorHandler&& eh = {}) { + if (type != presentation_type::none && type != presentation_type::string && + type != presentation_type::debug) + eh.on_error("invalid type specifier"); +} + +template +FMT_CONSTEXPR void check_pointer_type_spec(presentation_type type, + ErrorHandler&& eh) { + if (type != presentation_type::none && type != presentation_type::pointer) + eh.on_error("invalid type specifier"); +} + +// A parse_format_specs handler that checks if specifiers are consistent with +// the argument type. 
+template class specs_checker : public Handler { + private: + detail::type arg_type_; + + FMT_CONSTEXPR void require_numeric_argument() { + if (!is_arithmetic_type(arg_type_)) + this->on_error("format specifier requires numeric argument"); + } + + public: + FMT_CONSTEXPR specs_checker(const Handler& handler, detail::type arg_type) + : Handler(handler), arg_type_(arg_type) {} + + FMT_CONSTEXPR void on_align(align_t align) { + if (align == align::numeric) require_numeric_argument(); + Handler::on_align(align); + } + + FMT_CONSTEXPR void on_sign(sign_t s) { + require_numeric_argument(); + if (is_integral_type(arg_type_) && arg_type_ != type::int_type && + arg_type_ != type::long_long_type && arg_type_ != type::int128_type && + arg_type_ != type::char_type) { + this->on_error("format specifier requires signed argument"); + } + Handler::on_sign(s); + } + + FMT_CONSTEXPR void on_hash() { + require_numeric_argument(); + Handler::on_hash(); + } + + FMT_CONSTEXPR void on_localized() { + require_numeric_argument(); + Handler::on_localized(); + } + + FMT_CONSTEXPR void on_zero() { + require_numeric_argument(); + Handler::on_zero(); + } + + FMT_CONSTEXPR void end_precision() { + if (is_integral_type(arg_type_) || arg_type_ == type::pointer_type) + this->on_error("precision not allowed for this argument type"); + } +}; + +constexpr int invalid_arg_index = -1; + +#if FMT_USE_NONTYPE_TEMPLATE_ARGS +template +constexpr auto get_arg_index_by_name(basic_string_view name) -> int { + if constexpr (detail::is_statically_named_arg()) { + if (name == T::name) return N; + } + if constexpr (sizeof...(Args) > 0) + return get_arg_index_by_name(name); + (void)name; // Workaround an MSVC bug about "unused" parameter. 
+ return invalid_arg_index; +} +#endif + +template +FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view name) -> int { +#if FMT_USE_NONTYPE_TEMPLATE_ARGS + if constexpr (sizeof...(Args) > 0) + return get_arg_index_by_name<0, Args...>(name); +#endif + (void)name; + return invalid_arg_index; +} + +template +class format_string_checker { + private: + // In the future basic_format_parse_context will replace compile_parse_context + // here and will use is_constant_evaluated and downcasting to access the data + // needed for compile-time checks: https://godbolt.org/z/GvWzcTjh1. + using parse_context_type = compile_parse_context; + static constexpr int num_args = sizeof...(Args); + + // Format specifier parsing function. + using parse_func = const Char* (*)(parse_context_type&); + + parse_context_type context_; + parse_func parse_funcs_[num_args > 0 ? static_cast(num_args) : 1]; + type types_[num_args > 0 ? static_cast(num_args) : 1]; + + public: + explicit FMT_CONSTEXPR format_string_checker( + basic_string_view format_str, ErrorHandler eh) + : context_(format_str, num_args, types_, eh), + parse_funcs_{&parse_format_specs...}, + types_{ + mapped_type_constant>::value...} { + } + + FMT_CONSTEXPR void on_text(const Char*, const Char*) {} + + FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); } + FMT_CONSTEXPR auto on_arg_id(int id) -> int { + return context_.check_arg_id(id), id; + } + FMT_CONSTEXPR auto on_arg_id(basic_string_view id) -> int { +#if FMT_USE_NONTYPE_TEMPLATE_ARGS + auto index = get_arg_index_by_name(id); + if (index == invalid_arg_index) on_error("named argument is not found"); + return context_.check_arg_id(index), index; +#else + (void)id; + on_error("compile-time checks for named arguments require C++20 support"); + return 0; +#endif + } + + FMT_CONSTEXPR void on_replacement_field(int, const Char*) {} + + FMT_CONSTEXPR auto on_format_specs(int id, const Char* begin, const Char*) + -> const Char* { + 
context_.advance_to(context_.begin() + (begin - &*context_.begin())); + // id >= 0 check is a workaround for gcc 10 bug (#2065). + return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin; + } + + FMT_CONSTEXPR void on_error(const char* message) { + context_.on_error(message); + } +}; + +// Reports a compile-time error if S is not a valid format string. +template ::value)> +FMT_INLINE void check_format_string(const S&) { +#ifdef FMT_ENFORCE_COMPILE_STRING + static_assert(is_compile_string::value, + "FMT_ENFORCE_COMPILE_STRING requires all format strings to use " + "FMT_STRING."); +#endif +} +template ::value)> +void check_format_string(S format_str) { + FMT_CONSTEXPR auto s = basic_string_view(format_str); + using checker = format_string_checker...>; + FMT_CONSTEXPR bool invalid_format = + (parse_format_string(s, checker(s, {})), true); + ignore_unused(invalid_format); +} + +// Don't use type_identity for args to simplify symbols. +template +void vformat_to(buffer& buf, basic_string_view fmt, + basic_format_args args, + locale_ref loc = {}); + +FMT_API void vprint_mojibake(std::FILE*, string_view, format_args); +#ifndef _WIN32 +inline void vprint_mojibake(std::FILE*, string_view, format_args) {} +#endif +FMT_END_DETAIL_NAMESPACE + +// A formatter specialization for the core types corresponding to detail::type +// constants. +template +struct formatter::value != + detail::type::custom_type>> { + private: + detail::dynamic_format_specs specs_; + + public: + // Parses format specifiers stopping either at the end of the range or at the + // terminating '}'. 
+ template + FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) { + auto begin = ctx.begin(), end = ctx.end(); + if (begin == end) return begin; + using handler_type = detail::dynamic_specs_handler; + auto type = detail::type_constant::value; + auto checker = + detail::specs_checker(handler_type(specs_, ctx), type); + auto it = detail::parse_format_specs(begin, end, checker); + auto eh = ctx.error_handler(); + switch (type) { + case detail::type::none_type: + FMT_ASSERT(false, "invalid argument type"); + break; + case detail::type::bool_type: + if (specs_.type == presentation_type::none || + specs_.type == presentation_type::string) { + break; + } + FMT_FALLTHROUGH; + case detail::type::int_type: + case detail::type::uint_type: + case detail::type::long_long_type: + case detail::type::ulong_long_type: + case detail::type::int128_type: + case detail::type::uint128_type: + detail::check_int_type_spec(specs_.type, eh); + break; + case detail::type::char_type: + detail::check_char_specs(specs_, eh); + break; + case detail::type::float_type: + if (detail::const_check(FMT_USE_FLOAT)) + detail::parse_float_type_spec(specs_, eh); + else + FMT_ASSERT(false, "float support disabled"); + break; + case detail::type::double_type: + if (detail::const_check(FMT_USE_DOUBLE)) + detail::parse_float_type_spec(specs_, eh); + else + FMT_ASSERT(false, "double support disabled"); + break; + case detail::type::long_double_type: + if (detail::const_check(FMT_USE_LONG_DOUBLE)) + detail::parse_float_type_spec(specs_, eh); + else + FMT_ASSERT(false, "long double support disabled"); + break; + case detail::type::cstring_type: + detail::check_cstring_type_spec(specs_.type, eh); + break; + case detail::type::string_type: + detail::check_string_type_spec(specs_.type, eh); + break; + case detail::type::pointer_type: + detail::check_pointer_type_spec(specs_.type, eh); + break; + case detail::type::custom_type: + // Custom format specifiers are checked in parse functions of + // 
formatter specializations. + break; + } + return it; + } + + template ::value, + enable_if_t<(U == detail::type::string_type || + U == detail::type::cstring_type || + U == detail::type::char_type), + int> = 0> + FMT_CONSTEXPR void set_debug_format() { + specs_.type = presentation_type::debug; + } + + template + FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const + -> decltype(ctx.out()); +}; + +#define FMT_FORMAT_AS(Type, Base) \ + template \ + struct formatter : formatter { \ + template \ + auto format(Type const& val, FormatContext& ctx) const \ + -> decltype(ctx.out()) { \ + return formatter::format(static_cast(val), ctx); \ + } \ + } + +FMT_FORMAT_AS(signed char, int); +FMT_FORMAT_AS(unsigned char, unsigned); +FMT_FORMAT_AS(short, int); +FMT_FORMAT_AS(unsigned short, unsigned); +FMT_FORMAT_AS(long, long long); +FMT_FORMAT_AS(unsigned long, unsigned long long); +FMT_FORMAT_AS(Char*, const Char*); +FMT_FORMAT_AS(std::basic_string, basic_string_view); +FMT_FORMAT_AS(std::nullptr_t, const void*); +FMT_FORMAT_AS(detail::std_string_view, basic_string_view); + +template struct basic_runtime { basic_string_view str; }; + +/** A compile-time format string. 
*/ +template class basic_format_string { + private: + basic_string_view str_; + + public: + template >::value)> + FMT_CONSTEVAL FMT_INLINE basic_format_string(const S& s) : str_(s) { + static_assert( + detail::count< + (std::is_base_of>::value && + std::is_reference::value)...>() == 0, + "passing views as lvalues is disallowed"); +#ifdef FMT_HAS_CONSTEVAL + if constexpr (detail::count_named_args() == + detail::count_statically_named_args()) { + using checker = detail::format_string_checker...>; + detail::parse_format_string(str_, checker(s, {})); + } +#else + detail::check_format_string(s); +#endif + } + basic_format_string(basic_runtime r) : str_(r.str) {} + + FMT_INLINE operator basic_string_view() const { return str_; } +}; + +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 +// Workaround broken conversion on older gcc. +template using format_string = string_view; +inline auto runtime(string_view s) -> string_view { return s; } +#else +template +using format_string = basic_format_string...>; +/** + \rst + Creates a runtime format string. + + **Example**:: + + // Check format string at runtime instead of compile-time. + fmt::print(fmt::runtime("{:d}"), "I am not a number"); + \endrst + */ +inline auto runtime(string_view s) -> basic_runtime { return {{s}}; } +#endif + +FMT_API auto vformat(string_view fmt, format_args args) -> std::string; + +/** + \rst + Formats ``args`` according to specifications in ``fmt`` and returns the result + as a string. + + **Example**:: + + #include + std::string message = fmt::format("The answer is {}.", 42); + \endrst +*/ +template +FMT_NODISCARD FMT_INLINE auto format(format_string fmt, T&&... args) + -> std::string { + return vformat(fmt, fmt::make_format_args(args...)); +} + +/** Formats a string and writes the output to ``out``. 
*/ +template ::value)> +auto vformat_to(OutputIt out, string_view fmt, format_args args) -> OutputIt { + auto&& buf = detail::get_buffer(out); + detail::vformat_to(buf, fmt, args, {}); + return detail::get_iterator(buf, out); +} + +/** + \rst + Formats ``args`` according to specifications in ``fmt``, writes the result to + the output iterator ``out`` and returns the iterator past the end of the output + range. `format_to` does not append a terminating null character. + + **Example**:: + + auto out = std::vector(); + fmt::format_to(std::back_inserter(out), "{}", 42); + \endrst + */ +template ::value)> +FMT_INLINE auto format_to(OutputIt out, format_string fmt, T&&... args) + -> OutputIt { + return vformat_to(out, fmt, fmt::make_format_args(args...)); +} + +template struct format_to_n_result { + /** Iterator past the end of the output range. */ + OutputIt out; + /** Total (not truncated) output size. */ + size_t size; +}; + +template ::value)> +auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args) + -> format_to_n_result { + using traits = detail::fixed_buffer_traits; + auto buf = detail::iterator_buffer(out, n); + detail::vformat_to(buf, fmt, args, {}); + return {buf.out(), buf.count()}; +} + +/** + \rst + Formats ``args`` according to specifications in ``fmt``, writes up to ``n`` + characters of the result to the output iterator ``out`` and returns the total + (not truncated) output size and the iterator past the end of the output range. + `format_to_n` does not append a terminating null character. + \endrst + */ +template ::value)> +FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string fmt, + T&&... args) -> format_to_n_result { + return vformat_to_n(out, n, fmt, fmt::make_format_args(args...)); +} + +/** Returns the number of chars in the output of ``format(fmt, args...)``. */ +template +FMT_NODISCARD FMT_INLINE auto formatted_size(format_string fmt, + T&&... 
args) -> size_t { + auto buf = detail::counting_buffer<>(); + detail::vformat_to(buf, string_view(fmt), + format_args(fmt::make_format_args(args...)), {}); + return buf.count(); +} + +FMT_API void vprint(string_view fmt, format_args args); +FMT_API void vprint(std::FILE* f, string_view fmt, format_args args); + +/** + \rst + Formats ``args`` according to specifications in ``fmt`` and writes the output + to ``stdout``. + + **Example**:: + + fmt::print("Elapsed time: {0:.2f} seconds", 1.23); + \endrst + */ +template +FMT_INLINE void print(format_string fmt, T&&... args) { + const auto& vargs = fmt::make_format_args(args...); + return detail::is_utf8() ? vprint(fmt, vargs) + : detail::vprint_mojibake(stdout, fmt, vargs); +} + +/** + \rst + Formats ``args`` according to specifications in ``fmt`` and writes the + output to the file ``f``. + + **Example**:: + + fmt::print(stderr, "Don't {}!", "panic"); + \endrst + */ +template +FMT_INLINE void print(std::FILE* f, format_string fmt, T&&... args) { + const auto& vargs = fmt::make_format_args(args...); + return detail::is_utf8() ? vprint(f, fmt, vargs) + : detail::vprint_mojibake(f, fmt, vargs); +} + +FMT_MODULE_EXPORT_END +FMT_GCC_PRAGMA("GCC pop_options") +FMT_END_NAMESPACE + +#ifdef FMT_HEADER_ONLY +# include "format.h" +#endif +#endif // FMT_CORE_H_ diff --git a/libkram/fmt/fmt.cpp b/libkram/fmt/fmt.cpp new file mode 100644 index 00000000..971d46da --- /dev/null +++ b/libkram/fmt/fmt.cpp @@ -0,0 +1,100 @@ +module; +#ifndef __cpp_modules +# error Module not supported. 
+#endif + +// put all implementation-provided headers into the global module fragment +// to prevent attachment to this module +#if !defined(_CRT_SECURE_NO_WARNINGS) && defined(_MSC_VER) +# define _CRT_SECURE_NO_WARNINGS +#endif +#if !defined(WIN32_LEAN_AND_MEAN) && defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if _MSC_VER +# include +#endif +#if defined __APPLE__ || defined(__FreeBSD__) +# include +#endif +#if __has_include() +# include +#endif +#if (__has_include() || defined(__APPLE__) || \ + defined(__linux__)) && \ + (!defined(WINAPI_FAMILY) || (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP)) +# include +# include +# include +# ifndef _WIN32 +# include +# else +# include +# endif +#endif +#ifdef _WIN32 +# include +#endif + +export module fmt; + +#define FMT_MODULE_EXPORT export +#define FMT_MODULE_EXPORT_BEGIN export { +#define FMT_MODULE_EXPORT_END } +#define FMT_BEGIN_DETAIL_NAMESPACE \ + } \ + namespace detail { +#define FMT_END_DETAIL_NAMESPACE \ + } \ + export { +// all library-provided declarations and definitions +// must be in the module purview to be exported +#include "args.h" +#include "chrono.h" +#include "color.h" +#include "compile.h" +#include "format.h" +#include "os.h" +#include "printf.h" +#include "xchar.h" + +// gcc doesn't yet implement private module fragments +#if !FMT_GCC_VERSION +module : private; +#endif + +// These are already included in project +//#include "format.cpp" +//#include "os.cpp" diff --git a/libkram/fmt/format-inl.h b/libkram/fmt/format-inl.h new file mode 100644 index 00000000..2d3a4d61 --- /dev/null +++ b/libkram/fmt/format-inl.h @@ -0,0 +1,1754 @@ +// Formatting library for C++ - implementation +// +// Copyright 
(c) 2012 - 2016, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. + +#ifndef FMT_FORMAT_INL_H_ +#define FMT_FORMAT_INL_H_ + +#include +#include +#include // errno +#include +#include +#include +#include // std::memmove +#include +#include + +#ifndef FMT_STATIC_THOUSANDS_SEPARATOR +# include +#endif + +#ifdef _WIN32 +# include // _isatty +#endif + +#include "format.h" + +FMT_BEGIN_NAMESPACE +namespace detail { + +FMT_FUNC void assert_fail(const char* file, int line, const char* message) { + // Use unchecked std::fprintf to avoid triggering another assertion when + // writing to stderr fails + std::fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message); + // Chosen instead of std::abort to satisfy Clang in CUDA mode during device + // code pass. + std::terminate(); +} + +FMT_FUNC void throw_format_error(const char* message) { + FMT_THROW(format_error(message)); +} + +FMT_FUNC void format_error_code(detail::buffer& out, int error_code, + string_view message) noexcept { + // Report error code making sure that the output fits into + // inline_buffer_size to avoid dynamic memory allocation and potential + // bad_alloc. + out.try_resize(0); + static const char SEP[] = ": "; + static const char ERROR_STR[] = "error "; + // Subtract 2 to account for terminating null characters in SEP and ERROR_STR. 
+ size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2; + auto abs_value = static_cast>(error_code); + if (detail::is_negative(error_code)) { + abs_value = 0 - abs_value; + ++error_code_size; + } + error_code_size += detail::to_unsigned(detail::count_digits(abs_value)); + auto it = buffer_appender(out); + if (message.size() <= inline_buffer_size - error_code_size) + format_to(it, FMT_STRING("{}{}"), message, SEP); + format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code); + FMT_ASSERT(out.size() <= inline_buffer_size, ""); +} + +FMT_FUNC void report_error(format_func func, int error_code, + const char* message) noexcept { + memory_buffer full_message; + func(full_message, error_code, message); + // Don't use fwrite_fully because the latter may throw. + if (std::fwrite(full_message.data(), full_message.size(), 1, stderr) > 0) + std::fputc('\n', stderr); +} + +// A wrapper around fwrite that throws on error. +inline void fwrite_fully(const void* ptr, size_t size, size_t count, + FILE* stream) { + size_t written = std::fwrite(ptr, size, count, stream); + if (written < count) + FMT_THROW(system_error(errno, FMT_STRING("cannot write to file"))); +} + +#ifndef FMT_STATIC_THOUSANDS_SEPARATOR +template +locale_ref::locale_ref(const Locale& loc) : locale_(&loc) { + static_assert(std::is_same::value, ""); +} + +template Locale locale_ref::get() const { + static_assert(std::is_same::value, ""); + return locale_ ? *static_cast(locale_) : std::locale(); +} + +template +FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result { + auto& facet = std::use_facet>(loc.get()); + auto grouping = facet.grouping(); + auto thousands_sep = grouping.empty() ? 
Char() : facet.thousands_sep(); + return {std::move(grouping), thousands_sep}; +} +template FMT_FUNC Char decimal_point_impl(locale_ref loc) { + return std::use_facet>(loc.get()) + .decimal_point(); +} +#else +template +FMT_FUNC auto thousands_sep_impl(locale_ref) -> thousands_sep_result { + return {"\03", FMT_STATIC_THOUSANDS_SEPARATOR}; +} +template FMT_FUNC Char decimal_point_impl(locale_ref) { + return '.'; +} +#endif + +FMT_FUNC auto write_loc(appender out, loc_value value, + const format_specs& specs, locale_ref loc) -> bool { +#ifndef FMT_STATIC_THOUSANDS_SEPARATOR + auto locale = loc.get(); + // We cannot use the num_put facet because it may produce output in + // a wrong encoding. + using facet = format_facet; + if (std::has_facet(locale)) + return std::use_facet(locale).put(out, value, specs); + return facet(locale).put(out, value, specs); +#endif + return false; +} +} // namespace detail + +template typename Locale::id format_facet::id; + +#ifndef FMT_STATIC_THOUSANDS_SEPARATOR +template format_facet::format_facet(Locale& loc) { + auto& numpunct = std::use_facet>(loc); + grouping_ = numpunct.grouping(); + if (!grouping_.empty()) separator_ = std::string(1, numpunct.thousands_sep()); +} + +template <> +FMT_API FMT_FUNC auto format_facet::do_put( + appender out, loc_value val, const format_specs& specs) const -> bool { + return val.visit( + detail::loc_writer<>{out, specs, separator_, grouping_, decimal_point_}); +} +#endif + +#if !FMT_MSC_VERSION +FMT_API FMT_FUNC format_error::~format_error() noexcept = default; +#endif + +FMT_FUNC std::system_error vsystem_error(int error_code, string_view format_str, + format_args args) { + auto ec = std::error_code(error_code, std::generic_category()); + return std::system_error(ec, vformat(format_str, args)); +} + +namespace detail { + +template inline bool operator==(basic_fp x, basic_fp y) { + return x.f == y.f && x.e == y.e; +} + +// Compilers should be able to optimize this into the ror instruction. 
+FMT_CONSTEXPR inline uint32_t rotr(uint32_t n, uint32_t r) noexcept { + r &= 31; + return (n >> r) | (n << (32 - r)); +} +FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept { + r &= 63; + return (n >> r) | (n << (64 - r)); +} + +// Computes 128-bit result of multiplication of two 64-bit unsigned integers. +inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept { +#if FMT_USE_INT128 + auto p = static_cast(x) * static_cast(y); + return {static_cast(p >> 64), static_cast(p)}; +#elif defined(_MSC_VER) && defined(_M_X64) + auto result = uint128_fallback(); + result.lo_ = _umul128(x, y, &result.hi_); + return result; +#else + const uint64_t mask = static_cast(max_value()); + + uint64_t a = x >> 32; + uint64_t b = x & mask; + uint64_t c = y >> 32; + uint64_t d = y & mask; + + uint64_t ac = a * c; + uint64_t bc = b * c; + uint64_t ad = a * d; + uint64_t bd = b * d; + + uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask); + + return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32), + (intermediate << 32) + (bd & mask)}; +#endif +} + +// Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox. +namespace dragonbox { +// Computes upper 64 bits of multiplication of two 64-bit unsigned integers. +inline uint64_t umul128_upper64(uint64_t x, uint64_t y) noexcept { +#if FMT_USE_INT128 + auto p = static_cast(x) * static_cast(y); + return static_cast(p >> 64); +#elif defined(_MSC_VER) && defined(_M_X64) + return __umulh(x, y); +#else + return umul128(x, y).high(); +#endif +} + +// Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a +// 128-bit unsigned integer. +inline uint128_fallback umul192_upper128(uint64_t x, + uint128_fallback y) noexcept { + uint128_fallback r = umul128(x, y.high()); + r += umul128_upper64(x, y.low()); + return r; +} + +// Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a +// 64-bit unsigned integer. 
+inline uint64_t umul96_upper64(uint32_t x, uint64_t y) noexcept { + return umul128_upper64(static_cast(x) << 32, y); +} + +// Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a +// 128-bit unsigned integer. +inline uint128_fallback umul192_lower128(uint64_t x, + uint128_fallback y) noexcept { + uint64_t high = x * y.high(); + uint128_fallback high_low = umul128(x, y.low()); + return {high + high_low.high(), high_low.low()}; +} + +// Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a +// 64-bit unsigned integer. +inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept { + return x * y; +} + +// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from +// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1. +inline int floor_log10_pow2(int e) noexcept { + FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent"); + static_assert((-1 >> 1) == -1, "right shift is not arithmetic"); + return (e * 315653) >> 20; +} + +// Various fast log computations. +inline int floor_log2_pow10(int e) noexcept { + FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent"); + return (e * 1741647) >> 19; +} +inline int floor_log10_pow2_minus_log10_4_over_3(int e) noexcept { + FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent"); + return (e * 631305 - 261663) >> 21; +} + +static constexpr struct { + uint32_t divisor; + int shift_amount; +} div_small_pow10_infos[] = {{10, 16}, {100, 16}}; + +// Replaces n by floor(n / pow(10, N)) returning true if and only if n is +// divisible by pow(10, N). +// Precondition: n <= pow(10, N + 1). +template +bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept { + // The numbers below are chosen such that: + // 1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100, + // 2. nm mod 2^k < m if and only if n is divisible by d, + // where m is magic_number, k is shift_amount + // and d is divisor. 
+ // + // Item 1 is a common technique of replacing division by a constant with + // multiplication, see e.g. "Division by Invariant Integers Using + // Multiplication" by Granlund and Montgomery (1994). magic_number (m) is set + // to ceil(2^k/d) for large enough k. + // The idea for item 2 originates from Schubfach. + constexpr auto info = div_small_pow10_infos[N - 1]; + FMT_ASSERT(n <= info.divisor * 10, "n is too large"); + constexpr uint32_t magic_number = + (1u << info.shift_amount) / info.divisor + 1; + n *= magic_number; + const uint32_t comparison_mask = (1u << info.shift_amount) - 1; + bool result = (n & comparison_mask) < magic_number; + n >>= info.shift_amount; + return result; +} + +// Computes floor(n / pow(10, N)) for small n and N. +// Precondition: n <= pow(10, N + 1). +template uint32_t small_division_by_pow10(uint32_t n) noexcept { + constexpr auto info = div_small_pow10_infos[N - 1]; + FMT_ASSERT(n <= info.divisor * 10, "n is too large"); + constexpr uint32_t magic_number = + (1u << info.shift_amount) / info.divisor + 1; + return (n * magic_number) >> info.shift_amount; +} + +// Computes floor(n / 10^(kappa + 1)) (float) +inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept { + // 1374389535 = ceil(2^37/100) + return static_cast((static_cast(n) * 1374389535) >> 37); +} +// Computes floor(n / 10^(kappa + 1)) (double) +inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept { + // 2361183241434822607 = ceil(2^(64+7)/1000) + return umul128_upper64(n, 2361183241434822607ull) >> 7; +} + +// Various subroutines using pow10 cache +template struct cache_accessor; + +template <> struct cache_accessor { + using carrier_uint = float_info::carrier_uint; + using cache_entry_type = uint64_t; + + static uint64_t get_cached_power(int k) noexcept { + FMT_ASSERT(k >= float_info::min_k && k <= float_info::max_k, + "k is out of range"); + static constexpr const uint64_t pow10_significands[] = { + 0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 
0xcad2f7f5359a3b3f, + 0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb, + 0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28, + 0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb, + 0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a, + 0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810, + 0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff, + 0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd, + 0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424, + 0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b, + 0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000, + 0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000, + 0xc350000000000000, 0xf424000000000000, 0x9896800000000000, + 0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000, + 0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000, + 0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000, + 0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000, + 0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000, + 0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0, + 0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940985, + 0xa18f07d736b90be6, 0xc9f2c9cd04674edf, 0xfc6f7c4045812297, + 0x9dc5ada82b70b59e, 0xc5371912364ce306, 0xf684df56c3e01bc7, + 0x9a130b963a6c115d, 0xc097ce7bc90715b4, 0xf0bdc21abb48db21, + 0x96769950b50d88f5, 0xbc143fa4e250eb32, 0xeb194f8e1ae525fe, + 0x92efd1b8d0cf37bf, 0xb7abc627050305ae, 0xe596b7b0c643c71a, + 0x8f7e32ce7bea5c70, 0xb35dbf821ae4f38c, 0xe0352f62a19e306f}; + return pow10_significands[k - float_info::min_k]; + } + + struct compute_mul_result { + carrier_uint result; + bool is_integer; + }; + struct compute_mul_parity_result { + bool parity; + bool is_integer; + }; + + static compute_mul_result compute_mul( + carrier_uint u, const cache_entry_type& cache) noexcept { + auto r = umul96_upper64(u, cache); + return {static_cast(r >> 32), + static_cast(r) == 0}; + } + + static 
uint32_t compute_delta(const cache_entry_type& cache, + int beta) noexcept { + return static_cast(cache >> (64 - 1 - beta)); + } + + static compute_mul_parity_result compute_mul_parity( + carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept { + FMT_ASSERT(beta >= 1, ""); + FMT_ASSERT(beta < 64, ""); + + auto r = umul96_lower64(two_f, cache); + return {((r >> (64 - beta)) & 1) != 0, + static_cast(r >> (32 - beta)) == 0}; + } + + static carrier_uint compute_left_endpoint_for_shorter_interval_case( + const cache_entry_type& cache, int beta) noexcept { + return static_cast( + (cache - (cache >> (num_significand_bits() + 2))) >> + (64 - num_significand_bits() - 1 - beta)); + } + + static carrier_uint compute_right_endpoint_for_shorter_interval_case( + const cache_entry_type& cache, int beta) noexcept { + return static_cast( + (cache + (cache >> (num_significand_bits() + 1))) >> + (64 - num_significand_bits() - 1 - beta)); + } + + static carrier_uint compute_round_up_for_shorter_interval_case( + const cache_entry_type& cache, int beta) noexcept { + return (static_cast( + cache >> (64 - num_significand_bits() - 2 - beta)) + + 1) / + 2; + } +}; + +template <> struct cache_accessor { + using carrier_uint = float_info::carrier_uint; + using cache_entry_type = uint128_fallback; + + static uint128_fallback get_cached_power(int k) noexcept { + FMT_ASSERT(k >= float_info::min_k && k <= float_info::max_k, + "k is out of range"); + + static constexpr const uint128_fallback pow10_significands[] = { +#if FMT_USE_FULL_CACHE_DRAGONBOX + {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b}, + {0x9faacf3df73609b1, 0x77b191618c54e9ad}, + {0xc795830d75038c1d, 0xd59df5b9ef6a2418}, + {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e}, + {0x9becce62836ac577, 0x4ee367f9430aec33}, + {0xc2e801fb244576d5, 0x229c41f793cda740}, + {0xf3a20279ed56d48a, 0x6b43527578c11110}, + {0x9845418c345644d6, 0x830a13896b78aaaa}, + {0xbe5691ef416bd60c, 0x23cc986bc656d554}, + {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9}, + 
{0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa}, + {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54}, + {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69}, + {0x91376c36d99995be, 0x23100809b9c21fa2}, + {0xb58547448ffffb2d, 0xabd40a0c2832a78b}, + {0xe2e69915b3fff9f9, 0x16c90c8f323f516d}, + {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4}, + {0xb1442798f49ffb4a, 0x99cd11cfdf41779d}, + {0xdd95317f31c7fa1d, 0x40405643d711d584}, + {0x8a7d3eef7f1cfc52, 0x482835ea666b2573}, + {0xad1c8eab5ee43b66, 0xda3243650005eed0}, + {0xd863b256369d4a40, 0x90bed43e40076a83}, + {0x873e4f75e2224e68, 0x5a7744a6e804a292}, + {0xa90de3535aaae202, 0x711515d0a205cb37}, + {0xd3515c2831559a83, 0x0d5a5b44ca873e04}, + {0x8412d9991ed58091, 0xe858790afe9486c3}, + {0xa5178fff668ae0b6, 0x626e974dbe39a873}, + {0xce5d73ff402d98e3, 0xfb0a3d212dc81290}, + {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a}, + {0xa139029f6a239f72, 0x1c1fffc1ebc44e81}, + {0xc987434744ac874e, 0xa327ffb266b56221}, + {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9}, + {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa}, + {0xc4ce17b399107c22, 0xcb550fb4384d21d4}, + {0xf6019da07f549b2b, 0x7e2a53a146606a49}, + {0x99c102844f94e0fb, 0x2eda7444cbfc426e}, + {0xc0314325637a1939, 0xfa911155fefb5309}, + {0xf03d93eebc589f88, 0x793555ab7eba27cb}, + {0x96267c7535b763b5, 0x4bc1558b2f3458df}, + {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17}, + {0xea9c227723ee8bcb, 0x465e15a979c1cadd}, + {0x92a1958a7675175f, 0x0bfacd89ec191eca}, + {0xb749faed14125d36, 0xcef980ec671f667c}, + {0xe51c79a85916f484, 0x82b7e12780e7401b}, + {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811}, + {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16}, + {0xdfbdcece67006ac9, 0x67a791e093e1d49b}, + {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1}, + {0xaecc49914078536d, 0x58fae9f773886e19}, + {0xda7f5bf590966848, 0xaf39a475506a899f}, + {0x888f99797a5e012d, 0x6d8406c952429604}, + {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84}, + {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65}, + {0x855c3be0a17fcd26, 0x5cf2eea09a550680}, + {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f}, + 
{0xd0601d8efc57b08b, 0xf13b94daf124da27}, + {0x823c12795db6ce57, 0x76c53d08d6b70859}, + {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f}, + {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a}, + {0xfe5d54150b090b02, 0xd3f93b35435d7c4d}, + {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0}, + {0xc6b8e9b0709f109a, 0x359ab6419ca1091c}, + {0xf867241c8cc6d4c0, 0xc30163d203c94b63}, + {0x9b407691d7fc44f8, 0x79e0de63425dcf1e}, + {0xc21094364dfb5636, 0x985915fc12f542e5}, + {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e}, + {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43}, + {0xbd8430bd08277231, 0x50c6ff782a838354}, + {0xece53cec4a314ebd, 0xa4f8bf5635246429}, + {0x940f4613ae5ed136, 0x871b7795e136be9a}, + {0xb913179899f68584, 0x28e2557b59846e40}, + {0xe757dd7ec07426e5, 0x331aeada2fe589d0}, + {0x9096ea6f3848984f, 0x3ff0d2c85def7622}, + {0xb4bca50b065abe63, 0x0fed077a756b53aa}, + {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895}, + {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d}, + {0xb080392cc4349dec, 0xbd8d794d96aacfb4}, + {0xdca04777f541c567, 0xecf0d7a0fc5583a1}, + {0x89e42caaf9491b60, 0xf41686c49db57245}, + {0xac5d37d5b79b6239, 0x311c2875c522ced6}, + {0xd77485cb25823ac7, 0x7d633293366b828c}, + {0x86a8d39ef77164bc, 0xae5dff9c02033198}, + {0xa8530886b54dbdeb, 0xd9f57f830283fdfd}, + {0xd267caa862a12d66, 0xd072df63c324fd7c}, + {0x8380dea93da4bc60, 0x4247cb9e59f71e6e}, + {0xa46116538d0deb78, 0x52d9be85f074e609}, + {0xcd795be870516656, 0x67902e276c921f8c}, + {0x806bd9714632dff6, 0x00ba1cd8a3db53b7}, + {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5}, + {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce}, + {0xfad2a4b13d1b5d6c, 0x796b805720085f82}, + {0x9cc3a6eec6311a63, 0xcbe3303674053bb1}, + {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d}, + {0xf4f1b4d515acb93b, 0xee92fb5515482d45}, + {0x991711052d8bf3c5, 0x751bdd152d4d1c4b}, + {0xbf5cd54678eef0b6, 0xd262d45a78a0635e}, + {0xef340a98172aace4, 0x86fb897116c87c35}, + {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1}, + {0xbae0a846d2195712, 0x8974836059cca10a}, + {0xe998d258869facd7, 0x2bd1a438703fc94c}, + 
{0x91ff83775423cc06, 0x7b6306a34627ddd0}, + {0xb67f6455292cbf08, 0x1a3bc84c17b1d543}, + {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94}, + {0x8e938662882af53e, 0x547eb47b7282ee9d}, + {0xb23867fb2a35b28d, 0xe99e619a4f23aa44}, + {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5}, + {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05}, + {0xae0b158b4738705e, 0x9624ab50b148d446}, + {0xd98ddaee19068c76, 0x3badd624dd9b0958}, + {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7}, + {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d}, + {0xd47487cc8470652b, 0x7647c32000696720}, + {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074}, + {0xa5fb0a17c777cf09, 0xf468107100525891}, + {0xcf79cc9db955c2cc, 0x7182148d4066eeb5}, + {0x81ac1fe293d599bf, 0xc6f14cd848405531}, + {0xa21727db38cb002f, 0xb8ada00e5a506a7d}, + {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d}, + {0xfd442e4688bd304a, 0x908f4a166d1da664}, + {0x9e4a9cec15763e2e, 0x9a598e4e043287ff}, + {0xc5dd44271ad3cdba, 0x40eff1e1853f29fe}, + {0xf7549530e188c128, 0xd12bee59e68ef47d}, + {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf}, + {0xc13a148e3032d6e7, 0xe36a52363c1faf02}, + {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2}, + {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba}, + {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8}, + {0xebdf661791d60f56, 0x111b495b3464ad22}, + {0x936b9fcebb25c995, 0xcab10dd900beec35}, + {0xb84687c269ef3bfb, 0x3d5d514f40eea743}, + {0xe65829b3046b0afa, 0x0cb4a5a3112a5113}, + {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac}, + {0xb3f4e093db73a093, 0x59ed216765690f57}, + {0xe0f218b8d25088b8, 0x306869c13ec3532d}, + {0x8c974f7383725573, 0x1e414218c73a13fc}, + {0xafbd2350644eeacf, 0xe5d1929ef90898fb}, + {0xdbac6c247d62a583, 0xdf45f746b74abf3a}, + {0x894bc396ce5da772, 0x6b8bba8c328eb784}, + {0xab9eb47c81f5114f, 0x066ea92f3f326565}, + {0xd686619ba27255a2, 0xc80a537b0efefebe}, + {0x8613fd0145877585, 0xbd06742ce95f5f37}, + {0xa798fc4196e952e7, 0x2c48113823b73705}, + {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6}, + {0x82ef85133de648c4, 0x9a984d73dbe722fc}, + {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb}, + 
{0xcc963fee10b7d1b3, 0x318df905079926a9}, + {0xffbbcfe994e5c61f, 0xfdf17746497f7053}, + {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634}, + {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1}, + {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1}, + {0x9c1661a651213e2d, 0x06bea10ca65c084f}, + {0xc31bfa0fe5698db8, 0x486e494fcff30a63}, + {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb}, + {0x986ddb5c6b3a76b7, 0xf89629465a75e01d}, + {0xbe89523386091465, 0xf6bbb397f1135824}, + {0xee2ba6c0678b597f, 0x746aa07ded582e2d}, + {0x94db483840b717ef, 0xa8c2a44eb4571cdd}, + {0xba121a4650e4ddeb, 0x92f34d62616ce414}, + {0xe896a0d7e51e1566, 0x77b020baf9c81d18}, + {0x915e2486ef32cd60, 0x0ace1474dc1d122f}, + {0xb5b5ada8aaff80b8, 0x0d819992132456bb}, + {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a}, + {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2}, + {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3}, + {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf}, + {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c}, + {0xad4ab7112eb3929d, 0x86c16c98d2c953c7}, + {0xd89d64d57a607744, 0xe871c7bf077ba8b8}, + {0x87625f056c7c4a8b, 0x11471cd764ad4973}, + {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0}, + {0xd389b47879823479, 0x4aff1d108d4ec2c4}, + {0x843610cb4bf160cb, 0xcedf722a585139bb}, + {0xa54394fe1eedb8fe, 0xc2974eb4ee658829}, + {0xce947a3da6a9273e, 0x733d226229feea33}, + {0x811ccc668829b887, 0x0806357d5a3f5260}, + {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8}, + {0xc9bcff6034c13052, 0xfc89b393dd02f0b6}, + {0xfc2c3f3841f17c67, 0xbbac2078d443ace3}, + {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e}, + {0xc5029163f384a931, 0x0a9e795e65d4df12}, + {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6}, + {0x99ea0196163fa42e, 0x504bced1bf8e4e46}, + {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7}, + {0xf07da27a82c37088, 0x5d767327bb4e5a4d}, + {0x964e858c91ba2655, 0x3a6a07f8d510f870}, + {0xbbe226efb628afea, 0x890489f70a55368c}, + {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f}, + {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e}, + {0xb77ada0617e3bbcb, 0x09ce6ebb40173745}, + {0xe55990879ddcaabd, 0xcc420a6a101d0516}, + 
{0x8f57fa54c2a9eab6, 0x9fa946824a12232e}, + {0xb32df8e9f3546564, 0x47939822dc96abfa}, + {0xdff9772470297ebd, 0x59787e2b93bc56f8}, + {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b}, + {0xaefae51477a06b03, 0xede622920b6b23f2}, + {0xdab99e59958885c4, 0xe95fab368e45ecee}, + {0x88b402f7fd75539b, 0x11dbcb0218ebb415}, + {0xaae103b5fcd2a881, 0xd652bdc29f26a11a}, + {0xd59944a37c0752a2, 0x4be76d3346f04960}, + {0x857fcae62d8493a5, 0x6f70a4400c562ddc}, + {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953}, + {0xd097ad07a71f26b2, 0x7e2000a41346a7a8}, + {0x825ecc24c873782f, 0x8ed400668c0c28c9}, + {0xa2f67f2dfa90563b, 0x728900802f0f32fb}, + {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba}, + {0xfea126b7d78186bc, 0xe2f610c84987bfa9}, + {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca}, + {0xc6ede63fa05d3143, 0x91503d1c79720dbc}, + {0xf8a95fcf88747d94, 0x75a44c6397ce912b}, + {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb}, + {0xc24452da229b021b, 0xfbe85badce996169}, + {0xf2d56790ab41c2a2, 0xfae27299423fb9c4}, + {0x97c560ba6b0919a5, 0xdccd879fc967d41b}, + {0xbdb6b8e905cb600f, 0x5400e987bbc1c921}, + {0xed246723473e3813, 0x290123e9aab23b69}, + {0x9436c0760c86e30b, 0xf9a0b6720aaf6522}, + {0xb94470938fa89bce, 0xf808e40e8d5b3e6a}, + {0xe7958cb87392c2c2, 0xb60b1d1230b20e05}, + {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3}, + {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4}, + {0xe2280b6c20dd5232, 0x25c6da63c38de1b1}, + {0x8d590723948a535f, 0x579c487e5a38ad0f}, + {0xb0af48ec79ace837, 0x2d835a9df0c6d852}, + {0xdcdb1b2798182244, 0xf8e431456cf88e66}, + {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900}, + {0xac8b2d36eed2dac5, 0xe272467e3d222f40}, + {0xd7adf884aa879177, 0x5b0ed81dcc6abb10}, + {0x86ccbb52ea94baea, 0x98e947129fc2b4ea}, + {0xa87fea27a539e9a5, 0x3f2398d747b36225}, + {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae}, + {0x83a3eeeef9153e89, 0x1953cf68300424ad}, + {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8}, + {0xcdb02555653131b6, 0x3792f412cb06794e}, + {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1}, + {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5}, + 
{0xc8de047564d20a8b, 0xf245825a5a445276}, + {0xfb158592be068d2e, 0xeed6e2f0f0d56713}, + {0x9ced737bb6c4183d, 0x55464dd69685606c}, + {0xc428d05aa4751e4c, 0xaa97e14c3c26b887}, + {0xf53304714d9265df, 0xd53dd99f4b3066a9}, + {0x993fe2c6d07b7fab, 0xe546a8038efe402a}, + {0xbf8fdb78849a5f96, 0xde98520472bdd034}, + {0xef73d256a5c0f77c, 0x963e66858f6d4441}, + {0x95a8637627989aad, 0xdde7001379a44aa9}, + {0xbb127c53b17ec159, 0x5560c018580d5d53}, + {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7}, + {0x9226712162ab070d, 0xcab3961304ca70e9}, + {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23}, + {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b}, + {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243}, + {0xb267ed1940f1c61c, 0x55f038b237591ed4}, + {0xdf01e85f912e37a3, 0x6b6c46dec52f6689}, + {0x8b61313bbabce2c6, 0x2323ac4b3b3da016}, + {0xae397d8aa96c1b77, 0xabec975e0a0d081b}, + {0xd9c7dced53c72255, 0x96e7bd358c904a22}, + {0x881cea14545c7575, 0x7e50d64177da2e55}, + {0xaa242499697392d2, 0xdde50bd1d5d0b9ea}, + {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865}, + {0x84ec3c97da624ab4, 0xbd5af13bef0b113f}, + {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f}, + {0xcfb11ead453994ba, 0x67de18eda5814af3}, + {0x81ceb32c4b43fcf4, 0x80eacf948770ced8}, + {0xa2425ff75e14fc31, 0xa1258379a94d028e}, + {0xcad2f7f5359a3b3e, 0x096ee45813a04331}, + {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd}, + {0x9e74d1b791e07e48, 0x775ea264cf55347e}, + {0xc612062576589dda, 0x95364afe032a819e}, + {0xf79687aed3eec551, 0x3a83ddbd83f52205}, + {0x9abe14cd44753b52, 0xc4926a9672793543}, + {0xc16d9a0095928a27, 0x75b7053c0f178294}, + {0xf1c90080baf72cb1, 0x5324c68b12dd6339}, + {0x971da05074da7bee, 0xd3f6fc16ebca5e04}, + {0xbce5086492111aea, 0x88f4bb1ca6bcf585}, + {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6}, + {0x9392ee8e921d5d07, 0x3aff322e62439fd0}, + {0xb877aa3236a4b449, 0x09befeb9fad487c3}, + {0xe69594bec44de15b, 0x4c2ebe687989a9b4}, + {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11}, + {0xb424dc35095cd80f, 0x538484c19ef38c95}, + {0xe12e13424bb40e13, 0x2865a5f206b06fba}, + 
{0x8cbccc096f5088cb, 0xf93f87b7442e45d4}, + {0xafebff0bcb24aafe, 0xf78f69a51539d749}, + {0xdbe6fecebdedd5be, 0xb573440e5a884d1c}, + {0x89705f4136b4a597, 0x31680a88f8953031}, + {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e}, + {0xd6bf94d5e57a42bc, 0x3d32907604691b4d}, + {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110}, + {0xa7c5ac471b478423, 0x0fcf80dc33721d54}, + {0xd1b71758e219652b, 0xd3c36113404ea4a9}, + {0x83126e978d4fdf3b, 0x645a1cac083126ea}, + {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4}, + {0xcccccccccccccccc, 0xcccccccccccccccd}, + {0x8000000000000000, 0x0000000000000000}, + {0xa000000000000000, 0x0000000000000000}, + {0xc800000000000000, 0x0000000000000000}, + {0xfa00000000000000, 0x0000000000000000}, + {0x9c40000000000000, 0x0000000000000000}, + {0xc350000000000000, 0x0000000000000000}, + {0xf424000000000000, 0x0000000000000000}, + {0x9896800000000000, 0x0000000000000000}, + {0xbebc200000000000, 0x0000000000000000}, + {0xee6b280000000000, 0x0000000000000000}, + {0x9502f90000000000, 0x0000000000000000}, + {0xba43b74000000000, 0x0000000000000000}, + {0xe8d4a51000000000, 0x0000000000000000}, + {0x9184e72a00000000, 0x0000000000000000}, + {0xb5e620f480000000, 0x0000000000000000}, + {0xe35fa931a0000000, 0x0000000000000000}, + {0x8e1bc9bf04000000, 0x0000000000000000}, + {0xb1a2bc2ec5000000, 0x0000000000000000}, + {0xde0b6b3a76400000, 0x0000000000000000}, + {0x8ac7230489e80000, 0x0000000000000000}, + {0xad78ebc5ac620000, 0x0000000000000000}, + {0xd8d726b7177a8000, 0x0000000000000000}, + {0x878678326eac9000, 0x0000000000000000}, + {0xa968163f0a57b400, 0x0000000000000000}, + {0xd3c21bcecceda100, 0x0000000000000000}, + {0x84595161401484a0, 0x0000000000000000}, + {0xa56fa5b99019a5c8, 0x0000000000000000}, + {0xcecb8f27f4200f3a, 0x0000000000000000}, + {0x813f3978f8940984, 0x4000000000000000}, + {0xa18f07d736b90be5, 0x5000000000000000}, + {0xc9f2c9cd04674ede, 0xa400000000000000}, + {0xfc6f7c4045812296, 0x4d00000000000000}, + {0x9dc5ada82b70b59d, 0xf020000000000000}, + 
{0xc5371912364ce305, 0x6c28000000000000}, + {0xf684df56c3e01bc6, 0xc732000000000000}, + {0x9a130b963a6c115c, 0x3c7f400000000000}, + {0xc097ce7bc90715b3, 0x4b9f100000000000}, + {0xf0bdc21abb48db20, 0x1e86d40000000000}, + {0x96769950b50d88f4, 0x1314448000000000}, + {0xbc143fa4e250eb31, 0x17d955a000000000}, + {0xeb194f8e1ae525fd, 0x5dcfab0800000000}, + {0x92efd1b8d0cf37be, 0x5aa1cae500000000}, + {0xb7abc627050305ad, 0xf14a3d9e40000000}, + {0xe596b7b0c643c719, 0x6d9ccd05d0000000}, + {0x8f7e32ce7bea5c6f, 0xe4820023a2000000}, + {0xb35dbf821ae4f38b, 0xdda2802c8a800000}, + {0xe0352f62a19e306e, 0xd50b2037ad200000}, + {0x8c213d9da502de45, 0x4526f422cc340000}, + {0xaf298d050e4395d6, 0x9670b12b7f410000}, + {0xdaf3f04651d47b4c, 0x3c0cdd765f114000}, + {0x88d8762bf324cd0f, 0xa5880a69fb6ac800}, + {0xab0e93b6efee0053, 0x8eea0d047a457a00}, + {0xd5d238a4abe98068, 0x72a4904598d6d880}, + {0x85a36366eb71f041, 0x47a6da2b7f864750}, + {0xa70c3c40a64e6c51, 0x999090b65f67d924}, + {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d}, + {0x82818f1281ed449f, 0xbff8f10e7a8921a5}, + {0xa321f2d7226895c7, 0xaff72d52192b6a0e}, + {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764491}, + {0xfee50b7025c36a08, 0x02f236d04753d5b5}, + {0x9f4f2726179a2245, 0x01d762422c946591}, + {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef6}, + {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb3}, + {0x9b934c3b330c8577, 0x63cc55f49f88eb30}, + {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fc}, + {0xf316271c7fc3908a, 0x8bef464e3945ef7b}, + {0x97edd871cfda3a56, 0x97758bf0e3cbb5ad}, + {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea318}, + {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bde}, + {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6b}, + {0xb975d6b6ee39e436, 0xb3e2fd538e122b45}, + {0xe7d34c64a9c85d44, 0x60dbbca87196b617}, + {0x90e40fbeea1d3a4a, 0xbc8955e946fe31ce}, + {0xb51d13aea4a488dd, 0x6babab6398bdbe42}, + {0xe264589a4dcdab14, 0xc696963c7eed2dd2}, + {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca3}, + {0xb0de65388cc8ada8, 0x3b25a55f43294bcc}, + {0xdd15fe86affad912, 0x49ef0eb713f39ebf}, + 
{0x8a2dbf142dfcc7ab, 0x6e3569326c784338}, + {0xacb92ed9397bf996, 0x49c2c37f07965405}, + {0xd7e77a8f87daf7fb, 0xdc33745ec97be907}, + {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a4}, + {0xa8acd7c0222311bc, 0xc40832ea0d68ce0d}, + {0xd2d80db02aabd62b, 0xf50a3fa490c30191}, + {0x83c7088e1aab65db, 0x792667c6da79e0fb}, + {0xa4b8cab1a1563f52, 0x577001b891185939}, + {0xcde6fd5e09abcf26, 0xed4c0226b55e6f87}, + {0x80b05e5ac60b6178, 0x544f8158315b05b5}, + {0xa0dc75f1778e39d6, 0x696361ae3db1c722}, + {0xc913936dd571c84c, 0x03bc3a19cd1e38ea}, + {0xfb5878494ace3a5f, 0x04ab48a04065c724}, + {0x9d174b2dcec0e47b, 0x62eb0d64283f9c77}, + {0xc45d1df942711d9a, 0x3ba5d0bd324f8395}, + {0xf5746577930d6500, 0xca8f44ec7ee3647a}, + {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecc}, + {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67f}, + {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101f}, + {0x95d04aee3b80ece5, 0xbba1f1d158724a13}, + {0xbb445da9ca61281f, 0x2a8a6e45ae8edc98}, + {0xea1575143cf97226, 0xf52d09d71a3293be}, + {0x924d692ca61be758, 0x593c2626705f9c57}, + {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836d}, + {0xe498f455c38b997a, 0x0b6dfb9c0f956448}, + {0x8edf98b59a373fec, 0x4724bd4189bd5ead}, + {0xb2977ee300c50fe7, 0x58edec91ec2cb658}, + {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ee}, + {0x8b865b215899f46c, 0xbd79e0d20082ee75}, + {0xae67f1e9aec07187, 0xecd8590680a3aa12}, + {0xda01ee641a708de9, 0xe80e6f4820cc9496}, + {0x884134fe908658b2, 0x3109058d147fdcde}, + {0xaa51823e34a7eede, 0xbd4b46f0599fd416}, + {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91b}, + {0x850fadc09923329e, 0x03e2cf6bc604ddb1}, + {0xa6539930bf6bff45, 0x84db8346b786151d}, + {0xcfe87f7cef46ff16, 0xe612641865679a64}, + {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07f}, + {0xa26da3999aef7749, 0xe3be5e330f38f09e}, + {0xcb090c8001ab551c, 0x5cadf5bfd3072cc6}, + {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f7}, + {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afb}, + {0xc646d63501a1511d, 0xb281e1fd541501b9}, + {0xf7d88bc24209a565, 0x1f225a7ca91a4227}, + {0x9ae757596946075f, 0x3375788de9b06959}, + 
{0xc1a12d2fc3978937, 0x0052d6b1641c83af}, + {0xf209787bb47d6b84, 0xc0678c5dbd23a49b}, + {0x9745eb4d50ce6332, 0xf840b7ba963646e1}, + {0xbd176620a501fbff, 0xb650e5a93bc3d899}, + {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebf}, + {0x93ba47c980e98cdf, 0xc66f336c36b10138}, + {0xb8a8d9bbe123f017, 0xb80b0047445d4185}, + {0xe6d3102ad96cec1d, 0xa60dc059157491e6}, + {0x9043ea1ac7e41392, 0x87c89837ad68db30}, + {0xb454e4a179dd1877, 0x29babe4598c311fc}, + {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67b}, + {0x8ce2529e2734bb1d, 0x1899e4a65f58660d}, + {0xb01ae745b101e9e4, 0x5ec05dcff72e7f90}, + {0xdc21a1171d42645d, 0x76707543f4fa1f74}, + {0x899504ae72497eba, 0x6a06494a791c53a9}, + {0xabfa45da0edbde69, 0x0487db9d17636893}, + {0xd6f8d7509292d603, 0x45a9d2845d3c42b7}, + {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b3}, + {0xa7f26836f282b732, 0x8e6cac7768d7141f}, + {0xd1ef0244af2364ff, 0x3207d795430cd927}, + {0x8335616aed761f1f, 0x7f44e6bd49e807b9}, + {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a7}, + {0xcd036837130890a1, 0x36dba887c37a8c10}, + {0x802221226be55a64, 0xc2494954da2c978a}, + {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6d}, + {0xc83553c5c8965d3d, 0x6f92829494e5acc8}, + {0xfa42a8b73abbf48c, 0xcb772339ba1f17fa}, + {0x9c69a97284b578d7, 0xff2a760414536efc}, + {0xc38413cf25e2d70d, 0xfef5138519684abb}, + {0xf46518c2ef5b8cd1, 0x7eb258665fc25d6a}, + {0x98bf2f79d5993802, 0xef2f773ffbd97a62}, + {0xbeeefb584aff8603, 0xaafb550ffacfd8fb}, + {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf39}, + {0x952ab45cfa97a0b2, 0xdd945a747bf26184}, + {0xba756174393d88df, 0x94f971119aeef9e5}, + {0xe912b9d1478ceb17, 0x7a37cd5601aab85e}, + {0x91abb422ccb812ee, 0xac62e055c10ab33b}, + {0xb616a12b7fe617aa, 0x577b986b314d600a}, + {0xe39c49765fdf9d94, 0xed5a7e85fda0b80c}, + {0x8e41ade9fbebc27d, 0x14588f13be847308}, + {0xb1d219647ae6b31c, 0x596eb2d8ae258fc9}, + {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bc}, + {0x8aec23d680043bee, 0x25de7bb9480d5855}, + {0xada72ccc20054ae9, 0xaf561aa79a10ae6b}, + {0xd910f7ff28069da4, 0x1b2ba1518094da05}, + 
{0x87aa9aff79042286, 0x90fb44d2f05d0843}, + {0xa99541bf57452b28, 0x353a1607ac744a54}, + {0xd3fa922f2d1675f2, 0x42889b8997915ce9}, + {0x847c9b5d7c2e09b7, 0x69956135febada12}, + {0xa59bc234db398c25, 0x43fab9837e699096}, + {0xcf02b2c21207ef2e, 0x94f967e45e03f4bc}, + {0x8161afb94b44f57d, 0x1d1be0eebac278f6}, + {0xa1ba1ba79e1632dc, 0x6462d92a69731733}, + {0xca28a291859bbf93, 0x7d7b8f7503cfdcff}, + {0xfcb2cb35e702af78, 0x5cda735244c3d43f}, + {0x9defbf01b061adab, 0x3a0888136afa64a8}, + {0xc56baec21c7a1916, 0x088aaa1845b8fdd1}, + {0xf6c69a72a3989f5b, 0x8aad549e57273d46}, + {0x9a3c2087a63f6399, 0x36ac54e2f678864c}, + {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7de}, + {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d6}, + {0x969eb7c47859e743, 0x9f644ae5a4b1b326}, + {0xbc4665b596706114, 0x873d5d9f0dde1fef}, + {0xeb57ff22fc0c7959, 0xa90cb506d155a7eb}, + {0x9316ff75dd87cbd8, 0x09a7f12442d588f3}, + {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb30}, + {0xe5d3ef282a242e81, 0x8f1668c8a86da5fb}, + {0x8fa475791a569d10, 0xf96e017d694487bd}, + {0xb38d92d760ec4455, 0x37c981dcc395a9ad}, + {0xe070f78d3927556a, 0x85bbe253f47b1418}, + {0x8c469ab843b89562, 0x93956d7478ccec8f}, + {0xaf58416654a6babb, 0x387ac8d1970027b3}, + {0xdb2e51bfe9d0696a, 0x06997b05fcc0319f}, + {0x88fcf317f22241e2, 0x441fece3bdf81f04}, + {0xab3c2fddeeaad25a, 0xd527e81cad7626c4}, + {0xd60b3bd56a5586f1, 0x8a71e223d8d3b075}, + {0x85c7056562757456, 0xf6872d5667844e4a}, + {0xa738c6bebb12d16c, 0xb428f8ac016561dc}, + {0xd106f86e69d785c7, 0xe13336d701beba53}, + {0x82a45b450226b39c, 0xecc0024661173474}, + {0xa34d721642b06084, 0x27f002d7f95d0191}, + {0xcc20ce9bd35c78a5, 0x31ec038df7b441f5}, + {0xff290242c83396ce, 0x7e67047175a15272}, + {0x9f79a169bd203e41, 0x0f0062c6e984d387}, + {0xc75809c42c684dd1, 0x52c07b78a3e60869}, + {0xf92e0c3537826145, 0xa7709a56ccdf8a83}, + {0x9bbcc7a142b17ccb, 0x88a66076400bb692}, + {0xc2abf989935ddbfe, 0x6acff893d00ea436}, + {0xf356f7ebf83552fe, 0x0583f6b8c4124d44}, + {0x98165af37b2153de, 0xc3727a337a8b704b}, + 
{0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5d}, + {0xeda2ee1c7064130c, 0x1162def06f79df74}, + {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba9}, + {0xb9a74a0637ce2ee1, 0x6d953e2bd7173693}, + {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0438}, + {0x910ab1d4db9914a0, 0x1d9c9892400a22a3}, + {0xb54d5e4a127f59c8, 0x2503beb6d00cab4c}, + {0xe2a0b5dc971f303a, 0x2e44ae64840fd61e}, + {0x8da471a9de737e24, 0x5ceaecfed289e5d3}, + {0xb10d8e1456105dad, 0x7425a83e872c5f48}, + {0xdd50f1996b947518, 0xd12f124e28f7771a}, + {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa70}, + {0xace73cbfdc0bfb7b, 0x636cc64d1001550c}, + {0xd8210befd30efa5a, 0x3c47f7e05401aa4f}, + {0x8714a775e3e95c78, 0x65acfaec34810a72}, + {0xa8d9d1535ce3b396, 0x7f1839a741a14d0e}, + {0xd31045a8341ca07c, 0x1ede48111209a051}, + {0x83ea2b892091e44d, 0x934aed0aab460433}, + {0xa4e4b66b68b65d60, 0xf81da84d56178540}, + {0xce1de40642e3f4b9, 0x36251260ab9d668f}, + {0x80d2ae83e9ce78f3, 0xc1d72b7c6b42601a}, + {0xa1075a24e4421730, 0xb24cf65b8612f820}, + {0xc94930ae1d529cfc, 0xdee033f26797b628}, + {0xfb9b7cd9a4a7443c, 0x169840ef017da3b2}, + {0x9d412e0806e88aa5, 0x8e1f289560ee864f}, + {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e3}, + {0xf5b5d7ec8acb58a2, 0xae10af696774b1dc}, + {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef2a}, + {0xbff610b0cc6edd3f, 0x17fd090a58d32af4}, + {0xeff394dcff8a948e, 0xddfc4b4cef07f5b1}, + {0x95f83d0a1fb69cd9, 0x4abdaf101564f98f}, + {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f2}, + {0xea53df5fd18d5513, 0x84c86189216dc5ee}, + {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb5}, + {0xb7118682dbb66a77, 0x3fbc8c33221dc2a2}, + {0xe4d5e82392a40515, 0x0fabaf3feaa5334b}, + {0x8f05b1163ba6832d, 0x29cb4d87f2a7400f}, + {0xb2c71d5bca9023f8, 0x743e20e9ef511013}, + {0xdf78e4b2bd342cf6, 0x914da9246b255417}, + {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548f}, + {0xae9672aba3d0c320, 0xa184ac2473b529b2}, + {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741f}, + {0x8865899617fb1871, 0x7e2fa67c7a658893}, + {0xaa7eebfb9df9de8d, 0xddbb901b98feeab8}, + {0xd51ea6fa85785631, 0x552a74227f3ea566}, + 
{0x8533285c936b35de, 0xd53a88958f872760}, + {0xa67ff273b8460356, 0x8a892abaf368f138}, + {0xd01fef10a657842c, 0x2d2b7569b0432d86}, + {0x8213f56a67f6b29b, 0x9c3b29620e29fc74}, + {0xa298f2c501f45f42, 0x8349f3ba91b47b90}, + {0xcb3f2f7642717713, 0x241c70a936219a74}, + {0xfe0efb53d30dd4d7, 0xed238cd383aa0111}, + {0x9ec95d1463e8a506, 0xf4363804324a40ab}, + {0xc67bb4597ce2ce48, 0xb143c6053edcd0d6}, + {0xf81aa16fdc1b81da, 0xdd94b7868e94050b}, + {0x9b10a4e5e9913128, 0xca7cf2b4191c8327}, + {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f1}, + {0xf24a01a73cf2dccf, 0xbc633b39673c8ced}, + {0x976e41088617ca01, 0xd5be0503e085d814}, + {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e19}, + {0xec9c459d51852ba2, 0xddf8e7d60ed1219f}, + {0x93e1ab8252f33b45, 0xcabb90e5c942b504}, + {0xb8da1662e7b00a17, 0x3d6a751f3b936244}, + {0xe7109bfba19c0c9d, 0x0cc512670a783ad5}, + {0x906a617d450187e2, 0x27fb2b80668b24c6}, + {0xb484f9dc9641e9da, 0xb1f9f660802dedf7}, + {0xe1a63853bbd26451, 0x5e7873f8a0396974}, + {0x8d07e33455637eb2, 0xdb0b487b6423e1e9}, + {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda63}, + {0xdc5c5301c56b75f7, 0x7641a140cc7810fc}, + {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9e}, + {0xac2820d9623bf429, 0x546345fa9fbdcd45}, + {0xd732290fbacaf133, 0xa97c177947ad4096}, + {0x867f59a9d4bed6c0, 0x49ed8eabcccc485e}, + {0xa81f301449ee8c70, 0x5c68f256bfff5a75}, + {0xd226fc195c6a2f8c, 0x73832eec6fff3112}, + {0x83585d8fd9c25db7, 0xc831fd53c5ff7eac}, + {0xa42e74f3d032f525, 0xba3e7ca8b77f5e56}, + {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35ec}, + {0x80444b5e7aa7cf85, 0x7980d163cf5b81b4}, + {0xa0555e361951c366, 0xd7e105bcc3326220}, + {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa8}, + {0xfa856334878fc150, 0xb14f98f6f0feb952}, + {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d4}, + {0xc3b8358109e84f07, 0x0a862f80ec4700c9}, + {0xf4a642e14c6262c8, 0xcd27bb612758c0fb}, + {0x98e7e9cccfbd7dbd, 0x8038d51cb897789d}, + {0xbf21e44003acdd2c, 0xe0470a63e6bd56c4}, + {0xeeea5d5004981478, 0x1858ccfce06cac75}, + {0x95527a5202df0ccb, 0x0f37801e0c43ebc9}, + 
{0xbaa718e68396cffd, 0xd30560258f54e6bb}, + {0xe950df20247c83fd, 0x47c6b82ef32a206a}, + {0x91d28b7416cdd27e, 0x4cdc331d57fa5442}, + {0xb6472e511c81471d, 0xe0133fe4adf8e953}, + {0xe3d8f9e563a198e5, 0x58180fddd97723a7}, + {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7649}, + {0xb201833b35d63f73, 0x2cd2cc6551e513db}, + {0xde81e40a034bcf4f, 0xf8077f7ea65e58d2}, + {0x8b112e86420f6191, 0xfb04afaf27faf783}, + {0xadd57a27d29339f6, 0x79c5db9af1f9b564}, + {0xd94ad8b1c7380874, 0x18375281ae7822bd}, + {0x87cec76f1c830548, 0x8f2293910d0b15b6}, + {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb23}, + {0xd433179d9c8cb841, 0x5fa60692a46151ec}, + {0x849feec281d7f328, 0xdbc7c41ba6bcd334}, + {0xa5c7ea73224deff3, 0x12b9b522906c0801}, + {0xcf39e50feae16bef, 0xd768226b34870a01}, + {0x81842f29f2cce375, 0xe6a1158300d46641}, + {0xa1e53af46f801c53, 0x60495ae3c1097fd1}, + {0xca5e89b18b602368, 0x385bb19cb14bdfc5}, + {0xfcf62c1dee382c42, 0x46729e03dd9ed7b6}, + {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d2}, + {0xc5a05277621be293, 0xc7098b7305241886}, + { 0xf70867153aa2db38, + 0xb8cbee4fc66d1ea8 } +#else + {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b}, + {0xce5d73ff402d98e3, 0xfb0a3d212dc81290}, + {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f}, + {0x86a8d39ef77164bc, 0xae5dff9c02033198}, + {0xd98ddaee19068c76, 0x3badd624dd9b0958}, + {0xafbd2350644eeacf, 0xe5d1929ef90898fb}, + {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2}, + {0xe55990879ddcaabd, 0xcc420a6a101d0516}, + {0xb94470938fa89bce, 0xf808e40e8d5b3e6a}, + {0x95a8637627989aad, 0xdde7001379a44aa9}, + {0xf1c90080baf72cb1, 0x5324c68b12dd6339}, + {0xc350000000000000, 0x0000000000000000}, + {0x9dc5ada82b70b59d, 0xf020000000000000}, + {0xfee50b7025c36a08, 0x02f236d04753d5b5}, + {0xcde6fd5e09abcf26, 0xed4c0226b55e6f87}, + {0xa6539930bf6bff45, 0x84db8346b786151d}, + {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b3}, + {0xd910f7ff28069da4, 0x1b2ba1518094da05}, + {0xaf58416654a6babb, 0x387ac8d1970027b3}, + {0x8da471a9de737e24, 0x5ceaecfed289e5d3}, + {0xe4d5e82392a40515, 0x0fabaf3feaa5334b}, + 
{0xb8da1662e7b00a17, 0x3d6a751f3b936244}, + { 0x95527a5202df0ccb, + 0x0f37801e0c43ebc9 } +#endif + }; + +#if FMT_USE_FULL_CACHE_DRAGONBOX + return pow10_significands[k - float_info::min_k]; +#else + static constexpr const uint64_t powers_of_5_64[] = { + 0x0000000000000001, 0x0000000000000005, 0x0000000000000019, + 0x000000000000007d, 0x0000000000000271, 0x0000000000000c35, + 0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1, + 0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd, + 0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9, + 0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5, + 0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631, + 0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed, + 0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9}; + + static const int compression_ratio = 27; + + // Compute base index. + int cache_index = (k - float_info::min_k) / compression_ratio; + int kb = cache_index * compression_ratio + float_info::min_k; + int offset = k - kb; + + // Get base cache. + uint128_fallback base_cache = pow10_significands[cache_index]; + if (offset == 0) return base_cache; + + // Compute the required amount of bit-shift. + int alpha = floor_log2_pow10(kb + offset) - floor_log2_pow10(kb) - offset; + FMT_ASSERT(alpha > 0 && alpha < 64, "shifting error detected"); + + // Try to recover the real cache. 
+ uint64_t pow5 = powers_of_5_64[offset]; + uint128_fallback recovered_cache = umul128(base_cache.high(), pow5); + uint128_fallback middle_low = umul128(base_cache.low(), pow5); + + recovered_cache += middle_low.high(); + + uint64_t high_to_middle = recovered_cache.high() << (64 - alpha); + uint64_t middle_to_low = recovered_cache.low() << (64 - alpha); + + recovered_cache = + uint128_fallback{(recovered_cache.low() >> alpha) | high_to_middle, + ((middle_low.low() >> alpha) | middle_to_low)}; + FMT_ASSERT(recovered_cache.low() + 1 != 0, ""); + return {recovered_cache.high(), recovered_cache.low() + 1}; +#endif + } + + struct compute_mul_result { + carrier_uint result; + bool is_integer; + }; + struct compute_mul_parity_result { + bool parity; + bool is_integer; + }; + + static compute_mul_result compute_mul( + carrier_uint u, const cache_entry_type& cache) noexcept { + auto r = umul192_upper128(u, cache); + return {r.high(), r.low() == 0}; + } + + static uint32_t compute_delta(cache_entry_type const& cache, + int beta) noexcept { + return static_cast(cache.high() >> (64 - 1 - beta)); + } + + static compute_mul_parity_result compute_mul_parity( + carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept { + FMT_ASSERT(beta >= 1, ""); + FMT_ASSERT(beta < 64, ""); + + auto r = umul192_lower128(two_f, cache); + return {((r.high() >> (64 - beta)) & 1) != 0, + ((r.high() << beta) | (r.low() >> (64 - beta))) == 0}; + } + + static carrier_uint compute_left_endpoint_for_shorter_interval_case( + const cache_entry_type& cache, int beta) noexcept { + return (cache.high() - + (cache.high() >> (num_significand_bits() + 2))) >> + (64 - num_significand_bits() - 1 - beta); + } + + static carrier_uint compute_right_endpoint_for_shorter_interval_case( + const cache_entry_type& cache, int beta) noexcept { + return (cache.high() + + (cache.high() >> (num_significand_bits() + 1))) >> + (64 - num_significand_bits() - 1 - beta); + } + + static carrier_uint 
compute_round_up_for_shorter_interval_case( + const cache_entry_type& cache, int beta) noexcept { + return ((cache.high() >> (64 - num_significand_bits() - 2 - beta)) + + 1) / + 2; + } +}; + +// Various integer checks +template +bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept { + const int case_shorter_interval_left_endpoint_lower_threshold = 2; + const int case_shorter_interval_left_endpoint_upper_threshold = 3; + return exponent >= case_shorter_interval_left_endpoint_lower_threshold && + exponent <= case_shorter_interval_left_endpoint_upper_threshold; +} + +// Remove trailing zeros from n and return the number of zeros removed (float) +FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept { + FMT_ASSERT(n != 0, ""); + const uint32_t mod_inv_5 = 0xcccccccd; + const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5; + + int s = 0; + while (true) { + auto q = rotr(n * mod_inv_25, 2); + if (q > max_value() / 100) break; + n = q; + s += 2; + } + auto q = rotr(n * mod_inv_5, 1); + if (q <= max_value() / 10) { + n = q; + s |= 1; + } + + return s; +} + +// Removes trailing zeros and returns the number of zeros removed (double) +FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept { + FMT_ASSERT(n != 0, ""); + + // This magic number is ceil(2^90 / 10^8). + constexpr uint64_t magic_number = 12379400392853802749ull; + auto nm = umul128(n, magic_number); + + // Is n is divisible by 10^8? + if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) { + // If yes, work with the quotient. 
+ auto n32 = static_cast(nm.high() >> (90 - 64)); + + const uint32_t mod_inv_5 = 0xcccccccd; + const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5; + + int s = 8; + while (true) { + auto q = rotr(n32 * mod_inv_25, 2); + if (q > max_value() / 100) break; + n32 = q; + s += 2; + } + auto q = rotr(n32 * mod_inv_5, 1); + if (q <= max_value() / 10) { + n32 = q; + s |= 1; + } + + n = n32; + return s; + } + + // If n is not divisible by 10^8, work with n itself. + const uint64_t mod_inv_5 = 0xcccccccccccccccd; + const uint64_t mod_inv_25 = mod_inv_5 * mod_inv_5; + + int s = 0; + while (true) { + auto q = rotr(n * mod_inv_25, 2); + if (q > max_value() / 100) break; + n = q; + s += 2; + } + auto q = rotr(n * mod_inv_5, 1); + if (q <= max_value() / 10) { + n = q; + s |= 1; + } + + return s; +} + +// The main algorithm for shorter interval case +template +FMT_INLINE decimal_fp shorter_interval_case(int exponent) noexcept { + decimal_fp ret_value; + // Compute k and beta + const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent); + const int beta = exponent + floor_log2_pow10(-minus_k); + + // Compute xi and zi + using cache_entry_type = typename cache_accessor::cache_entry_type; + const cache_entry_type cache = cache_accessor::get_cached_power(-minus_k); + + auto xi = cache_accessor::compute_left_endpoint_for_shorter_interval_case( + cache, beta); + auto zi = cache_accessor::compute_right_endpoint_for_shorter_interval_case( + cache, beta); + + // If the left endpoint is not an integer, increase it + if (!is_left_endpoint_integer_shorter_interval(exponent)) ++xi; + + // Try bigger divisor + ret_value.significand = zi / 10; + + // If succeed, remove trailing zeros if necessary and return + if (ret_value.significand * 10 >= xi) { + ret_value.exponent = minus_k + 1; + ret_value.exponent += remove_trailing_zeros(ret_value.significand); + return ret_value; + } + + // Otherwise, compute the round-up of y + ret_value.significand = + 
cache_accessor::compute_round_up_for_shorter_interval_case(cache, + beta); + ret_value.exponent = minus_k; + + // When tie occurs, choose one of them according to the rule + if (exponent >= float_info::shorter_interval_tie_lower_threshold && + exponent <= float_info::shorter_interval_tie_upper_threshold) { + ret_value.significand = ret_value.significand % 2 == 0 + ? ret_value.significand + : ret_value.significand - 1; + } else if (ret_value.significand < xi) { + ++ret_value.significand; + } + return ret_value; +} + +template decimal_fp to_decimal(T x) noexcept { + // Step 1: integer promotion & Schubfach multiplier calculation. + + using carrier_uint = typename float_info::carrier_uint; + using cache_entry_type = typename cache_accessor::cache_entry_type; + auto br = bit_cast(x); + + // Extract significand bits and exponent bits. + const carrier_uint significand_mask = + (static_cast(1) << num_significand_bits()) - 1; + carrier_uint significand = (br & significand_mask); + int exponent = + static_cast((br & exponent_mask()) >> num_significand_bits()); + + if (exponent != 0) { // Check if normal. + exponent -= exponent_bias() + num_significand_bits(); + + // Shorter interval case; proceed like Schubfach. + // In fact, when exponent == 1 and significand == 0, the interval is + // regular. However, it can be shown that the end-results are anyway same. + if (significand == 0) return shorter_interval_case(exponent); + + significand |= (static_cast(1) << num_significand_bits()); + } else { + // Subnormal case; the interval is always regular. + if (significand == 0) return {0, 0}; + exponent = + std::numeric_limits::min_exponent - num_significand_bits() - 1; + } + + const bool include_left_endpoint = (significand % 2 == 0); + const bool include_right_endpoint = include_left_endpoint; + + // Compute k and beta. 
+ const int minus_k = floor_log10_pow2(exponent) - float_info::kappa; + const cache_entry_type cache = cache_accessor::get_cached_power(-minus_k); + const int beta = exponent + floor_log2_pow10(-minus_k); + + // Compute zi and deltai. + // 10^kappa <= deltai < 10^(kappa + 1) + const uint32_t deltai = cache_accessor::compute_delta(cache, beta); + const carrier_uint two_fc = significand << 1; + + // For the case of binary32, the result of integer check is not correct for + // 29711844 * 2^-82 + // = 6.1442653300000000008655037797566933477355632930994033813476... * 10^-18 + // and 29711844 * 2^-81 + // = 1.2288530660000000001731007559513386695471126586198806762695... * 10^-17, + // and they are the unique counterexamples. However, since 29711844 is even, + // this does not cause any problem for the endpoints calculations; it can only + // cause a problem when we need to perform integer check for the center. + // Fortunately, with these inputs, that branch is never executed, so we are + // fine. + const typename cache_accessor::compute_mul_result z_mul = + cache_accessor::compute_mul((two_fc | 1) << beta, cache); + + // Step 2: Try larger divisor; remove trailing zeros if necessary. + + // Using an upper bound on zi, we might be able to optimize the division + // better than the compiler; we are computing zi / big_divisor here. + decimal_fp ret_value; + ret_value.significand = divide_by_10_to_kappa_plus_1(z_mul.result); + uint32_t r = static_cast(z_mul.result - float_info::big_divisor * + ret_value.significand); + + if (r < deltai) { + // Exclude the right endpoint if necessary. + if (r == 0 && (z_mul.is_integer & !include_right_endpoint)) { + --ret_value.significand; + r = float_info::big_divisor; + goto small_divisor_case_label; + } + } else if (r > deltai) { + goto small_divisor_case_label; + } else { + // r == deltai; compare fractional parts. 
+ const typename cache_accessor::compute_mul_parity_result x_mul = + cache_accessor::compute_mul_parity(two_fc - 1, cache, beta); + + if (!(x_mul.parity | (x_mul.is_integer & include_left_endpoint))) + goto small_divisor_case_label; + } + ret_value.exponent = minus_k + float_info::kappa + 1; + + // We may need to remove trailing zeros. + ret_value.exponent += remove_trailing_zeros(ret_value.significand); + return ret_value; + + // Step 3: Find the significand with the smaller divisor. + +small_divisor_case_label: + ret_value.significand *= 10; + ret_value.exponent = minus_k + float_info::kappa; + + uint32_t dist = r - (deltai / 2) + (float_info::small_divisor / 2); + const bool approx_y_parity = + ((dist ^ (float_info::small_divisor / 2)) & 1) != 0; + + // Is dist divisible by 10^kappa? + const bool divisible_by_small_divisor = + check_divisibility_and_divide_by_pow10::kappa>(dist); + + // Add dist / 10^kappa to the significand. + ret_value.significand += dist; + + if (!divisible_by_small_divisor) return ret_value; + + // Check z^(f) >= epsilon^(f). + // We have either yi == zi - epsiloni or yi == (zi - epsiloni) - 1, + // where yi == zi - epsiloni if and only if z^(f) >= epsilon^(f). + // Since there are only 2 possibilities, we only need to care about the + // parity. Also, zi and r should have the same parity since the divisor + // is an even number. + const auto y_mul = cache_accessor::compute_mul_parity(two_fc, cache, beta); + + // If z^(f) >= epsilon^(f), we might have a tie when z^(f) == epsilon^(f), + // or equivalently, when y is an integer. + if (y_mul.parity != approx_y_parity) + --ret_value.significand; + else if (y_mul.is_integer & (ret_value.significand % 2 != 0)) + --ret_value.significand; + return ret_value; +} +} // namespace dragonbox + +#ifdef _MSC_VER +FMT_FUNC auto fmt_snprintf(char* buf, size_t size, const char* fmt, ...) 
+ -> int { + auto args = va_list(); + va_start(args, fmt); + int result = vsnprintf_s(buf, size, _TRUNCATE, fmt, args); + va_end(args); + return result; +} +#endif +} // namespace detail + +template <> struct formatter { + FMT_CONSTEXPR auto parse(format_parse_context& ctx) + -> format_parse_context::iterator { + return ctx.begin(); + } + + template + auto format(const detail::bigint& n, FormatContext& ctx) const -> + typename FormatContext::iterator { + auto out = ctx.out(); + bool first = true; + for (auto i = n.bigits_.size(); i > 0; --i) { + auto value = n.bigits_[i - 1u]; + if (first) { + out = format_to(out, FMT_STRING("{:x}"), value); + first = false; + continue; + } + out = format_to(out, FMT_STRING("{:08x}"), value); + } + if (n.exp_ > 0) + out = format_to(out, FMT_STRING("p{}"), + n.exp_ * detail::bigint::bigit_bits); + return out; + } +}; + +FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) { + for_each_codepoint(s, [this](uint32_t cp, string_view) { + if (cp == invalid_code_point) FMT_THROW(std::runtime_error("invalid utf8")); + if (cp <= 0xFFFF) { + buffer_.push_back(static_cast(cp)); + } else { + cp -= 0x10000; + buffer_.push_back(static_cast(0xD800 + (cp >> 10))); + buffer_.push_back(static_cast(0xDC00 + (cp & 0x3FF))); + } + return true; + }); + buffer_.push_back(0); +} + +FMT_FUNC void format_system_error(detail::buffer& out, int error_code, + const char* message) noexcept { + FMT_TRY { + auto ec = std::error_code(error_code, std::generic_category()); + write(std::back_inserter(out), std::system_error(ec, message).what()); + return; + } + FMT_CATCH(...) 
{} + format_error_code(out, error_code, message); +} + +FMT_FUNC void report_system_error(int error_code, + const char* message) noexcept { + report_error(format_system_error, error_code, message); +} + +FMT_FUNC std::string vformat(string_view fmt, format_args args) { + // Don't optimize the "{}" case to keep the binary size small and because it + // can be better optimized in fmt::format anyway. + auto buffer = memory_buffer(); + detail::vformat_to(buffer, fmt, args); + return to_string(buffer); +} + +namespace detail { +#ifdef _WIN32 +using dword = conditional_t; +extern "C" __declspec(dllimport) int __stdcall WriteConsoleW( // + void*, const void*, dword, dword*, void*); + +FMT_FUNC bool write_console(std::FILE* f, string_view text) { + auto fd = _fileno(f); + if (_isatty(fd)) { + detail::utf8_to_utf16 u16(string_view(text.data(), text.size())); + auto written = detail::dword(); + if (detail::WriteConsoleW(reinterpret_cast(_get_osfhandle(fd)), + u16.c_str(), static_cast(u16.size()), + &written, nullptr)) { + return true; + } + } + // We return false if the file descriptor was not TTY, or it was but + // SetConsoleW failed which can happen if the output has been redirected to + // NUL. In both cases when we return false, we should attempt to do regular + // write via fwrite or std::ostream::write. + return false; +} +#endif + +FMT_FUNC void print(std::FILE* f, string_view text) { +#ifdef _WIN32 + if (write_console(f, text)) return; +#endif + detail::fwrite_fully(text.data(), 1, text.size(), f); +} +} // namespace detail + +FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) { + memory_buffer buffer; + detail::vformat_to(buffer, format_str, args); + detail::print(f, {buffer.data(), buffer.size()}); +} + +#ifdef _WIN32 +// Print assuming legacy (non-Unicode) encoding. 
+// Formats `args` with `format_str` into a memory_buffer and writes the
+// resulting bytes to `f` through fwrite_fully.
+FMT_FUNC void detail::vprint_mojibake(std::FILE* f, string_view format_str,
+                                      format_args args) {
+  memory_buffer buffer;
+  detail::vformat_to(buffer, format_str,
+                     basic_format_args<buffer_context<char>>(args));
+  fwrite_fully(buffer.data(), 1, buffer.size(), f);
+}
+#endif
+
+// Overload without a stream argument: forwards to vprint(stdout, ...).
+FMT_FUNC void vprint(string_view format_str, format_args args) {
+  vprint(stdout, format_str, args);
+}
+
+namespace detail {
+
+// One entry of a singletons table consumed by is_printable below.
+struct singleton {
+  // Matched against the high byte of the value being tested (x >> 8).
+  unsigned char upper;
+  // Number of consecutive entries of the companion low-byte array that
+  // belong to `upper`.
+  unsigned char lower_count;
+};
+
+// Table-driven printability test for the 16-bit value `x`.
+//
+//   singletons, singletons_size  entries keyed by the high byte of `x`; the
+//                                scan stops at the first entry whose `upper`
+//                                exceeds it, so entries are expected in
+//                                ascending `upper` order.
+//   singleton_lowers             low bytes stored back to back; each
+//                                singletons entry owns the next
+//                                `lower_count` of them.
+//   normal, normal_size          run lengths. A byte with the top bit clear
+//                                is the length itself; otherwise its low
+//                                7 bits and the following byte form a 15-bit
+//                                length.
+//
+// Returns false if `x` is listed as a singleton. Otherwise the run lengths are
+// subtracted from `x` in order and the result flips after every run that still
+// fits; it starts out true.
+inline auto is_printable(uint16_t x, const singleton* singletons,
+                         size_t singletons_size,
+                         const unsigned char* singleton_lowers,
+                         const unsigned char* normal, size_t normal_size)
+    -> bool {
+  auto upper = x >> 8;
+  auto lower_start = 0;  // Start of the current entry's slice of the lowers.
+  for (size_t i = 0; i < singletons_size; ++i) {
+    auto s = singletons[i];
+    auto lower_end = lower_start + s.lower_count;
+    if (upper < s.upper) break;
+    if (upper == s.upper) {
+      for (auto j = lower_start; j < lower_end; ++j) {
+        if (singleton_lowers[j] == (x & 0xff)) return false;
+      }
+    }
+    lower_start = lower_end;
+  }
+
+  // Signed so that running past `x` shows up as a negative remainder.
+  auto xsigned = static_cast<int>(x);
+  auto current = true;
+  for (size_t i = 0; i < normal_size; ++i) {
+    auto v = static_cast<int>(normal[i]);
+    // A two-byte length consumes the next table byte as well (++i).
+    auto len = (v & 0x80) != 0 ? (v & 0x7f) << 8 | normal[++i] : v;
+    xsigned -= len;
+    if (xsigned < 0) break;
+    current = !current;
+  }
+  return current;
+}
+
+// This code is generated by support/printable.py.
+FMT_FUNC auto is_printable(uint32_t cp) -> bool { + static constexpr singleton singletons0[] = { + {0x00, 1}, {0x03, 5}, {0x05, 6}, {0x06, 3}, {0x07, 6}, {0x08, 8}, + {0x09, 17}, {0x0a, 28}, {0x0b, 25}, {0x0c, 20}, {0x0d, 16}, {0x0e, 13}, + {0x0f, 4}, {0x10, 3}, {0x12, 18}, {0x13, 9}, {0x16, 1}, {0x17, 5}, + {0x18, 2}, {0x19, 3}, {0x1a, 7}, {0x1c, 2}, {0x1d, 1}, {0x1f, 22}, + {0x20, 3}, {0x2b, 3}, {0x2c, 2}, {0x2d, 11}, {0x2e, 1}, {0x30, 3}, + {0x31, 2}, {0x32, 1}, {0xa7, 2}, {0xa9, 2}, {0xaa, 4}, {0xab, 8}, + {0xfa, 2}, {0xfb, 5}, {0xfd, 4}, {0xfe, 3}, {0xff, 9}, + }; + static constexpr unsigned char singletons0_lower[] = { + 0xad, 0x78, 0x79, 0x8b, 0x8d, 0xa2, 0x30, 0x57, 0x58, 0x8b, 0x8c, 0x90, + 0x1c, 0x1d, 0xdd, 0x0e, 0x0f, 0x4b, 0x4c, 0xfb, 0xfc, 0x2e, 0x2f, 0x3f, + 0x5c, 0x5d, 0x5f, 0xb5, 0xe2, 0x84, 0x8d, 0x8e, 0x91, 0x92, 0xa9, 0xb1, + 0xba, 0xbb, 0xc5, 0xc6, 0xc9, 0xca, 0xde, 0xe4, 0xe5, 0xff, 0x00, 0x04, + 0x11, 0x12, 0x29, 0x31, 0x34, 0x37, 0x3a, 0x3b, 0x3d, 0x49, 0x4a, 0x5d, + 0x84, 0x8e, 0x92, 0xa9, 0xb1, 0xb4, 0xba, 0xbb, 0xc6, 0xca, 0xce, 0xcf, + 0xe4, 0xe5, 0x00, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, 0x3a, + 0x3b, 0x45, 0x46, 0x49, 0x4a, 0x5e, 0x64, 0x65, 0x84, 0x91, 0x9b, 0x9d, + 0xc9, 0xce, 0xcf, 0x0d, 0x11, 0x29, 0x45, 0x49, 0x57, 0x64, 0x65, 0x8d, + 0x91, 0xa9, 0xb4, 0xba, 0xbb, 0xc5, 0xc9, 0xdf, 0xe4, 0xe5, 0xf0, 0x0d, + 0x11, 0x45, 0x49, 0x64, 0x65, 0x80, 0x84, 0xb2, 0xbc, 0xbe, 0xbf, 0xd5, + 0xd7, 0xf0, 0xf1, 0x83, 0x85, 0x8b, 0xa4, 0xa6, 0xbe, 0xbf, 0xc5, 0xc7, + 0xce, 0xcf, 0xda, 0xdb, 0x48, 0x98, 0xbd, 0xcd, 0xc6, 0xce, 0xcf, 0x49, + 0x4e, 0x4f, 0x57, 0x59, 0x5e, 0x5f, 0x89, 0x8e, 0x8f, 0xb1, 0xb6, 0xb7, + 0xbf, 0xc1, 0xc6, 0xc7, 0xd7, 0x11, 0x16, 0x17, 0x5b, 0x5c, 0xf6, 0xf7, + 0xfe, 0xff, 0x80, 0x0d, 0x6d, 0x71, 0xde, 0xdf, 0x0e, 0x0f, 0x1f, 0x6e, + 0x6f, 0x1c, 0x1d, 0x5f, 0x7d, 0x7e, 0xae, 0xaf, 0xbb, 0xbc, 0xfa, 0x16, + 0x17, 0x1e, 0x1f, 0x46, 0x47, 0x4e, 0x4f, 0x58, 0x5a, 0x5c, 0x5e, 0x7e, + 0x7f, 0xb5, 0xc5, 0xd4, 0xd5, 
0xdc, 0xf0, 0xf1, 0xf5, 0x72, 0x73, 0x8f, + 0x74, 0x75, 0x96, 0x2f, 0x5f, 0x26, 0x2e, 0x2f, 0xa7, 0xaf, 0xb7, 0xbf, + 0xc7, 0xcf, 0xd7, 0xdf, 0x9a, 0x40, 0x97, 0x98, 0x30, 0x8f, 0x1f, 0xc0, + 0xc1, 0xce, 0xff, 0x4e, 0x4f, 0x5a, 0x5b, 0x07, 0x08, 0x0f, 0x10, 0x27, + 0x2f, 0xee, 0xef, 0x6e, 0x6f, 0x37, 0x3d, 0x3f, 0x42, 0x45, 0x90, 0x91, + 0xfe, 0xff, 0x53, 0x67, 0x75, 0xc8, 0xc9, 0xd0, 0xd1, 0xd8, 0xd9, 0xe7, + 0xfe, 0xff, + }; + static constexpr singleton singletons1[] = { + {0x00, 6}, {0x01, 1}, {0x03, 1}, {0x04, 2}, {0x08, 8}, {0x09, 2}, + {0x0a, 5}, {0x0b, 2}, {0x0e, 4}, {0x10, 1}, {0x11, 2}, {0x12, 5}, + {0x13, 17}, {0x14, 1}, {0x15, 2}, {0x17, 2}, {0x19, 13}, {0x1c, 5}, + {0x1d, 8}, {0x24, 1}, {0x6a, 3}, {0x6b, 2}, {0xbc, 2}, {0xd1, 2}, + {0xd4, 12}, {0xd5, 9}, {0xd6, 2}, {0xd7, 2}, {0xda, 1}, {0xe0, 5}, + {0xe1, 2}, {0xe8, 2}, {0xee, 32}, {0xf0, 4}, {0xf8, 2}, {0xf9, 2}, + {0xfa, 2}, {0xfb, 1}, + }; + static constexpr unsigned char singletons1_lower[] = { + 0x0c, 0x27, 0x3b, 0x3e, 0x4e, 0x4f, 0x8f, 0x9e, 0x9e, 0x9f, 0x06, 0x07, + 0x09, 0x36, 0x3d, 0x3e, 0x56, 0xf3, 0xd0, 0xd1, 0x04, 0x14, 0x18, 0x36, + 0x37, 0x56, 0x57, 0x7f, 0xaa, 0xae, 0xaf, 0xbd, 0x35, 0xe0, 0x12, 0x87, + 0x89, 0x8e, 0x9e, 0x04, 0x0d, 0x0e, 0x11, 0x12, 0x29, 0x31, 0x34, 0x3a, + 0x45, 0x46, 0x49, 0x4a, 0x4e, 0x4f, 0x64, 0x65, 0x5c, 0xb6, 0xb7, 0x1b, + 0x1c, 0x07, 0x08, 0x0a, 0x0b, 0x14, 0x17, 0x36, 0x39, 0x3a, 0xa8, 0xa9, + 0xd8, 0xd9, 0x09, 0x37, 0x90, 0x91, 0xa8, 0x07, 0x0a, 0x3b, 0x3e, 0x66, + 0x69, 0x8f, 0x92, 0x6f, 0x5f, 0xee, 0xef, 0x5a, 0x62, 0x9a, 0x9b, 0x27, + 0x28, 0x55, 0x9d, 0xa0, 0xa1, 0xa3, 0xa4, 0xa7, 0xa8, 0xad, 0xba, 0xbc, + 0xc4, 0x06, 0x0b, 0x0c, 0x15, 0x1d, 0x3a, 0x3f, 0x45, 0x51, 0xa6, 0xa7, + 0xcc, 0xcd, 0xa0, 0x07, 0x19, 0x1a, 0x22, 0x25, 0x3e, 0x3f, 0xc5, 0xc6, + 0x04, 0x20, 0x23, 0x25, 0x26, 0x28, 0x33, 0x38, 0x3a, 0x48, 0x4a, 0x4c, + 0x50, 0x53, 0x55, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x63, 0x65, 0x66, + 0x6b, 0x73, 0x78, 0x7d, 0x7f, 0x8a, 0xa4, 0xaa, 0xaf, 0xb0, 
0xc0, 0xd0, + 0xae, 0xaf, 0x79, 0xcc, 0x6e, 0x6f, 0x93, + }; + static constexpr unsigned char normal0[] = { + 0x00, 0x20, 0x5f, 0x22, 0x82, 0xdf, 0x04, 0x82, 0x44, 0x08, 0x1b, 0x04, + 0x06, 0x11, 0x81, 0xac, 0x0e, 0x80, 0xab, 0x35, 0x28, 0x0b, 0x80, 0xe0, + 0x03, 0x19, 0x08, 0x01, 0x04, 0x2f, 0x04, 0x34, 0x04, 0x07, 0x03, 0x01, + 0x07, 0x06, 0x07, 0x11, 0x0a, 0x50, 0x0f, 0x12, 0x07, 0x55, 0x07, 0x03, + 0x04, 0x1c, 0x0a, 0x09, 0x03, 0x08, 0x03, 0x07, 0x03, 0x02, 0x03, 0x03, + 0x03, 0x0c, 0x04, 0x05, 0x03, 0x0b, 0x06, 0x01, 0x0e, 0x15, 0x05, 0x3a, + 0x03, 0x11, 0x07, 0x06, 0x05, 0x10, 0x07, 0x57, 0x07, 0x02, 0x07, 0x15, + 0x0d, 0x50, 0x04, 0x43, 0x03, 0x2d, 0x03, 0x01, 0x04, 0x11, 0x06, 0x0f, + 0x0c, 0x3a, 0x04, 0x1d, 0x25, 0x5f, 0x20, 0x6d, 0x04, 0x6a, 0x25, 0x80, + 0xc8, 0x05, 0x82, 0xb0, 0x03, 0x1a, 0x06, 0x82, 0xfd, 0x03, 0x59, 0x07, + 0x15, 0x0b, 0x17, 0x09, 0x14, 0x0c, 0x14, 0x0c, 0x6a, 0x06, 0x0a, 0x06, + 0x1a, 0x06, 0x59, 0x07, 0x2b, 0x05, 0x46, 0x0a, 0x2c, 0x04, 0x0c, 0x04, + 0x01, 0x03, 0x31, 0x0b, 0x2c, 0x04, 0x1a, 0x06, 0x0b, 0x03, 0x80, 0xac, + 0x06, 0x0a, 0x06, 0x21, 0x3f, 0x4c, 0x04, 0x2d, 0x03, 0x74, 0x08, 0x3c, + 0x03, 0x0f, 0x03, 0x3c, 0x07, 0x38, 0x08, 0x2b, 0x05, 0x82, 0xff, 0x11, + 0x18, 0x08, 0x2f, 0x11, 0x2d, 0x03, 0x20, 0x10, 0x21, 0x0f, 0x80, 0x8c, + 0x04, 0x82, 0x97, 0x19, 0x0b, 0x15, 0x88, 0x94, 0x05, 0x2f, 0x05, 0x3b, + 0x07, 0x02, 0x0e, 0x18, 0x09, 0x80, 0xb3, 0x2d, 0x74, 0x0c, 0x80, 0xd6, + 0x1a, 0x0c, 0x05, 0x80, 0xff, 0x05, 0x80, 0xdf, 0x0c, 0xee, 0x0d, 0x03, + 0x84, 0x8d, 0x03, 0x37, 0x09, 0x81, 0x5c, 0x14, 0x80, 0xb8, 0x08, 0x80, + 0xcb, 0x2a, 0x38, 0x03, 0x0a, 0x06, 0x38, 0x08, 0x46, 0x08, 0x0c, 0x06, + 0x74, 0x0b, 0x1e, 0x03, 0x5a, 0x04, 0x59, 0x09, 0x80, 0x83, 0x18, 0x1c, + 0x0a, 0x16, 0x09, 0x4c, 0x04, 0x80, 0x8a, 0x06, 0xab, 0xa4, 0x0c, 0x17, + 0x04, 0x31, 0xa1, 0x04, 0x81, 0xda, 0x26, 0x07, 0x0c, 0x05, 0x05, 0x80, + 0xa5, 0x11, 0x81, 0x6d, 0x10, 0x78, 0x28, 0x2a, 0x06, 0x4c, 0x04, 0x80, + 0x8d, 0x04, 0x80, 0xbe, 0x03, 0x1b, 
0x03, 0x0f, 0x0d, + }; + static constexpr unsigned char normal1[] = { + 0x5e, 0x22, 0x7b, 0x05, 0x03, 0x04, 0x2d, 0x03, 0x66, 0x03, 0x01, 0x2f, + 0x2e, 0x80, 0x82, 0x1d, 0x03, 0x31, 0x0f, 0x1c, 0x04, 0x24, 0x09, 0x1e, + 0x05, 0x2b, 0x05, 0x44, 0x04, 0x0e, 0x2a, 0x80, 0xaa, 0x06, 0x24, 0x04, + 0x24, 0x04, 0x28, 0x08, 0x34, 0x0b, 0x01, 0x80, 0x90, 0x81, 0x37, 0x09, + 0x16, 0x0a, 0x08, 0x80, 0x98, 0x39, 0x03, 0x63, 0x08, 0x09, 0x30, 0x16, + 0x05, 0x21, 0x03, 0x1b, 0x05, 0x01, 0x40, 0x38, 0x04, 0x4b, 0x05, 0x2f, + 0x04, 0x0a, 0x07, 0x09, 0x07, 0x40, 0x20, 0x27, 0x04, 0x0c, 0x09, 0x36, + 0x03, 0x3a, 0x05, 0x1a, 0x07, 0x04, 0x0c, 0x07, 0x50, 0x49, 0x37, 0x33, + 0x0d, 0x33, 0x07, 0x2e, 0x08, 0x0a, 0x81, 0x26, 0x52, 0x4e, 0x28, 0x08, + 0x2a, 0x56, 0x1c, 0x14, 0x17, 0x09, 0x4e, 0x04, 0x1e, 0x0f, 0x43, 0x0e, + 0x19, 0x07, 0x0a, 0x06, 0x48, 0x08, 0x27, 0x09, 0x75, 0x0b, 0x3f, 0x41, + 0x2a, 0x06, 0x3b, 0x05, 0x0a, 0x06, 0x51, 0x06, 0x01, 0x05, 0x10, 0x03, + 0x05, 0x80, 0x8b, 0x62, 0x1e, 0x48, 0x08, 0x0a, 0x80, 0xa6, 0x5e, 0x22, + 0x45, 0x0b, 0x0a, 0x06, 0x0d, 0x13, 0x39, 0x07, 0x0a, 0x36, 0x2c, 0x04, + 0x10, 0x80, 0xc0, 0x3c, 0x64, 0x53, 0x0c, 0x48, 0x09, 0x0a, 0x46, 0x45, + 0x1b, 0x48, 0x08, 0x53, 0x1d, 0x39, 0x81, 0x07, 0x46, 0x0a, 0x1d, 0x03, + 0x47, 0x49, 0x37, 0x03, 0x0e, 0x08, 0x0a, 0x06, 0x39, 0x07, 0x0a, 0x81, + 0x36, 0x19, 0x80, 0xb7, 0x01, 0x0f, 0x32, 0x0d, 0x83, 0x9b, 0x66, 0x75, + 0x0b, 0x80, 0xc4, 0x8a, 0xbc, 0x84, 0x2f, 0x8f, 0xd1, 0x82, 0x47, 0xa1, + 0xb9, 0x82, 0x39, 0x07, 0x2a, 0x04, 0x02, 0x60, 0x26, 0x0a, 0x46, 0x0a, + 0x28, 0x05, 0x13, 0x82, 0xb0, 0x5b, 0x65, 0x4b, 0x04, 0x39, 0x07, 0x11, + 0x40, 0x05, 0x0b, 0x02, 0x0e, 0x97, 0xf8, 0x08, 0x84, 0xd6, 0x2a, 0x09, + 0xa2, 0xf7, 0x81, 0x1f, 0x31, 0x03, 0x11, 0x04, 0x08, 0x81, 0x8c, 0x89, + 0x04, 0x6b, 0x05, 0x0d, 0x03, 0x09, 0x07, 0x10, 0x93, 0x60, 0x80, 0xf6, + 0x0a, 0x73, 0x08, 0x6e, 0x17, 0x46, 0x80, 0x9a, 0x14, 0x0c, 0x57, 0x09, + 0x19, 0x80, 0x87, 0x81, 0x47, 0x03, 0x85, 0x42, 0x0f, 0x15, 0x85, 0x50, + 
0x2b, 0x80, 0xd5, 0x2d, 0x03, 0x1a, 0x04, 0x02, 0x81, 0x70, 0x3a, 0x05, + 0x01, 0x85, 0x00, 0x80, 0xd7, 0x29, 0x4c, 0x04, 0x0a, 0x04, 0x02, 0x83, + 0x11, 0x44, 0x4c, 0x3d, 0x80, 0xc2, 0x3c, 0x06, 0x01, 0x04, 0x55, 0x05, + 0x1b, 0x34, 0x02, 0x81, 0x0e, 0x2c, 0x04, 0x64, 0x0c, 0x56, 0x0a, 0x80, + 0xae, 0x38, 0x1d, 0x0d, 0x2c, 0x04, 0x09, 0x07, 0x02, 0x0e, 0x06, 0x80, + 0x9a, 0x83, 0xd8, 0x08, 0x0d, 0x03, 0x0d, 0x03, 0x74, 0x0c, 0x59, 0x07, + 0x0c, 0x14, 0x0c, 0x04, 0x38, 0x08, 0x0a, 0x06, 0x28, 0x08, 0x22, 0x4e, + 0x81, 0x54, 0x0c, 0x15, 0x03, 0x03, 0x05, 0x07, 0x09, 0x19, 0x07, 0x07, + 0x09, 0x03, 0x0d, 0x07, 0x29, 0x80, 0xcb, 0x25, 0x0a, 0x84, 0x06, + }; + auto lower = static_cast(cp); + if (cp < 0x10000) { + return is_printable(lower, singletons0, + sizeof(singletons0) / sizeof(*singletons0), + singletons0_lower, normal0, sizeof(normal0)); + } + if (cp < 0x20000) { + return is_printable(lower, singletons1, + sizeof(singletons1) / sizeof(*singletons1), + singletons1_lower, normal1, sizeof(normal1)); + } + if (0x2a6de <= cp && cp < 0x2a700) return false; + if (0x2b735 <= cp && cp < 0x2b740) return false; + if (0x2b81e <= cp && cp < 0x2b820) return false; + if (0x2cea2 <= cp && cp < 0x2ceb0) return false; + if (0x2ebe1 <= cp && cp < 0x2f800) return false; + if (0x2fa1e <= cp && cp < 0x30000) return false; + if (0x3134b <= cp && cp < 0xe0100) return false; + if (0xe01f0 <= cp && cp < 0x110000) return false; + return cp < 0x110000; +} + +} // namespace detail + +FMT_END_NAMESPACE + +#endif // FMT_FORMAT_INL_H_ diff --git a/libkram/fmt/format.cpp b/libkram/fmt/format.cpp new file mode 100644 index 00000000..a84bea85 --- /dev/null +++ b/libkram/fmt/format.cpp @@ -0,0 +1,44 @@ +// Formatting library for C++ +// +// Copyright (c) 2012 - 2016, Victor Zverovich +// All rights reserved. +// +// For the license information refer to format.h. 
+ +#include "format-inl.h" + +FMT_BEGIN_NAMESPACE +namespace detail { + +template FMT_API auto dragonbox::to_decimal(float x) noexcept + -> dragonbox::decimal_fp; +template FMT_API auto dragonbox::to_decimal(double x) noexcept + -> dragonbox::decimal_fp; + +#ifndef FMT_STATIC_THOUSANDS_SEPARATOR +template FMT_API locale_ref::locale_ref(const std::locale& loc); +template FMT_API auto locale_ref::get() const -> std::locale; +#endif + +// Explicit instantiations for char. + +template FMT_API auto thousands_sep_impl(locale_ref) + -> thousands_sep_result; +template FMT_API auto decimal_point_impl(locale_ref) -> char; + +template FMT_API void buffer::append(const char*, const char*); + +template FMT_API void vformat_to(buffer&, string_view, + basic_format_args, + locale_ref); + +// Explicit instantiations for wchar_t. + +template FMT_API auto thousands_sep_impl(locale_ref) + -> thousands_sep_result; +template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t; + +template FMT_API void buffer::append(const wchar_t*, const wchar_t*); + +} // namespace detail +FMT_END_NAMESPACE diff --git a/libkram/fmt/format.h b/libkram/fmt/format.h new file mode 100644 index 00000000..4b26f926 --- /dev/null +++ b/libkram/fmt/format.h @@ -0,0 +1,4310 @@ +/* + Formatting library for C++ + + Copyright (c) 2012 - present, Victor Zverovich + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + --- Optional exception to the license --- + + As an exception, if, as a result of your compiling your source code, portions + of this Software are embedded into a machine-executable object form of such + source code, you may redistribute such embedded portions in such object form + without including the above copyright and permission notices. + */ + +#ifndef FMT_FORMAT_H_ +#define FMT_FORMAT_H_ + +#include // std::signbit +#include // uint32_t +#include // std::memcpy +#include // std::initializer_list +#include // std::numeric_limits +#include // std::uninitialized_copy +#include // std::runtime_error +#include // std::system_error + +#ifdef __cpp_lib_bit_cast +# include // std::bitcast +#endif + +#include "core.h" + +#if FMT_GCC_VERSION +# define FMT_GCC_VISIBILITY_HIDDEN __attribute__((visibility("hidden"))) +#else +# define FMT_GCC_VISIBILITY_HIDDEN +#endif + +#ifdef __NVCC__ +# define FMT_CUDA_VERSION (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__) +#else +# define FMT_CUDA_VERSION 0 +#endif + +#ifdef __has_builtin +# define FMT_HAS_BUILTIN(x) __has_builtin(x) +#else +# define FMT_HAS_BUILTIN(x) 0 +#endif + +#if FMT_GCC_VERSION || FMT_CLANG_VERSION +# define FMT_NOINLINE __attribute__((noinline)) +#else +# define FMT_NOINLINE +#endif + +#if FMT_MSC_VERSION +# define FMT_MSC_DEFAULT = default +#else +# define FMT_MSC_DEFAULT +#endif + +#ifndef FMT_THROW +# if FMT_EXCEPTIONS +# if FMT_MSC_VERSION || defined(__NVCC__) +FMT_BEGIN_NAMESPACE +namespace detail { +template inline void 
do_throw(const Exception& x) { + // Silence unreachable code warnings in MSVC and NVCC because these + // are nearly impossible to fix in a generic code. + volatile bool b = true; + if (b) throw x; +} +} // namespace detail +FMT_END_NAMESPACE +# define FMT_THROW(x) detail::do_throw(x) +# else +# define FMT_THROW(x) throw x +# endif +# else +# define FMT_THROW(x) \ + do { \ + FMT_ASSERT(false, (x).what()); \ + } while (false) +# endif +#endif + +#if FMT_EXCEPTIONS +# define FMT_TRY try +# define FMT_CATCH(x) catch (x) +#else +# define FMT_TRY if (true) +# define FMT_CATCH(x) if (false) +#endif + +#ifndef FMT_MAYBE_UNUSED +# if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused) +# define FMT_MAYBE_UNUSED [[maybe_unused]] +# else +# define FMT_MAYBE_UNUSED +# endif +#endif + +#ifndef FMT_USE_USER_DEFINED_LITERALS +// EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs. +# if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 407 || \ + FMT_MSC_VERSION >= 1900) && \ + (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= /* UDL feature */ 480) +# define FMT_USE_USER_DEFINED_LITERALS 1 +# else +# define FMT_USE_USER_DEFINED_LITERALS 0 +# endif +#endif + +// Defining FMT_REDUCE_INT_INSTANTIATIONS to 1, will reduce the number of +// integer formatter template instantiations to just one by only using the +// largest integer type. This results in a reduction in binary size but will +// cause a decrease in integer formatting performance. +#if !defined(FMT_REDUCE_INT_INSTANTIATIONS) +# define FMT_REDUCE_INT_INSTANTIATIONS 0 +#endif + +// __builtin_clz is broken in clang with Microsoft CodeGen: +// https://github.com/fmtlib/fmt/issues/519. 
+#if !FMT_MSC_VERSION
+#  if FMT_HAS_BUILTIN(__builtin_clz) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CLZ(n) __builtin_clz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_clzll) || FMT_GCC_VERSION || FMT_ICC_VERSION
+#    define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
+#  endif
+#endif
+
+// __builtin_ctz is broken in Intel Compiler Classic on Windows:
+// https://github.com/fmtlib/fmt/issues/2510.
+#ifndef __ICL
+#  if FMT_HAS_BUILTIN(__builtin_ctz) || FMT_GCC_VERSION || FMT_ICC_VERSION || \
+      defined(__NVCOMPILER)
+#    define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
+#  endif
+#  if FMT_HAS_BUILTIN(__builtin_ctzll) || FMT_GCC_VERSION || \
+      FMT_ICC_VERSION || defined(__NVCOMPILER)
+#    define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+#  endif
+#endif
+
+#if FMT_MSC_VERSION
+#  include <intrin.h>  // _BitScanReverse[64], _BitScanForward[64], _umul128
+#endif
+
+// Some compilers masquerade as both MSVC and GCC-likes or otherwise support
+// __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
+// MSVC intrinsics if the clz and clzll builtins are not available.
+#if FMT_MSC_VERSION && !defined(FMT_BUILTIN_CLZLL) && \
+    !defined(FMT_BUILTIN_CTZLL)
+FMT_BEGIN_NAMESPACE
+namespace detail {
+// Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
+#  if !defined(__clang__)
+#    pragma intrinsic(_BitScanForward)
+#    pragma intrinsic(_BitScanReverse)
+#    if defined(_WIN64)
+#      pragma intrinsic(_BitScanForward64)
+#      pragma intrinsic(_BitScanReverse64)
+#    endif
+#  endif
+
+// Counts leading zero bits of a nonzero 32-bit value.
+inline auto clz(uint32_t x) -> int {
+  unsigned long r = 0;
+  _BitScanReverse(&r, x);
+  FMT_ASSERT(x != 0, "");
+  // Static analysis complains about using uninitialized data
+  // "r", but the only way that can happen is if "x" is 0,
+  // which the callers guarantee to not happen.
+  FMT_MSC_WARNING(suppress : 6102)
+  return 31 ^ static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CLZ(n) detail::clz(n)
+
+// Counts leading zero bits of a nonzero 64-bit value. Without _WIN64 the
+// value is scanned as two 32-bit halves, high half first.
+inline auto clzll(uint64_t x) -> int {
+  unsigned long r = 0;
+#  ifdef _WIN64
+  _BitScanReverse64(&r, x);
+#  else
+  // Scan the high 32 bits. The cast keeps the unsigned long -> int conversion
+  // explicit, matching the other return statements in this group.
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32)))
+    return 63 ^ static_cast<int>(r + 32);
+  // Scan the low 32 bits.
+  _BitScanReverse(&r, static_cast<uint32_t>(x));
+#  endif
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+  return 63 ^ static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CLZLL(n) detail::clzll(n)
+
+// Counts trailing zero bits of a nonzero 32-bit value.
+inline auto ctz(uint32_t x) -> int {
+  unsigned long r = 0;
+  _BitScanForward(&r, x);
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZ(n) detail::ctz(n)
+
+// Counts trailing zero bits of a nonzero 64-bit value. Without _WIN64 the
+// value is scanned as two 32-bit halves, low half first.
+inline auto ctzll(uint64_t x) -> int {
+  unsigned long r = 0;
+  FMT_ASSERT(x != 0, "");
+  FMT_MSC_WARNING(suppress : 6102)  // Suppress a bogus static analysis warning.
+#  ifdef _WIN64
+  _BitScanForward64(&r, x);
+#  else
+  // Scan the low 32 bits.
+  if (_BitScanForward(&r, static_cast<uint32_t>(x))) return static_cast<int>(r);
+  // Scan the high 32 bits.
+  _BitScanForward(&r, static_cast<uint32_t>(x >> 32));
+  r += 32;
+#  endif
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZLL(n) detail::ctzll(n)
+}  // namespace detail
+FMT_END_NAMESPACE
+#endif
+
+FMT_BEGIN_NAMESPACE
+namespace detail {
+
+// Throws std::runtime_error when condition holds and FMT_FUZZ is defined;
+// otherwise does nothing.
+FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) {
+  ignore_unused(condition);
+#ifdef FMT_FUZZ
+  if (condition) throw std::runtime_error("fuzzing limit reached");
+#endif
+}
+
+// A compile-time string carried as a character pack; converts to a
+// basic_string_view over the pack without a terminating NUL.
+template <typename CharT, CharT... C> struct string_literal {
+  static constexpr CharT value[sizeof...(C)] = {C...};
+  constexpr operator basic_string_view<CharT>() const {
+    return {value, sizeof...(C)};
+  }
+};
+
+#if FMT_CPLUSPLUS < 201703L
+// Out-of-class definition required for static constexpr members before C++17.
+template <typename CharT, CharT... C>
+constexpr CharT string_literal<CharT, C...>::value[sizeof...(C)];
+#endif
+
+// A stream buffer that forwards every written character to a fmt buffer.
+template <typename Streambuf> class formatbuf : public Streambuf {
+ private:
+  using char_type = typename Streambuf::char_type;
+  using streamsize = decltype(std::declval<Streambuf>().sputn(nullptr, 0));
+  using int_type = typename Streambuf::int_type;
+  using traits_type = typename Streambuf::traits_type;
+
+  buffer<char_type>& buffer_;
+
+ public:
+  explicit formatbuf(buffer<char_type>& buf) : buffer_(buf) {}
+
+ protected:
+  // The put area is always empty. This makes the implementation simpler and has
+  // the advantage that the streambuf and the buffer are always in sync and
+  // sputc never writes into uninitialized memory. A disadvantage is that each
+  // call to sputc always results in a (virtual) call to overflow. There is no
+  // disadvantage here for sputn since this always results in a call to xsputn.
+
+  // Appends ch to the buffer unless it is EOF; always returns ch.
+  auto overflow(int_type ch) -> int_type override {
+    if (!traits_type::eq_int_type(ch, traits_type::eof()))
+      buffer_.push_back(static_cast<char_type>(ch));
+    return ch;
+  }
+
+  // Appends count characters starting at s and reports all of them as written.
+  auto xsputn(const char_type* s, streamsize count) -> streamsize override {
+    buffer_.append(s, s + count);
+    return count;
+  }
+};
+
+// Implementation of std::bit_cast for pre-C++20.
+template <typename To, typename From, FMT_ENABLE_IF(sizeof(To) == sizeof(From))>
+FMT_CONSTEXPR20 auto bit_cast(const From& from) -> To {
+#ifdef __cpp_lib_bit_cast
+  if (is_constant_evaluated()) return std::bit_cast<To>(from);
+#endif
+  auto to = To();
+  // The cast suppresses a bogus -Wclass-memaccess on GCC.
+  std::memcpy(static_cast<void*>(&to), &from, sizeof(to));
+  return to;
+}
+
+// Reports whether the target stores the most significant byte first. Falls
+// back to inspecting the first byte of the int 1 when no macro decides it.
+inline auto is_big_endian() -> bool {
+#ifdef _WIN32
+  return false;
+#elif defined(__BIG_ENDIAN__)
+  return true;
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+  return __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__;
+#else
+  struct bytes {
+    char data[sizeof(int)];
+  };
+  return bit_cast<bytes>(1).data[0] == 0;
+#endif
+}
+
+// A 128-bit unsigned integer stored as two 64-bit halves, providing only the
+// operations defined below.
+class uint128_fallback {
+ private:
+  uint64_t lo_, hi_;
+
+  friend uint128_fallback umul128(uint64_t x, uint64_t y) noexcept;
+
+ public:
+  constexpr uint128_fallback(uint64_t hi, uint64_t lo) : lo_(lo), hi_(hi) {}
+  constexpr uint128_fallback(uint64_t value = 0) : lo_(value), hi_(0) {}
+
+  constexpr uint64_t high() const noexcept { return hi_; }
+  constexpr uint64_t low() const noexcept { return lo_; }
+
+  // Explicit conversion to an integral type; uses the low half only.
+  template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
+  constexpr explicit operator T() const {
+    return static_cast<T>(lo_);
+  }
+
+  friend constexpr auto operator==(const uint128_fallback& lhs,
+                                   const uint128_fallback& rhs) -> bool {
+    return lhs.hi_ == rhs.hi_ && lhs.lo_ == rhs.lo_;
+  }
+  friend constexpr auto operator!=(const uint128_fallback& lhs,
+                                   const uint128_fallback& rhs) -> bool {
+    return !(lhs == rhs);
+  }
+  friend constexpr auto operator>(const uint128_fallback& lhs,
+                                  const uint128_fallback& rhs) -> bool {
+    return lhs.hi_ != rhs.hi_ ? lhs.hi_ > rhs.hi_ : lhs.lo_ > rhs.lo_;
+  }
+  friend constexpr auto operator|(const uint128_fallback& lhs,
+                                  const uint128_fallback& rhs)
+      -> uint128_fallback {
+    return {lhs.hi_ | rhs.hi_, lhs.lo_ | rhs.lo_};
+  }
+  friend constexpr auto operator&(const uint128_fallback& lhs,
+                                  const uint128_fallback& rhs)
+      -> uint128_fallback {
+    return {lhs.hi_ & rhs.hi_, lhs.lo_ & rhs.lo_};
+  }
+  friend auto operator+(const uint128_fallback& lhs,
+                        const uint128_fallback& rhs) -> uint128_fallback {
+    auto result = uint128_fallback(lhs);
+    result += rhs;
+    return result;
+  }
+  // Multiplies a value whose high half is zero by a 32-bit factor.
+  friend auto operator*(const uint128_fallback& lhs, uint32_t rhs)
+      -> uint128_fallback {
+    FMT_ASSERT(lhs.hi_ == 0, "");
+    uint64_t hi = (lhs.lo_ >> 32) * rhs;
+    uint64_t lo = (lhs.lo_ & ~uint32_t()) * rhs;
+    uint64_t new_lo = (hi << 32) + lo;
+    return {(hi >> 32) + (new_lo < lo ? 1 : 0), new_lo};
+  }
+  friend auto operator-(const uint128_fallback& lhs, uint64_t rhs)
+      -> uint128_fallback {
+    return {lhs.hi_ - (lhs.lo_ < rhs ? 1 : 0), lhs.lo_ - rhs};
+  }
+  FMT_CONSTEXPR auto operator>>(int shift) const -> uint128_fallback {
+    // A zero shift would evaluate hi_ << 64 below, which is undefined.
+    if (shift == 0) return *this;
+    if (shift == 64) return {0, hi_};
+    if (shift > 64) return uint128_fallback(0, hi_) >> (shift - 64);
+    return {hi_ >> shift, (hi_ << (64 - shift)) | (lo_ >> shift)};
+  }
+  FMT_CONSTEXPR auto operator<<(int shift) const -> uint128_fallback {
+    // A zero shift would evaluate lo_ >> 64 below, which is undefined.
+    if (shift == 0) return *this;
+    if (shift == 64) return {lo_, 0};
+    if (shift > 64) return uint128_fallback(lo_, 0) << (shift - 64);
+    return {hi_ << shift | (lo_ >> (64 - shift)), (lo_ << shift)};
+  }
+  FMT_CONSTEXPR auto operator>>=(int shift) -> uint128_fallback& {
+    return *this = *this >> shift;
+  }
+  // Adds n, asserting that the high half does not wrap around.
+  FMT_CONSTEXPR void operator+=(uint128_fallback n) {
+    uint64_t new_lo = lo_ + n.lo_;
+    uint64_t new_hi = hi_ + n.hi_ + (new_lo < lo_ ? 1 : 0);
+    FMT_ASSERT(new_hi >= hi_, "");
+    lo_ = new_lo;
+    hi_ = new_hi;
+  }
+
+  // Adds a 64-bit value, propagating the carry into the high half. Uses an
+  // add-with-carry builtin at run time when one is available.
+  FMT_CONSTEXPR20 uint128_fallback& operator+=(uint64_t n) noexcept {
+    if (is_constant_evaluated()) {
+      lo_ += n;
+      hi_ += (lo_ < n ? 1 : 0);
+      return *this;
+    }
+#if FMT_HAS_BUILTIN(__builtin_addcll) && !defined(__ibmxl__)
+    unsigned long long carry;
+    lo_ = __builtin_addcll(lo_, n, 0, &carry);
+    hi_ += carry;
+#elif FMT_HAS_BUILTIN(__builtin_ia32_addcarryx_u64) && !defined(__ibmxl__)
+    unsigned long long result;
+    auto carry = __builtin_ia32_addcarryx_u64(0, lo_, n, &result);
+    lo_ = result;
+    hi_ += carry;
+#elif defined(_MSC_VER) && defined(_M_X64)
+    auto carry = _addcarry_u64(0, lo_, n, &lo_);
+    _addcarry_u64(carry, hi_, 0, &hi_);
+#else
+    lo_ += n;
+    hi_ += (lo_ < n ? 1 : 0);
+#endif
+    return *this;
+  }
+};
+
+using uint128_t = conditional_t<FMT_USE_INT128, uint128_opt, uint128_fallback>;
+
+#ifdef UINTPTR_MAX
+using uintptr_t = ::uintptr_t;
+#else
+using uintptr_t = uint128_t;
+#endif
+
+// Returns the largest possible value for type T. Same as
+// std::numeric_limits<T>::max() but shorter and not affected by the max macro.
+template <typename T> constexpr auto max_value() -> T {
+  return (std::numeric_limits<T>::max)();
+}
+template <typename T> constexpr auto num_bits() -> int {
+  return std::numeric_limits<T>::digits;
+}
+// std::numeric_limits<T>::digits may return 0 for 128-bit ints.
+template <> constexpr auto num_bits<int128_opt>() -> int { return 128; }
+template <> constexpr auto num_bits<uint128_t>() -> int { return 128; }
+
+// A heterogeneous bit_cast used for converting 96-bit long double to uint128_t
+// and 128-bit pointers to uint128_fallback.
+template <typename To, typename From, FMT_ENABLE_IF(sizeof(To) > sizeof(From))>
+inline auto bit_cast(const From& from) -> To {
+  constexpr auto size = static_cast<int>(sizeof(From) / sizeof(unsigned));
+  struct data_t {
+    unsigned value[static_cast<unsigned>(size)];
+  } data = bit_cast<data_t>(from);
+  auto result = To();
+  // Fold the unsigned-sized words into the result, most significant first.
+  if (const_check(is_big_endian())) {
+    for (int i = 0; i < size; ++i)
+      result = (result << num_bits<unsigned>()) | data.value[i];
+  } else {
+    for (int i = size - 1; i >= 0; --i)
+      result = (result << num_bits<unsigned>()) | data.value[i];
+  }
+  return result;
+}
+
+// Passes condition to __builtin_assume where that builtin is available;
+// otherwise has no effect.
+FMT_INLINE void assume(bool condition) {
+  (void)condition;
+#if FMT_HAS_BUILTIN(__builtin_assume) && !FMT_ICC_VERSION
+  __builtin_assume(condition);
+#endif
+}
+
+// An approximation of iterator_t for pre-C++20 systems.
+template <typename T>
+using iterator_t = decltype(std::begin(std::declval<T&>()));
+template <typename T> using sentinel_t = decltype(std::end(std::declval<T&>()));
+
+// A workaround for std::string not having mutable data() until C++17.
+template <typename Char>
+inline auto get_data(std::basic_string<Char>& s) -> Char* {
+  return &s[0];
+}
+template <typename Container>
+inline auto get_data(Container& c) -> typename Container::value_type* {
+  return c.data();
+}
+
+#if defined(_SECURE_SCL) && _SECURE_SCL
+// Make a checked iterator to avoid MSVC warnings.
+template <typename T> using checked_ptr = stdext::checked_array_iterator<T*>;
+template <typename T>
+constexpr auto make_checked(T* p, size_t size) -> checked_ptr<T> {
+  return {p, size};
+}
+#else
+// Without checked iterators, checked_ptr is a raw pointer and make_checked
+// returns p unchanged.
+template <typename T> using checked_ptr = T*;
+template <typename T> constexpr auto make_checked(T* p, size_t) -> T* {
+  return p;
+}
+#endif
+
+// Attempts to reserve space for n extra characters in the output range.
+// Returns a pointer to the reserved range or a reference to it.
+template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
+#if FMT_CLANG_VERSION >= 307 && !FMT_ICC_VERSION
+__attribute__((no_sanitize("undefined")))
+#endif
+inline auto
+reserve(std::back_insert_iterator<Container> it, size_t n)
+    -> checked_ptr<typename Container::value_type> {
+  // Grow the container by n elements and return a pointer to the new tail.
+  Container& c = get_container(it);
+  size_t size = c.size();
+  c.resize(size + n);
+  return make_checked(get_data(c) + size, n);
+}
+
+// For a buffer appender: asks the buffer for capacity for n more elements and
+// returns the appender itself.
+template <typename T>
+inline auto reserve(buffer_appender<T> it, size_t n) -> buffer_appender<T> {
+  buffer<T>& buf = get_container(it);
+  buf.try_reserve(buf.size() + n);
+  return it;
+}
+
+// For any other iterator: nothing to reserve, returns the iterator itself.
+template <typename Iterator>
+constexpr auto reserve(Iterator& it, size_t) -> Iterator& {
+  return it;
+}
+
+// The type reserve() returns for a given output iterator type.
+template <typename OutputIt>
+using reserve_iterator =
+    remove_reference_t<decltype(reserve(std::declval<OutputIt&>(), 0))>;
+
+// Generic fallback: no direct pointer into the output is available.
+template <typename T, typename OutputIt>
+constexpr auto to_pointer(OutputIt, size_t) -> T* {
+  return nullptr;
+}
+// Returns a pointer to n newly added elements at the end of the buffer, or
+// nullptr when the current capacity cannot hold them.
+template <typename T> auto to_pointer(buffer_appender<T> it, size_t n) -> T* {
+  buffer<T>& buf = get_container(it);
+  auto size = buf.size();
+  if (buf.capacity() < size + n) return nullptr;
+  buf.try_resize(size + n);
+  return buf.data() + size;
+}
+
+// Maps the iterator produced by reserve() back to the original iterator kind.
+template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
+inline auto base_iterator(std::back_insert_iterator<Container>& it,
+                          checked_ptr<typename Container::value_type>)
+    -> std::back_insert_iterator<Container> {
+  return it;
+}
+
+template <typename Iterator>
+constexpr auto base_iterator(Iterator, Iterator it) -> Iterator {
+  return it;
+}
+
+// <algorithm> is spectacularly slow to compile in C++20 so use a simple fill_n
+// instead (#1998).
+template <typename OutputIt, typename Size, typename T>
+FMT_CONSTEXPR auto fill_n(OutputIt out, Size count, const T& value)
+    -> OutputIt {
+  for (Size i = 0; i < count; ++i) *out++ = value;
+  return out;
+}
+// Pointer overload: uses memset at run time and the generic loop during
+// constant evaluation.
+template <typename T, typename Size>
+FMT_CONSTEXPR20 auto fill_n(T* out, Size count, char value) -> T* {
+  if (is_constant_evaluated()) {
+    return fill_n<T*, Size, T>(out, count, value);
+  }
+  std::memset(out, value, to_unsigned(count));
+  return out + count;
+}
+
+#ifdef __cpp_char8_t
+using char8_type = char8_t;
+#else
+enum char8_type : unsigned char {};
+#endif
+
+// Same as copy_str but marked FMT_NOINLINE.
+template <typename OutChar, typename InputIt, typename OutputIt>
+FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
+                                                  OutputIt out) -> OutputIt {
+  return copy_str<OutChar>(begin, end, out);
+}
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from s, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
+    -> const char* {
+  constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  constexpr const int shiftc[] = {0, 18, 12, 6, 0};
+  constexpr const int shifte[] = {0, 6, 4, 2, 0};
+
+  int len = code_point_length_impl(*s);
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  const char* next = s + len + !len;
+
+  using uchar = unsigned char;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  *c = uint32_t(uchar(s[0]) & masks[len]) << 18;
+  *c |= uint32_t(uchar(s[1]) & 0x3f) << 12;
+  *c |= uint32_t(uchar(s[2]) & 0x3f) << 6;
+  *c |= uint32_t(uchar(s[3]) & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (uchar(s[1]) & 0xc0) >> 2;
+  *e |= (uchar(s[2]) & 0xc0) >> 4;
+  *e |= uchar(s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
+
+constexpr uint32_t invalid_code_point = ~uint32_t();
+
+// Invokes f(cp, sv) for every code point cp in s with sv being the string view
+// corresponding to the code point. cp is invalid_code_point on error.
+template <typename F>
+FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
+  // Decodes one code point at buf_ptr and reports it to f; returns where to
+  // continue, or nullptr when f asks to stop.
+  auto decode = [f](const char* buf_ptr, const char* ptr) {
+    auto cp = uint32_t();
+    auto error = 0;
+    auto end = utf8_decode(buf_ptr, &cp, &error);
+    bool result = f(error ? invalid_code_point : cp,
+                    string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr)));
+    return result ? (error ? buf_ptr + 1 : end) : nullptr;
+  };
+  auto p = s.data();
+  const size_t block_size = 4;  // utf8_decode always reads blocks of 4 chars.
+  if (s.size() >= block_size) {
+    for (auto end = p + s.size() - block_size + 1; p < end;) {
+      p = decode(p, p);
+      if (!p) return;
+    }
+  }
+  // Decode the tail from a zero-padded copy so that utf8_decode never reads
+  // past the end of s.
+  if (auto num_chars_left = s.data() + s.size() - p) {
+    char buf[2 * block_size - 1] = {};
+    copy_str<char>(p, p + num_chars_left, buf);
+    const char* buf_ptr = buf;
+    do {
+      auto end = decode(buf_ptr, p);
+      if (!end) return;
+      p += end - buf_ptr;
+      buf_ptr = end;
+    } while (buf_ptr - buf < num_chars_left);
+  }
+}
+
+template <typename Char>
+inline auto compute_width(basic_string_view<Char> s) -> size_t {
+  return s.size();
+}
+
+// Computes approximate display width of a UTF-8 string.
+FMT_CONSTEXPR inline size_t compute_width(string_view s) {
+  size_t num_code_points = 0;
+  // It is not a lambda for compatibility with C++14.
+  struct count_code_points {
+    size_t* count;
+    FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool {
+      *count += detail::to_unsigned(
+          1 +
+          (cp >= 0x1100 &&
+           (cp <= 0x115f ||  // Hangul Jamo init. consonants
+            cp == 0x2329 ||  // LEFT-POINTING ANGLE BRACKET
+            cp == 0x232a ||  // RIGHT-POINTING ANGLE BRACKET
+            // CJK ... Yi except IDEOGRAPHIC HALF FILL SPACE:
+            (cp >= 0x2e80 && cp <= 0xa4cf && cp != 0x303f) ||
+            (cp >= 0xac00 && cp <= 0xd7a3) ||    // Hangul Syllables
+            (cp >= 0xf900 && cp <= 0xfaff) ||    // CJK Compatibility Ideographs
+            (cp >= 0xfe10 && cp <= 0xfe19) ||    // Vertical Forms
+            (cp >= 0xfe30 && cp <= 0xfe6f) ||    // CJK Compatibility Forms
+            (cp >= 0xff00 && cp <= 0xff60) ||    // Fullwidth Forms
+            (cp >= 0xffe0 && cp <= 0xffe6) ||    // Fullwidth Forms
+            (cp >= 0x20000 && cp <= 0x2fffd) ||  // CJK
+            (cp >= 0x30000 && cp <= 0x3fffd) ||
+            // Miscellaneous Symbols and Pictographs + Emoticons:
+            (cp >= 0x1f300 && cp <= 0x1f64f) ||
+            // Supplemental Symbols and Pictographs:
+            (cp >= 0x1f900 && cp <= 0x1f9ff))));
+      return true;
+    }
+  };
+  // We could avoid branches by using utf8_decode directly.
+  for_each_codepoint(s, count_code_points{&num_code_points});
+  return num_code_points;
+}
+
+inline auto compute_width(basic_string_view<char8_type> s) -> size_t {
+  return compute_width(
+      string_view(reinterpret_cast<const char*>(s.data()), s.size()));
+}
+
+template <typename Char>
+inline auto code_point_index(basic_string_view<Char> s, size_t n) -> size_t {
+  size_t size = s.size();
+  return n < size ? n : size;
+}
+
+// Calculates the index of the nth code point in a UTF-8 string.
+inline auto code_point_index(string_view s, size_t n) -> size_t {
+  const char* data = s.data();
+  size_t num_code_points = 0;
+  for (size_t i = 0, size = s.size(); i != size; ++i) {
+    // Bytes of the form 10xxxxxx are continuation bytes and start nothing.
+    if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i;
+  }
+  return s.size();
+}
+
+inline auto code_point_index(basic_string_view<char8_type> s, size_t n)
+    -> size_t {
+  return code_point_index(
+      string_view(reinterpret_cast<const char*>(s.data()), s.size()), n);
+}
+
+template <typename T> struct is_integral : std::is_integral<T> {};
+template <> struct is_integral<int128_opt> : std::true_type {};
+template <> struct is_integral<uint128_t> : std::true_type {};
+
+template <typename T>
+using is_signed =
+    std::integral_constant<bool, std::numeric_limits<T>::is_signed ||
+                                     std::is_same<T, int128_opt>::value>;
+
+template <typename T>
+using is_integer =
+    bool_constant<is_integral<T>::value && !std::is_same<T, bool>::value &&
+                  !std::is_same<T, char>::value &&
+                  !std::is_same<T, wchar_t>::value>;
+
+#ifndef FMT_USE_FLOAT128
+#  ifdef __SIZEOF_FLOAT128__
+#    define FMT_USE_FLOAT128 1
+#  else
+#    define FMT_USE_FLOAT128 0
+#  endif
+#endif
+#if FMT_USE_FLOAT128
+using float128 = __float128;
+#else
+using float128 = void;
+#endif
+template <typename T> using is_float128 = std::is_same<T, float128>;
+
+template <typename T>
+using is_floating_point =
+    bool_constant<std::is_floating_point<T>::value || is_float128<T>::value>;
+
+template <typename T, bool = std::is_floating_point<T>::value>
+struct is_fast_float : bool_constant<std::numeric_limits<T>::is_iec559 &&
+                                     sizeof(T) <= sizeof(double)> {};
+template <typename T> struct is_fast_float<T, false> : std::false_type {};
+
+template <typename T>
+using is_double_double = bool_constant<std::numeric_limits<T>::digits == 106>;
+
+#ifndef FMT_USE_FULL_CACHE_DRAGONBOX
+#  define FMT_USE_FULL_CACHE_DRAGONBOX 0
+#endif
+
+// Appends [begin, end) to the buffer, copying in pieces no larger than the
+// free capacity available after each try_reserve call.
+template <typename T>
+template <typename U>
+void buffer<T>::append(const U* begin, const U* end) {
+  while (begin != end) {
+    auto count = to_unsigned(end - begin);
+    try_reserve(size_ + count);
+    auto free_cap = capacity_ - size_;
+    if (free_cap < count) count = free_cap;
+    std::uninitialized_copy_n(begin, count, make_checked(ptr_ + size_, count));
+    size_ += count;
+    begin += count;
+  }
+}
+
+template <typename T, typename Enable = void>
+struct is_locale : std::false_type {};
+template <typename T>
+struct is_locale<T, void_t<decltype(T::classic())>> : std::true_type {};
+}  // namespace detail
+
+FMT_MODULE_EXPORT_BEGIN
+
+// The number of characters to store in the basic_memory_buffer object itself
+// to avoid dynamic memory allocation.
+enum { inline_buffer_size = 500 };
+
+/**
+  \rst
+  A dynamically growing memory buffer for trivially copyable/constructible types
+  with the first ``SIZE`` elements stored in the object itself.
+
+  You can use the ``memory_buffer`` type alias for ``char`` instead.
+
+  **Example**::
+
+     auto out = fmt::memory_buffer();
+     format_to(std::back_inserter(out), "The answer is {}.", 42);
+
+  This will append the following output to the ``out`` object:
+
+  .. code-block:: none
+
+     The answer is 42.
+
+  The output can be converted to an ``std::string`` with ``to_string(out)``.
+  \endrst
+ */
+template <typename T, size_t SIZE = inline_buffer_size,
+          typename Allocator = std::allocator<T>>
+class basic_memory_buffer final : public detail::buffer<T> {
+ private:
+  T store_[SIZE];
+
+  // Don't inherit from Allocator to avoid generating type_info for it.
+  Allocator alloc_;
+
+  // Deallocate memory allocated by the buffer.
+  FMT_CONSTEXPR20 void deallocate() {
+    T* data = this->data();
+    if (data != store_) alloc_.deallocate(data, this->capacity());
+  }
+
+ protected:
+  FMT_CONSTEXPR20 void grow(size_t size) override;
+
+ public:
+  using value_type = T;
+  using const_reference = const T&;
+
+  FMT_CONSTEXPR20 explicit basic_memory_buffer(
+      const Allocator& alloc = Allocator())
+      : alloc_(alloc) {
+    this->set(store_, SIZE);
+    if (detail::is_constant_evaluated()) detail::fill_n(store_, SIZE, T());
+  }
+  FMT_CONSTEXPR20 ~basic_memory_buffer() { deallocate(); }
+
+ private:
+  // Move data from other to this buffer.
+  FMT_CONSTEXPR20 void move(basic_memory_buffer& other) {
+    alloc_ = std::move(other.alloc_);
+    T* data = other.data();
+    size_t size = other.size(), capacity = other.capacity();
+    if (data == other.store_) {
+      this->set(store_, capacity);
+      detail::copy_str<T>(other.store_, other.store_ + size,
+                          detail::make_checked(store_, capacity));
+    } else {
+      this->set(data, capacity);
+      // Set pointer to the inline array so that delete is not called
+      // when deallocating.
+      other.set(other.store_, 0);
+      other.clear();
+    }
+    this->resize(size);
+  }
+
+ public:
+  /**
+    \rst
+    Constructs a :class:`fmt::basic_memory_buffer` object moving the content
+    of the other object to it.
+    \endrst
+   */
+  FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer&& other) noexcept {
+    move(other);
+  }
+
+  /**
+    \rst
+    Moves the content of the other ``basic_memory_buffer`` object to this one.
+    \endrst
+   */
+  auto operator=(basic_memory_buffer&& other) noexcept -> basic_memory_buffer& {
+    FMT_ASSERT(this != &other, "");
+    deallocate();
+    move(other);
+    return *this;
+  }
+
+  // Returns a copy of the allocator associated with this buffer.
+  auto get_allocator() const -> Allocator { return alloc_; }
+
+  /**
+    Resizes the buffer to contain *count* elements. If T is a POD type new
+    elements may not be initialized.
+   */
+  FMT_CONSTEXPR20 void resize(size_t count) { this->try_resize(count); }
+
+  /** Increases the buffer capacity to *new_capacity*. */
+  void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
+
+  // Directly append data into the buffer
+  using detail::buffer<T>::append;
+  template <typename ContiguousRange>
+  void append(const ContiguousRange& range) {
+    append(range.data(), range.data() + range.size());
+  }
+};
+
+// Grows the storage to hold at least size elements: capacity increases by
+// half, or to size when that is larger, bounded by the allocator's max_size.
+template <typename T, size_t SIZE, typename Allocator>
+FMT_CONSTEXPR20 void basic_memory_buffer<T, SIZE, Allocator>::grow(
+    size_t size) {
+  detail::abort_fuzzing_if(size > 5000);
+  const size_t max_size = std::allocator_traits<Allocator>::max_size(alloc_);
+  size_t old_capacity = this->capacity();
+  size_t new_capacity = old_capacity + old_capacity / 2;
+  if (size > new_capacity)
+    new_capacity = size;
+  else if (new_capacity > max_size)
+    new_capacity = size > max_size ? size : max_size;
+  T* old_data = this->data();
+  T* new_data =
+      std::allocator_traits<Allocator>::allocate(alloc_, new_capacity);
+  // The following code doesn't throw, so the raw pointer above doesn't leak.
+  std::uninitialized_copy(old_data, old_data + this->size(),
+                          detail::make_checked(new_data, new_capacity));
+  this->set(new_data, new_capacity);
+  // deallocate must not throw according to the standard, but even if it does,
+  // the buffer already uses the new storage and will deallocate it in
+  // destructor.
+  if (old_data != store_) alloc_.deallocate(old_data, old_capacity);
+}
+
+using memory_buffer = basic_memory_buffer<char>;
+
+template <typename T, size_t SIZE, typename Allocator>
+struct is_contiguous<basic_memory_buffer<T, SIZE, Allocator>> : std::true_type {
+};
+
+namespace detail {
+#ifdef _WIN32
+FMT_API bool write_console(std::FILE* f, string_view text);
+#endif
+FMT_API void print(std::FILE*, string_view);
+}  // namespace detail
+
+/** An error reported from a formatting function. */
+FMT_CLASS_API
+class FMT_API format_error : public std::runtime_error {
+ public:
+  using std::runtime_error::runtime_error;
+  format_error(const format_error&) = default;
+  format_error& operator=(const format_error&) = default;
+  format_error(format_error&&) = default;
+  format_error& operator=(format_error&&) = default;
+  ~format_error() noexcept override FMT_MSC_DEFAULT;
+};
+
+namespace detail_exported {
+#if FMT_USE_NONTYPE_TEMPLATE_ARGS
+// A character array usable as a non-type template argument; copies all N
+// characters of the source array.
+template <typename Char, size_t N> struct fixed_string {
+  constexpr fixed_string(const Char (&str)[N]) {
+    detail::copy_str<Char, const Char*, Char*>(static_cast<const Char*>(str),
+                                               str + N, data);
+  }
+  Char data[N] = {};
+};
+#endif
+
+// Converts a compile-time string to basic_string_view.
+template <typename Char, size_t N>
+constexpr auto compile_string_to_view(const Char (&s)[N])
+    -> basic_string_view<Char> {
+  // Remove trailing NUL character if needed. Won't be present if this is used
+  // with a raw character array (i.e. not defined as a string).
+  return {s, N - (std::char_traits<Char>::to_int_type(s[N - 1]) == 0 ? 1 : 0)};
+}
+template <typename Char>
+constexpr auto compile_string_to_view(detail::std_string_view<Char> s)
+    -> basic_string_view<Char> {
+  return {s.data(), s.size()};
+}
+}  // namespace detail_exported
+
+// Holds a value as a format argument and exposes it through visit().
+class loc_value {
+ private:
+  basic_format_arg<format_context> value_;
+
+ public:
+  template <typename T, FMT_ENABLE_IF(!detail::is_float128<T>::value)>
+  loc_value(T value) : value_(detail::make_arg<format_context>(value)) {}
+
+  // float128 values are accepted but not stored.
+  template <typename T, FMT_ENABLE_IF(detail::is_float128<T>::value)>
+  loc_value(T) {}
+
+  template <typename Visitor> auto visit(Visitor&& vis) -> decltype(vis(0)) {
+    return visit_format_arg(vis, value_);
+  }
+};
+
+// A locale facet that formats values in UTF-8.
+// It is parameterized on the locale to avoid the heavy <locale> include.
+template class format_facet : public Locale::facet { + private: + std::string separator_; + std::string grouping_; + std::string decimal_point_; + + protected: + virtual auto do_put(appender out, loc_value val, + const format_specs& specs) const -> bool; + + public: + static FMT_API typename Locale::id id; + + explicit format_facet(Locale& loc); + explicit format_facet(string_view sep = "", + std::initializer_list g = {3}, + std::string decimal_point = ".") + : separator_(sep.data(), sep.size()), + grouping_(g.begin(), g.end()), + decimal_point_(decimal_point) {} + + auto put(appender out, loc_value val, const format_specs& specs) const + -> bool { + return do_put(out, val, specs); + } +}; + +FMT_BEGIN_DETAIL_NAMESPACE + +// Returns true if value is negative, false otherwise. +// Same as `value < 0` but doesn't produce warnings if T is an unsigned type. +template ::value)> +constexpr auto is_negative(T value) -> bool { + return value < 0; +} +template ::value)> +constexpr auto is_negative(T) -> bool { + return false; +} + +template +FMT_CONSTEXPR auto is_supported_floating_point(T) -> bool { + if (std::is_same()) return FMT_USE_FLOAT; + if (std::is_same()) return FMT_USE_DOUBLE; + if (std::is_same()) return FMT_USE_LONG_DOUBLE; + return true; +} + +// Smallest of uint32_t, uint64_t, uint128_t that is large enough to +// represent all values of an integral type T. +template +using uint32_or_64_or_128_t = + conditional_t() <= 32 && !FMT_REDUCE_INT_INSTANTIATIONS, + uint32_t, + conditional_t() <= 64, uint64_t, uint128_t>>; +template +using uint64_or_128_t = conditional_t() <= 64, uint64_t, uint128_t>; + +#define FMT_POWERS_OF_10(factor) \ + factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \ + (factor)*1000000, (factor)*10000000, (factor)*100000000, \ + (factor)*1000000000 + +// Converts value in the range [0, 100) to a string. +constexpr const char* digits2(size_t value) { + // GCC generates slightly better code when value is pointer-size. 
+ return &"0001020304050607080910111213141516171819" + "2021222324252627282930313233343536373839" + "4041424344454647484950515253545556575859" + "6061626364656667686970717273747576777879" + "8081828384858687888990919293949596979899"[value * 2]; +} + +// Sign is a template parameter to workaround a bug in gcc 4.8. +template constexpr Char sign(Sign s) { +#if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604 + static_assert(std::is_same::value, ""); +#endif + return static_cast("\0-+ "[s]); +} + +template FMT_CONSTEXPR auto count_digits_fallback(T n) -> int { + int count = 1; + for (;;) { + // Integer division is slow so do it for a group of four digits instead + // of for every digit. The idea comes from the talk by Alexandrescu + // "Three Optimization Tips for C++". See speed-test for a comparison. + if (n < 10) return count; + if (n < 100) return count + 1; + if (n < 1000) return count + 2; + if (n < 10000) return count + 3; + n /= 10000u; + count += 4; + } +} +#if FMT_USE_INT128 +FMT_CONSTEXPR inline auto count_digits(uint128_opt n) -> int { + return count_digits_fallback(n); +} +#endif + +#ifdef FMT_BUILTIN_CLZLL +// It is a separate function rather than a part of count_digits to workaround +// the lack of static constexpr in constexpr functions. +inline auto do_count_digits(uint64_t n) -> int { + // This has comparable performance to the version by Kendall Willets + // (https://github.com/fmtlib/format-benchmark/blob/master/digits10) + // but uses smaller tables. + // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)). 
+ static constexpr uint8_t bsr2log10[] = { + 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, + 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, + 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, + 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; + auto t = bsr2log10[FMT_BUILTIN_CLZLL(n | 1) ^ 63]; + static constexpr const uint64_t zero_or_powers_of_10[] = { + 0, 0, FMT_POWERS_OF_10(1U), FMT_POWERS_OF_10(1000000000ULL), + 10000000000000000000ULL}; + return t - (n < zero_or_powers_of_10[t]); +} +#endif + +// Returns the number of decimal digits in n. Leading zeros are not counted +// except for n == 0 in which case count_digits returns 1. +FMT_CONSTEXPR20 inline auto count_digits(uint64_t n) -> int { +#ifdef FMT_BUILTIN_CLZLL + if (!is_constant_evaluated()) { + return do_count_digits(n); + } +#endif + return count_digits_fallback(n); +} + +// Counts the number of digits in n. BITS = log2(radix). +template +FMT_CONSTEXPR auto count_digits(UInt n) -> int { +#ifdef FMT_BUILTIN_CLZ + if (!is_constant_evaluated() && num_bits() == 32) + return (FMT_BUILTIN_CLZ(static_cast(n) | 1) ^ 31) / BITS + 1; +#endif + // Lambda avoids unreachable code warnings from NVHPC. + return [](UInt m) { + int num_digits = 0; + do { + ++num_digits; + } while ((m >>= BITS) != 0); + return num_digits; + }(n); +} + +#ifdef FMT_BUILTIN_CLZ +// It is a separate function rather than a part of count_digits to workaround +// the lack of static constexpr in constexpr functions. +FMT_INLINE auto do_count_digits(uint32_t n) -> int { +// An optimization by Kendall Willets from https://bit.ly/3uOIQrB. +// This increments the upper 32 bits (log10(T) - 1) when >= T is added. 
+# define FMT_INC(T) (((sizeof(# T) - 1ull) << 32) - T) + static constexpr uint64_t table[] = { + FMT_INC(0), FMT_INC(0), FMT_INC(0), // 8 + FMT_INC(10), FMT_INC(10), FMT_INC(10), // 64 + FMT_INC(100), FMT_INC(100), FMT_INC(100), // 512 + FMT_INC(1000), FMT_INC(1000), FMT_INC(1000), // 4096 + FMT_INC(10000), FMT_INC(10000), FMT_INC(10000), // 32k + FMT_INC(100000), FMT_INC(100000), FMT_INC(100000), // 256k + FMT_INC(1000000), FMT_INC(1000000), FMT_INC(1000000), // 2048k + FMT_INC(10000000), FMT_INC(10000000), FMT_INC(10000000), // 16M + FMT_INC(100000000), FMT_INC(100000000), FMT_INC(100000000), // 128M + FMT_INC(1000000000), FMT_INC(1000000000), FMT_INC(1000000000), // 1024M + FMT_INC(1000000000), FMT_INC(1000000000) // 4B + }; + auto inc = table[FMT_BUILTIN_CLZ(n | 1) ^ 31]; + return static_cast((n + inc) >> 32); +} +#endif + +// Optional version of count_digits for better performance on 32-bit platforms. +FMT_CONSTEXPR20 inline auto count_digits(uint32_t n) -> int { +#ifdef FMT_BUILTIN_CLZ + if (!is_constant_evaluated()) { + return do_count_digits(n); + } +#endif + return count_digits_fallback(n); +} + +template constexpr auto digits10() noexcept -> int { + return std::numeric_limits::digits10; +} +template <> constexpr auto digits10() noexcept -> int { return 38; } +template <> constexpr auto digits10() noexcept -> int { return 38; } + +template struct thousands_sep_result { + std::string grouping; + Char thousands_sep; +}; + +template +FMT_API auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result; +template +inline auto thousands_sep(locale_ref loc) -> thousands_sep_result { + auto result = thousands_sep_impl(loc); + return {result.grouping, Char(result.thousands_sep)}; +} +template <> +inline auto thousands_sep(locale_ref loc) -> thousands_sep_result { + return thousands_sep_impl(loc); +} + +template +FMT_API auto decimal_point_impl(locale_ref loc) -> Char; +template inline auto decimal_point(locale_ref loc) -> Char { + return 
Char(decimal_point_impl(loc)); +} +template <> inline auto decimal_point(locale_ref loc) -> wchar_t { + return decimal_point_impl(loc); +} + +// Compares two characters for equality. +template auto equal2(const Char* lhs, const char* rhs) -> bool { + return lhs[0] == Char(rhs[0]) && lhs[1] == Char(rhs[1]); +} +inline auto equal2(const char* lhs, const char* rhs) -> bool { + return memcmp(lhs, rhs, 2) == 0; +} + +// Copies two characters from src to dst. +template +FMT_CONSTEXPR20 FMT_INLINE void copy2(Char* dst, const char* src) { + if (!is_constant_evaluated() && sizeof(Char) == sizeof(char)) { + memcpy(dst, src, 2); + return; + } + *dst++ = static_cast(*src++); + *dst = static_cast(*src); +} + +template struct format_decimal_result { + Iterator begin; + Iterator end; +}; + +// Formats a decimal unsigned integer value writing into out pointing to a +// buffer of specified size. The caller must ensure that the buffer is large +// enough. +template +FMT_CONSTEXPR20 auto format_decimal(Char* out, UInt value, int size) + -> format_decimal_result { + FMT_ASSERT(size >= count_digits(value), "invalid digit count"); + out += size; + Char* end = out; + while (value >= 100) { + // Integer division is slow so do it for a group of two digits instead + // of for every digit. The idea comes from the talk by Alexandrescu + // "Three Optimization Tips for C++". See speed-test for a comparison. + out -= 2; + copy2(out, digits2(static_cast(value % 100))); + value /= 100; + } + if (value < 10) { + *--out = static_cast('0' + value); + return {out, end}; + } + out -= 2; + copy2(out, digits2(static_cast(value))); + return {out, end}; +} + +template >::value)> +FMT_CONSTEXPR inline auto format_decimal(Iterator out, UInt value, int size) + -> format_decimal_result { + // Buffer is large enough to hold all digits (digits10 + 1). 
+ Char buffer[digits10() + 1] = {}; + auto end = format_decimal(buffer, value, size).end; + return {out, detail::copy_str_noinline(buffer, end, out)}; +} + +template +FMT_CONSTEXPR auto format_uint(Char* buffer, UInt value, int num_digits, + bool upper = false) -> Char* { + buffer += num_digits; + Char* end = buffer; + do { + const char* digits = upper ? "0123456789ABCDEF" : "0123456789abcdef"; + unsigned digit = static_cast(value & ((1 << BASE_BITS) - 1)); + *--buffer = static_cast(BASE_BITS < 4 ? static_cast('0' + digit) + : digits[digit]); + } while ((value >>= BASE_BITS) != 0); + return end; +} + +template +inline auto format_uint(It out, UInt value, int num_digits, bool upper = false) + -> It { + if (auto ptr = to_pointer(out, to_unsigned(num_digits))) { + format_uint(ptr, value, num_digits, upper); + return out; + } + // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1). + char buffer[num_bits() / BASE_BITS + 1]; + format_uint(buffer, value, num_digits, upper); + return detail::copy_str_noinline(buffer, buffer + num_digits, out); +} + +// A converter from UTF-8 to UTF-16. +class utf8_to_utf16 { + private: + basic_memory_buffer buffer_; + + public: + FMT_API explicit utf8_to_utf16(string_view s); + operator basic_string_view() const { return {&buffer_[0], size()}; } + auto size() const -> size_t { return buffer_.size() - 1; } + auto c_str() const -> const wchar_t* { return &buffer_[0]; } + auto str() const -> std::wstring { return {&buffer_[0], size()}; } +}; + +namespace dragonbox { + +// Type-specific information that Dragonbox uses. 
+template struct float_info; + +template <> struct float_info { + using carrier_uint = uint32_t; + static const int exponent_bits = 8; + static const int kappa = 1; + static const int big_divisor = 100; + static const int small_divisor = 10; + static const int min_k = -31; + static const int max_k = 46; + static const int shorter_interval_tie_lower_threshold = -35; + static const int shorter_interval_tie_upper_threshold = -35; +}; + +template <> struct float_info { + using carrier_uint = uint64_t; + static const int exponent_bits = 11; + static const int kappa = 2; + static const int big_divisor = 1000; + static const int small_divisor = 100; + static const int min_k = -292; + static const int max_k = 326; + static const int shorter_interval_tie_lower_threshold = -77; + static const int shorter_interval_tie_upper_threshold = -77; +}; + +// An 80- or 128-bit floating point number. +template +struct float_info::digits == 64 || + std::numeric_limits::digits == 113 || + is_float128::value>> { + using carrier_uint = detail::uint128_t; + static const int exponent_bits = 15; +}; + +// A double-double floating point number. +template +struct float_info::value>> { + using carrier_uint = detail::uint128_t; +}; + +template struct decimal_fp { + using significand_type = typename float_info::carrier_uint; + significand_type significand; + int exponent; +}; + +template FMT_API auto to_decimal(T x) noexcept -> decimal_fp; +} // namespace dragonbox + +// Returns true iff Float has the implicit bit which is not stored. +template constexpr bool has_implicit_bit() { + // An 80-bit FP number has a 64-bit significand an no implicit bit. + return std::numeric_limits::digits != 64; +} + +// Returns the number of significand bits stored in Float. The implicit bit is +// not counted since it is not stored. +template constexpr int num_significand_bits() { + // std::numeric_limits may not support __float128. + return is_float128() ? 
112 + : (std::numeric_limits::digits - + (has_implicit_bit() ? 1 : 0)); +} + +template +constexpr auto exponent_mask() -> + typename dragonbox::float_info::carrier_uint { + using uint = typename dragonbox::float_info::carrier_uint; + return ((uint(1) << dragonbox::float_info::exponent_bits) - 1) + << num_significand_bits(); +} +template constexpr auto exponent_bias() -> int { + // std::numeric_limits may not support __float128. + return is_float128() ? 16383 + : std::numeric_limits::max_exponent - 1; +} + +// Writes the exponent exp in the form "[+-]d{2,3}" to buffer. +template +FMT_CONSTEXPR auto write_exponent(int exp, It it) -> It { + FMT_ASSERT(-10000 < exp && exp < 10000, "exponent out of range"); + if (exp < 0) { + *it++ = static_cast('-'); + exp = -exp; + } else { + *it++ = static_cast('+'); + } + if (exp >= 100) { + const char* top = digits2(to_unsigned(exp / 100)); + if (exp >= 1000) *it++ = static_cast(top[0]); + *it++ = static_cast(top[1]); + exp %= 100; + } + const char* d = digits2(to_unsigned(exp)); + *it++ = static_cast(d[0]); + *it++ = static_cast(d[1]); + return it; +} + +// A floating-point number f * pow(2, e) where F is an unsigned type. +template struct basic_fp { + F f; + int e; + + static constexpr const int num_significand_bits = + static_cast(sizeof(F) * num_bits()); + + constexpr basic_fp() : f(0), e(0) {} + constexpr basic_fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {} + + // Constructs fp from an IEEE754 floating-point number. + template FMT_CONSTEXPR basic_fp(Float n) { assign(n); } + + // Assigns n to this and return true iff predecessor is closer than successor. + template ::value)> + FMT_CONSTEXPR auto assign(Float n) -> bool { + static_assert(std::numeric_limits::digits <= 113, "unsupported FP"); + // Assume Float is in the format [sign][exponent][significand]. 
+ using carrier_uint = typename dragonbox::float_info::carrier_uint; + const auto num_float_significand_bits = + detail::num_significand_bits(); + const auto implicit_bit = carrier_uint(1) << num_float_significand_bits; + const auto significand_mask = implicit_bit - 1; + auto u = bit_cast(n); + f = static_cast(u & significand_mask); + auto biased_e = static_cast((u & exponent_mask()) >> + num_float_significand_bits); + // The predecessor is closer if n is a normalized power of 2 (f == 0) + // other than the smallest normalized number (biased_e > 1). + auto is_predecessor_closer = f == 0 && biased_e > 1; + if (biased_e == 0) + biased_e = 1; // Subnormals use biased exponent 1 (min exponent). + else if (has_implicit_bit()) + f += static_cast(implicit_bit); + e = biased_e - exponent_bias() - num_float_significand_bits; + if (!has_implicit_bit()) ++e; + return is_predecessor_closer; + } + + template ::value)> + FMT_CONSTEXPR auto assign(Float n) -> bool { + static_assert(std::numeric_limits::is_iec559, "unsupported FP"); + return assign(static_cast(n)); + } +}; + +using fp = basic_fp; + +// Normalizes the value converted from double and multiplied by (1 << SHIFT). +template +FMT_CONSTEXPR basic_fp normalize(basic_fp value) { + // Handle subnormals. + const auto implicit_bit = F(1) << num_significand_bits(); + const auto shifted_implicit_bit = implicit_bit << SHIFT; + while ((value.f & shifted_implicit_bit) == 0) { + value.f <<= 1; + --value.e; + } + // Subtract 1 to account for hidden bit. + const auto offset = basic_fp::num_significand_bits - + num_significand_bits() - SHIFT - 1; + value.f <<= offset; + value.e -= offset; + return value; +} + +// Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking. +FMT_CONSTEXPR inline uint64_t multiply(uint64_t lhs, uint64_t rhs) { +#if FMT_USE_INT128 + auto product = static_cast<__uint128_t>(lhs) * rhs; + auto f = static_cast(product >> 64); + return (static_cast(product) & (1ULL << 63)) != 0 ? 
f + 1 : f; +#else + // Multiply 32-bit parts of significands. + uint64_t mask = (1ULL << 32) - 1; + uint64_t a = lhs >> 32, b = lhs & mask; + uint64_t c = rhs >> 32, d = rhs & mask; + uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d; + // Compute mid 64-bit of result and round. + uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31); + return ac + (ad >> 32) + (bc >> 32) + (mid >> 32); +#endif +} + +FMT_CONSTEXPR inline fp operator*(fp x, fp y) { + return {multiply(x.f, y.f), x.e + y.e + 64}; +} + +template struct basic_data { + // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340. + // These are generated by support/compute-powers.py. + static constexpr uint64_t pow10_significands[87] = { + 0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76, + 0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df, + 0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c, + 0x8dd01fad907ffc3c, 0xd3515c2831559a83, 0x9d71ac8fada6c9b5, + 0xea9c227723ee8bcb, 0xaecc49914078536d, 0x823c12795db6ce57, + 0xc21094364dfb5637, 0x9096ea6f3848984f, 0xd77485cb25823ac7, + 0xa086cfcd97bf97f4, 0xef340a98172aace5, 0xb23867fb2a35b28e, + 0x84c8d4dfd2c63f3b, 0xc5dd44271ad3cdba, 0x936b9fcebb25c996, + 0xdbac6c247d62a584, 0xa3ab66580d5fdaf6, 0xf3e2f893dec3f126, + 0xb5b5ada8aaff80b8, 0x87625f056c7c4a8b, 0xc9bcff6034c13053, + 0x964e858c91ba2655, 0xdff9772470297ebd, 0xa6dfbd9fb8e5b88f, + 0xf8a95fcf88747d94, 0xb94470938fa89bcf, 0x8a08f0f8bf0f156b, + 0xcdb02555653131b6, 0x993fe2c6d07b7fac, 0xe45c10c42a2b3b06, + 0xaa242499697392d3, 0xfd87b5f28300ca0e, 0xbce5086492111aeb, + 0x8cbccc096f5088cc, 0xd1b71758e219652c, 0x9c40000000000000, + 0xe8d4a51000000000, 0xad78ebc5ac620000, 0x813f3978f8940984, + 0xc097ce7bc90715b3, 0x8f7e32ce7bea5c70, 0xd5d238a4abe98068, + 0x9f4f2726179a2245, 0xed63a231d4c4fb27, 0xb0de65388cc8ada8, + 0x83c7088e1aab65db, 0xc45d1df942711d9a, 0x924d692ca61be758, + 0xda01ee641a708dea, 0xa26da3999aef774a, 0xf209787bb47d6b85, + 
0xb454e4a179dd1877, 0x865b86925b9bc5c2, 0xc83553c5c8965d3d, + 0x952ab45cfa97a0b3, 0xde469fbd99a05fe3, 0xa59bc234db398c25, + 0xf6c69a72a3989f5c, 0xb7dcbf5354e9bece, 0x88fcf317f22241e2, + 0xcc20ce9bd35c78a5, 0x98165af37b2153df, 0xe2a0b5dc971f303a, + 0xa8d9d1535ce3b396, 0xfb9b7cd9a4a7443c, 0xbb764c4ca7a44410, + 0x8bab8eefb6409c1a, 0xd01fef10a657842c, 0x9b10a4e5e9913129, + 0xe7109bfba19c0c9d, 0xac2820d9623bf429, 0x80444b5e7aa7cf85, + 0xbf21e44003acdd2d, 0x8e679c2f5e44ff8f, 0xd433179d9c8cb841, + 0x9e19db92b4e31ba9, 0xeb96bf6ebadf77d9, 0xaf87023b9bf0ee6b, + }; + +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wnarrowing" +#endif + // Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding + // to significands above. + static constexpr int16_t pow10_exponents[87] = { + -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954, + -927, -901, -874, -847, -821, -794, -768, -741, -715, -688, -661, + -635, -608, -582, -555, -529, -502, -475, -449, -422, -396, -369, + -343, -316, -289, -263, -236, -210, -183, -157, -130, -103, -77, + -50, -24, 3, 30, 56, 83, 109, 136, 162, 189, 216, + 242, 269, 295, 322, 348, 375, 402, 428, 455, 481, 508, + 534, 561, 588, 614, 641, 667, 694, 720, 747, 774, 800, + 827, 853, 880, 907, 933, 960, 986, 1013, 1039, 1066}; +#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 +# pragma GCC diagnostic pop +#endif + + static constexpr uint64_t power_of_10_64[20] = { + 1, FMT_POWERS_OF_10(1ULL), FMT_POWERS_OF_10(1000000000ULL), + 10000000000000000000ULL}; +}; + +#if FMT_CPLUSPLUS < 201703L +template constexpr uint64_t basic_data::pow10_significands[]; +template constexpr int16_t basic_data::pow10_exponents[]; +template constexpr uint64_t basic_data::power_of_10_64[]; +#endif + +// This is a struct rather than an alias to avoid shadowing warnings in gcc. 
+struct data : basic_data<> {}; + +// Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its +// (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`. +FMT_CONSTEXPR inline fp get_cached_power(int min_exponent, + int& pow10_exponent) { + const int shift = 32; + // log10(2) = 0x0.4d104d427de7fbcc... + const int64_t significand = 0x4d104d427de7fbcc; + int index = static_cast( + ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) + + ((int64_t(1) << shift) - 1)) // ceil + >> 32 // arithmetic shift + ); + // Decimal exponent of the first (smallest) cached power of 10. + const int first_dec_exp = -348; + // Difference between 2 consecutive decimal exponents in cached powers of 10. + const int dec_exp_step = 8; + index = (index - first_dec_exp - 1) / dec_exp_step + 1; + pow10_exponent = first_dec_exp + index * dec_exp_step; + // Using *(x + index) instead of x[index] avoids an issue with some compilers + // using the EDG frontend (e.g. nvhpc/22.3 in C++17 mode). + return {*(data::pow10_significands + index), + *(data::pow10_exponents + index)}; +} + +#ifndef _MSC_VER +# define FMT_SNPRINTF snprintf +#else +FMT_API auto fmt_snprintf(char* buf, size_t size, const char* fmt, ...) -> int; +# define FMT_SNPRINTF fmt_snprintf +#endif // _MSC_VER + +// Formats a floating-point number with snprintf using the hexfloat format. +template +auto snprintf_float(T value, int precision, float_specs specs, + buffer& buf) -> int { + // Buffer capacity must be non-zero, otherwise MSVC's vsnprintf_s will fail. + FMT_ASSERT(buf.capacity() > buf.size(), "empty buffer"); + FMT_ASSERT(specs.format == float_format::hex, ""); + static_assert(!std::is_same::value, ""); + + // Build the format string. + char format[7]; // The longest format is "%#.*Le". 
+ char* format_ptr = format; + *format_ptr++ = '%'; + if (specs.showpoint) *format_ptr++ = '#'; + if (precision >= 0) { + *format_ptr++ = '.'; + *format_ptr++ = '*'; + } + if (std::is_same()) *format_ptr++ = 'L'; + *format_ptr++ = specs.upper ? 'A' : 'a'; + *format_ptr = '\0'; + + // Format using snprintf. + auto offset = buf.size(); + for (;;) { + auto begin = buf.data() + offset; + auto capacity = buf.capacity() - offset; + abort_fuzzing_if(precision > 100000); + // Suppress the warning about a nonliteral format string. + // Cannot use auto because of a bug in MinGW (#1532). + int (*snprintf_ptr)(char*, size_t, const char*, ...) = FMT_SNPRINTF; + int result = precision >= 0 + ? snprintf_ptr(begin, capacity, format, precision, value) + : snprintf_ptr(begin, capacity, format, value); + if (result < 0) { + // The buffer will grow exponentially. + buf.try_reserve(buf.capacity() + 1); + continue; + } + auto size = to_unsigned(result); + // Size equal to capacity means that the last character was truncated. + if (size < capacity) { + buf.try_resize(size + offset); + return 0; + } + buf.try_reserve(size + offset + 1); // Add 1 for the terminating '\0'. + } +} + +template +using convert_float_result = + conditional_t::value || + std::numeric_limits::digits == + std::numeric_limits::digits, + double, T>; + +template +constexpr auto convert_float(T value) -> convert_float_result { + return static_cast>(value); +} + +template +FMT_NOINLINE FMT_CONSTEXPR auto fill(OutputIt it, size_t n, + const fill_t& fill) -> OutputIt { + auto fill_size = fill.size(); + if (fill_size == 1) return detail::fill_n(it, n, fill[0]); + auto data = fill.data(); + for (size_t i = 0; i < n; ++i) + it = copy_str(data, data + fill_size, it); + return it; +} + +// Writes the output of f, padded according to format specifications in specs. +// size: output size in code units. +// width: output display width in (terminal) column positions. 
+template +FMT_CONSTEXPR auto write_padded(OutputIt out, + const basic_format_specs& specs, + size_t size, size_t width, F&& f) -> OutputIt { + static_assert(align == align::left || align == align::right, ""); + unsigned spec_width = to_unsigned(specs.width); + size_t padding = spec_width > width ? spec_width - width : 0; + // Shifts are encoded as string literals because static constexpr is not + // supported in constexpr functions. + auto* shifts = align == align::left ? "\x1f\x1f\x00\x01" : "\x00\x1f\x00\x01"; + size_t left_padding = padding >> shifts[specs.align]; + size_t right_padding = padding - left_padding; + auto it = reserve(out, size + padding * specs.fill.size()); + if (left_padding != 0) it = fill(it, left_padding, specs.fill); + it = f(it); + if (right_padding != 0) it = fill(it, right_padding, specs.fill); + return base_iterator(out, it); +} + +template +constexpr auto write_padded(OutputIt out, const basic_format_specs& specs, + size_t size, F&& f) -> OutputIt { + return write_padded(out, specs, size, size, f); +} + +template +FMT_CONSTEXPR auto write_bytes(OutputIt out, string_view bytes, + const basic_format_specs& specs) + -> OutputIt { + return write_padded( + out, specs, bytes.size(), [bytes](reserve_iterator it) { + const char* data = bytes.data(); + return copy_str(data, data + bytes.size(), it); + }); +} + +template +auto write_ptr(OutputIt out, UIntPtr value, + const basic_format_specs* specs) -> OutputIt { + int num_digits = count_digits<4>(value); + auto size = to_unsigned(num_digits) + size_t(2); + auto write = [=](reserve_iterator it) { + *it++ = static_cast('0'); + *it++ = static_cast('x'); + return format_uint<4, Char>(it, value, num_digits); + }; + return specs ? write_padded(out, *specs, size, write) + : base_iterator(out, write(reserve(out, size))); +} + +// Returns true iff the code point cp is printable. 
+FMT_API auto is_printable(uint32_t cp) -> bool; + +inline auto needs_escape(uint32_t cp) -> bool { + return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\' || + !is_printable(cp); +} + +template struct find_escape_result { + const Char* begin; + const Char* end; + uint32_t cp; +}; + +template +using make_unsigned_char = + typename conditional_t::value, + std::make_unsigned, + type_identity>::type; + +template +auto find_escape(const Char* begin, const Char* end) + -> find_escape_result { + for (; begin != end; ++begin) { + uint32_t cp = static_cast>(*begin); + if (const_check(sizeof(Char) == 1) && cp >= 0x80) continue; + if (needs_escape(cp)) return {begin, begin + 1, cp}; + } + return {begin, nullptr, 0}; +} + +inline auto find_escape(const char* begin, const char* end) + -> find_escape_result { + if (!is_utf8()) return find_escape(begin, end); + auto result = find_escape_result{end, nullptr, 0}; + for_each_codepoint(string_view(begin, to_unsigned(end - begin)), + [&](uint32_t cp, string_view sv) { + if (needs_escape(cp)) { + result = {sv.begin(), sv.end(), cp}; + return false; + } + return true; + }); + return result; +} + +#define FMT_STRING_IMPL(s, base, explicit) \ + [] { \ + /* Use the hidden visibility as a workaround for a GCC bug (#1973). */ \ + /* Use a macro-like name to avoid shadowing warnings. */ \ + struct FMT_GCC_VISIBILITY_HIDDEN FMT_COMPILE_STRING : base { \ + using char_type FMT_MAYBE_UNUSED = fmt::remove_cvref_t; \ + FMT_MAYBE_UNUSED FMT_CONSTEXPR explicit \ + operator fmt::basic_string_view() const { \ + return fmt::detail_exported::compile_string_to_view(s); \ + } \ + }; \ + return FMT_COMPILE_STRING(); \ + }() + +/** + \rst + Constructs a compile-time format string from a string literal *s*. + + **Example**:: + + // A compile-time error because 'd' is an invalid specifier for strings. 
+ std::string s = fmt::format(FMT_STRING("{:d}"), "foo"); + \endrst + */ +#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::detail::compile_string, ) + +template +auto write_codepoint(OutputIt out, char prefix, uint32_t cp) -> OutputIt { + *out++ = static_cast('\\'); + *out++ = static_cast(prefix); + Char buf[width]; + fill_n(buf, width, static_cast('0')); + format_uint<4>(buf, cp, width); + return copy_str(buf, buf + width, out); +} + +template +auto write_escaped_cp(OutputIt out, const find_escape_result& escape) + -> OutputIt { + auto c = static_cast(escape.cp); + switch (escape.cp) { + case '\n': + *out++ = static_cast('\\'); + c = static_cast('n'); + break; + case '\r': + *out++ = static_cast('\\'); + c = static_cast('r'); + break; + case '\t': + *out++ = static_cast('\\'); + c = static_cast('t'); + break; + case '"': + FMT_FALLTHROUGH; + case '\'': + FMT_FALLTHROUGH; + case '\\': + *out++ = static_cast('\\'); + break; + default: + if (is_utf8()) { + if (escape.cp < 0x100) { + return write_codepoint<2, Char>(out, 'x', escape.cp); + } + if (escape.cp < 0x10000) { + return write_codepoint<4, Char>(out, 'u', escape.cp); + } + if (escape.cp < 0x110000) { + return write_codepoint<8, Char>(out, 'U', escape.cp); + } + } + for (Char escape_char : basic_string_view( + escape.begin, to_unsigned(escape.end - escape.begin))) { + out = write_codepoint<2, Char>(out, 'x', + static_cast(escape_char) & 0xFF); + } + return out; + } + *out++ = c; + return out; +} + +template +auto write_escaped_string(OutputIt out, basic_string_view str) + -> OutputIt { + *out++ = static_cast('"'); + auto begin = str.begin(), end = str.end(); + do { + auto escape = find_escape(begin, end); + out = copy_str(begin, escape.begin, out); + begin = escape.end; + if (!begin) break; + out = write_escaped_cp(out, escape); + } while (begin != end); + *out++ = static_cast('"'); + return out; +} + +template +auto write_escaped_char(OutputIt out, Char v) -> OutputIt { + *out++ = static_cast('\''); + if 
((needs_escape(static_cast(v)) && v != static_cast('"')) || + v == static_cast('\'')) { + out = write_escaped_cp( + out, find_escape_result{&v, &v + 1, static_cast(v)}); + } else { + *out++ = v; + } + *out++ = static_cast('\''); + return out; +} + +template +FMT_CONSTEXPR auto write_char(OutputIt out, Char value, + const basic_format_specs& specs) + -> OutputIt { + bool is_debug = specs.type == presentation_type::debug; + return write_padded(out, specs, 1, [=](reserve_iterator it) { + if (is_debug) return write_escaped_char(it, value); + *it++ = value; + return it; + }); +} +template +FMT_CONSTEXPR auto write(OutputIt out, Char value, + const basic_format_specs& specs, + locale_ref loc = {}) -> OutputIt { + return check_char_specs(specs) + ? write_char(out, value, specs) + : write(out, static_cast(value), specs, loc); +} + +// Data for write_int that doesn't depend on output iterator type. It is used to +// avoid template code bloat. +template struct write_int_data { + size_t size; + size_t padding; + + FMT_CONSTEXPR write_int_data(int num_digits, unsigned prefix, + const basic_format_specs& specs) + : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) { + if (specs.align == align::numeric) { + auto width = to_unsigned(specs.width); + if (width > size) { + padding = width - size; + size = width; + } + } else if (specs.precision > num_digits) { + size = (prefix >> 24) + to_unsigned(specs.precision); + padding = to_unsigned(specs.precision - num_digits); + } + } +}; + +// Writes an integer in the format +// +// where are written by write_digits(it). +// prefix contains chars in three lower bytes and the size in the fourth byte. +template +FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits, + unsigned prefix, + const basic_format_specs& specs, + W write_digits) -> OutputIt { + // Slightly faster check for specs.width == 0 && specs.precision == -1. 
+ if ((specs.width | (specs.precision + 1)) == 0) { + auto it = reserve(out, to_unsigned(num_digits) + (prefix >> 24)); + if (prefix != 0) { + for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8) + *it++ = static_cast(p & 0xff); + } + return base_iterator(out, write_digits(it)); + } + auto data = write_int_data(num_digits, prefix, specs); + return write_padded( + out, specs, data.size, [=](reserve_iterator it) { + for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8) + *it++ = static_cast(p & 0xff); + it = detail::fill_n(it, data.padding, static_cast('0')); + return write_digits(it); + }); +} + +template class digit_grouping { + private: + std::string grouping_; + std::basic_string thousands_sep_; + + struct next_state { + std::string::const_iterator group; + int pos; + }; + next_state initial_state() const { return {grouping_.begin(), 0}; } + + // Returns the next digit group separator position. + int next(next_state& state) const { + if (thousands_sep_.empty()) return max_value(); + if (state.group == grouping_.end()) return state.pos += grouping_.back(); + if (*state.group <= 0 || *state.group == max_value()) + return max_value(); + state.pos += *state.group++; + return state.pos; + } + + public: + explicit digit_grouping(locale_ref loc, bool localized = true) { + if (!localized) return; + auto sep = thousands_sep(loc); + grouping_ = sep.grouping; + if (sep.thousands_sep) thousands_sep_.assign(1, sep.thousands_sep); + } + digit_grouping(std::string grouping, std::basic_string sep) + : grouping_(std::move(grouping)), thousands_sep_(std::move(sep)) {} + + bool has_separator() const { return !thousands_sep_.empty(); } + + int count_separators(int num_digits) const { + int count = 0; + auto state = initial_state(); + while (num_digits > next(state)) ++count; + return count; + } + + // Applies grouping to digits and write the output to out. 
+ template + Out apply(Out out, basic_string_view digits) const { + auto num_digits = static_cast(digits.size()); + auto separators = basic_memory_buffer(); + separators.push_back(0); + auto state = initial_state(); + while (int i = next(state)) { + if (i >= num_digits) break; + separators.push_back(i); + } + for (int i = 0, sep_index = static_cast(separators.size() - 1); + i < num_digits; ++i) { + if (num_digits - i == separators[sep_index]) { + out = + copy_str(thousands_sep_.data(), + thousands_sep_.data() + thousands_sep_.size(), out); + --sep_index; + } + *out++ = static_cast(digits[to_unsigned(i)]); + } + return out; + } +}; + +// Writes a decimal integer with digit grouping. +template +auto write_int(OutputIt out, UInt value, unsigned prefix, + const basic_format_specs& specs, + const digit_grouping& grouping) -> OutputIt { + static_assert(std::is_same, UInt>::value, ""); + int num_digits = count_digits(value); + char digits[40]; + format_decimal(digits, value, num_digits); + unsigned size = to_unsigned((prefix != 0 ? 1 : 0) + num_digits + + grouping.count_separators(num_digits)); + return write_padded( + out, specs, size, size, [&](reserve_iterator it) { + if (prefix != 0) { + char sign = static_cast(prefix); + *it++ = static_cast(sign); + } + return grouping.apply(it, string_view(digits, to_unsigned(num_digits))); + }); +} + +// Writes a localized value. +FMT_API auto write_loc(appender out, loc_value value, const format_specs& specs, + locale_ref loc) -> bool; +template +inline auto write_loc(OutputIt, loc_value, const basic_format_specs&, + locale_ref) -> bool { + return false; +} + +FMT_CONSTEXPR inline void prefix_append(unsigned& prefix, unsigned value) { + prefix |= prefix != 0 ? value << 8 : value; + prefix += (1u + (value > 0xff ? 
1 : 0)) << 24; +} + +template struct write_int_arg { + UInt abs_value; + unsigned prefix; +}; + +template +FMT_CONSTEXPR auto make_write_int_arg(T value, sign_t sign) + -> write_int_arg> { + auto prefix = 0u; + auto abs_value = static_cast>(value); + if (is_negative(value)) { + prefix = 0x01000000 | '-'; + abs_value = 0 - abs_value; + } else { + constexpr const unsigned prefixes[4] = {0, 0, 0x1000000u | '+', + 0x1000000u | ' '}; + prefix = prefixes[sign]; + } + return {abs_value, prefix}; +} + +template struct loc_writer { + buffer_appender out; + const basic_format_specs& specs; + std::basic_string sep; + std::string grouping; + std::basic_string decimal_point; + + template ::value)> + auto operator()(T value) -> bool { + auto arg = make_write_int_arg(value, specs.sign); + write_int(out, static_cast>(arg.abs_value), arg.prefix, + specs, digit_grouping(grouping, sep)); + return true; + } + + template ::value)> + auto operator()(T) -> bool { + return false; + } + + auto operator()(...) -> bool { return false; } +}; + +template +FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, write_int_arg arg, + const basic_format_specs& specs, + locale_ref) -> OutputIt { + static_assert(std::is_same>::value, ""); + auto abs_value = arg.abs_value; + auto prefix = arg.prefix; + switch (specs.type) { + case presentation_type::none: + case presentation_type::dec: { + auto num_digits = count_digits(abs_value); + return write_int( + out, num_digits, prefix, specs, [=](reserve_iterator it) { + return format_decimal(it, abs_value, num_digits).end; + }); + } + case presentation_type::hex_lower: + case presentation_type::hex_upper: { + bool upper = specs.type == presentation_type::hex_upper; + if (specs.alt) + prefix_append(prefix, unsigned(upper ? 
'X' : 'x') << 8 | '0'); + int num_digits = count_digits<4>(abs_value); + return write_int( + out, num_digits, prefix, specs, [=](reserve_iterator it) { + return format_uint<4, Char>(it, abs_value, num_digits, upper); + }); + } + case presentation_type::bin_lower: + case presentation_type::bin_upper: { + bool upper = specs.type == presentation_type::bin_upper; + if (specs.alt) + prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0'); + int num_digits = count_digits<1>(abs_value); + return write_int(out, num_digits, prefix, specs, + [=](reserve_iterator it) { + return format_uint<1, Char>(it, abs_value, num_digits); + }); + } + case presentation_type::oct: { + int num_digits = count_digits<3>(abs_value); + // Octal prefix '0' is counted as a digit, so only add it if precision + // is not greater than the number of digits. + if (specs.alt && specs.precision <= num_digits && abs_value != 0) + prefix_append(prefix, '0'); + return write_int(out, num_digits, prefix, specs, + [=](reserve_iterator it) { + return format_uint<3, Char>(it, abs_value, num_digits); + }); + } + case presentation_type::chr: + return write_char(out, static_cast(abs_value), specs); + default: + throw_format_error("invalid type specifier"); + } + return out; +} +template +FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline( + OutputIt out, write_int_arg arg, const basic_format_specs& specs, + locale_ref loc) -> OutputIt { + return write_int(out, arg, specs, loc); +} +template ::value && + !std::is_same::value && + std::is_same>::value)> +FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value, + const basic_format_specs& specs, + locale_ref loc) -> OutputIt { + if (specs.localized && write_loc(out, value, specs, loc)) return out; + return write_int_noinline(out, make_write_int_arg(value, specs.sign), specs, + loc); +} +// An inlined version of write used in format string compilation. 
+template ::value && + !std::is_same::value && + !std::is_same>::value)> +FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value, + const basic_format_specs& specs, + locale_ref loc) -> OutputIt { + if (specs.localized && write_loc(out, value, specs, loc)) return out; + return write_int(out, make_write_int_arg(value, specs.sign), specs, loc); +} + +// An output iterator that counts the number of objects written to it and +// discards them. +class counting_iterator { + private: + size_t count_; + + public: + using iterator_category = std::output_iterator_tag; + using difference_type = std::ptrdiff_t; + using pointer = void; + using reference = void; + FMT_UNCHECKED_ITERATOR(counting_iterator); + + struct value_type { + template FMT_CONSTEXPR void operator=(const T&) {} + }; + + FMT_CONSTEXPR counting_iterator() : count_(0) {} + + FMT_CONSTEXPR size_t count() const { return count_; } + + FMT_CONSTEXPR counting_iterator& operator++() { + ++count_; + return *this; + } + FMT_CONSTEXPR counting_iterator operator++(int) { + auto it = *this; + ++*this; + return it; + } + + FMT_CONSTEXPR friend counting_iterator operator+(counting_iterator it, + difference_type n) { + it.count_ += static_cast(n); + return it; + } + + FMT_CONSTEXPR value_type operator*() const { return {}; } +}; + +template +FMT_CONSTEXPR auto write(OutputIt out, basic_string_view s, + const basic_format_specs& specs) -> OutputIt { + auto data = s.data(); + auto size = s.size(); + if (specs.precision >= 0 && to_unsigned(specs.precision) < size) + size = code_point_index(s, to_unsigned(specs.precision)); + bool is_debug = specs.type == presentation_type::debug; + size_t width = 0; + if (specs.width != 0) { + if (is_debug) + width = write_escaped_string(counting_iterator{}, s).count(); + else + width = compute_width(basic_string_view(data, size)); + } + return write_padded(out, specs, size, width, + [=](reserve_iterator it) { + if (is_debug) return write_escaped_string(it, s); + return copy_str(data, data + 
size, it); + }); +} +template +FMT_CONSTEXPR auto write(OutputIt out, + basic_string_view> s, + const basic_format_specs& specs, locale_ref) + -> OutputIt { + check_string_type_spec(specs.type); + return write(out, s, specs); +} +template +FMT_CONSTEXPR auto write(OutputIt out, const Char* s, + const basic_format_specs& specs, locale_ref) + -> OutputIt { + return check_cstring_type_spec(specs.type) + ? write(out, basic_string_view(s), specs, {}) + : write_ptr(out, bit_cast(s), &specs); +} + +template ::value && + !std::is_same::value && + !std::is_same::value)> +FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt { + auto abs_value = static_cast>(value); + bool negative = is_negative(value); + // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer. + if (negative) abs_value = ~abs_value + 1; + int num_digits = count_digits(abs_value); + auto size = (negative ? 1 : 0) + static_cast(num_digits); + auto it = reserve(out, size); + if (auto ptr = to_pointer(it, size)) { + if (negative) *ptr++ = static_cast('-'); + format_decimal(ptr, abs_value, num_digits); + return out; + } + if (negative) *it++ = static_cast('-'); + it = format_decimal(it, abs_value, num_digits).end; + return base_iterator(out, it); +} + +template +FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isnan, + basic_format_specs specs, + const float_specs& fspecs) -> OutputIt { + auto str = + isnan ? (fspecs.upper ? "NAN" : "nan") : (fspecs.upper ? "INF" : "inf"); + constexpr size_t str_size = 3; + auto sign = fspecs.sign; + auto size = str_size + (sign ? 1 : 0); + // Replace '0'-padding with space for non-finite values. 
+ const bool is_zero_fill = + specs.fill.size() == 1 && *specs.fill.data() == static_cast('0'); + if (is_zero_fill) specs.fill[0] = static_cast(' '); + return write_padded(out, specs, size, [=](reserve_iterator it) { + if (sign) *it++ = detail::sign(sign); + return copy_str(str, str + str_size, it); + }); +} + +// A decimal floating-point number significand * pow(10, exp). +struct big_decimal_fp { + const char* significand; + int significand_size; + int exponent; +}; + +constexpr auto get_significand_size(const big_decimal_fp& f) -> int { + return f.significand_size; +} +template +inline auto get_significand_size(const dragonbox::decimal_fp& f) -> int { + return count_digits(f.significand); +} + +template +constexpr auto write_significand(OutputIt out, const char* significand, + int significand_size) -> OutputIt { + return copy_str(significand, significand + significand_size, out); +} +template +inline auto write_significand(OutputIt out, UInt significand, + int significand_size) -> OutputIt { + return format_decimal(out, significand, significand_size).end; +} +template +FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand, + int significand_size, int exponent, + const Grouping& grouping) -> OutputIt { + if (!grouping.has_separator()) { + out = write_significand(out, significand, significand_size); + return detail::fill_n(out, exponent, static_cast('0')); + } + auto buffer = memory_buffer(); + write_significand(appender(buffer), significand, significand_size); + detail::fill_n(appender(buffer), exponent, '0'); + return grouping.apply(out, string_view(buffer.data(), buffer.size())); +} + +template ::value)> +inline auto write_significand(Char* out, UInt significand, int significand_size, + int integral_size, Char decimal_point) -> Char* { + if (!decimal_point) + return format_decimal(out, significand, significand_size).end; + out += significand_size + 1; + Char* end = out; + int floating_size = significand_size - integral_size; + for (int i = 
floating_size / 2; i > 0; --i) { + out -= 2; + copy2(out, digits2(static_cast(significand % 100))); + significand /= 100; + } + if (floating_size % 2 != 0) { + *--out = static_cast('0' + significand % 10); + significand /= 10; + } + *--out = decimal_point; + format_decimal(out - integral_size, significand, integral_size); + return end; +} + +template >::value)> +inline auto write_significand(OutputIt out, UInt significand, + int significand_size, int integral_size, + Char decimal_point) -> OutputIt { + // Buffer is large enough to hold digits (digits10 + 1) and a decimal point. + Char buffer[digits10() + 2]; + auto end = write_significand(buffer, significand, significand_size, + integral_size, decimal_point); + return detail::copy_str_noinline(buffer, end, out); +} + +template +FMT_CONSTEXPR auto write_significand(OutputIt out, const char* significand, + int significand_size, int integral_size, + Char decimal_point) -> OutputIt { + out = detail::copy_str_noinline(significand, + significand + integral_size, out); + if (!decimal_point) return out; + *out++ = decimal_point; + return detail::copy_str_noinline(significand + integral_size, + significand + significand_size, out); +} + +template +FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand, + int significand_size, int integral_size, + Char decimal_point, + const Grouping& grouping) -> OutputIt { + if (!grouping.has_separator()) { + return write_significand(out, significand, significand_size, integral_size, + decimal_point); + } + auto buffer = basic_memory_buffer(); + write_significand(buffer_appender(buffer), significand, + significand_size, integral_size, decimal_point); + grouping.apply( + out, basic_string_view(buffer.data(), to_unsigned(integral_size))); + return detail::copy_str_noinline(buffer.data() + integral_size, + buffer.end(), out); +} + +template > +FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP& f, + const basic_format_specs& specs, + float_specs fspecs, locale_ref 
loc) + -> OutputIt { + auto significand = f.significand; + int significand_size = get_significand_size(f); + const Char zero = static_cast('0'); + auto sign = fspecs.sign; + size_t size = to_unsigned(significand_size) + (sign ? 1 : 0); + using iterator = reserve_iterator; + + Char decimal_point = + fspecs.locale ? detail::decimal_point(loc) : static_cast('.'); + + int output_exp = f.exponent + significand_size - 1; + auto use_exp_format = [=]() { + if (fspecs.format == float_format::exp) return true; + if (fspecs.format != float_format::general) return false; + // Use the fixed notation if the exponent is in [exp_lower, exp_upper), + // e.g. 0.0001 instead of 1e-04. Otherwise use the exponent notation. + const int exp_lower = -4, exp_upper = 16; + return output_exp < exp_lower || + output_exp >= (fspecs.precision > 0 ? fspecs.precision : exp_upper); + }; + if (use_exp_format()) { + int num_zeros = 0; + if (fspecs.showpoint) { + num_zeros = fspecs.precision - significand_size; + if (num_zeros < 0) num_zeros = 0; + size += to_unsigned(num_zeros); + } else if (significand_size == 1) { + decimal_point = Char(); + } + auto abs_output_exp = output_exp >= 0 ? output_exp : -output_exp; + int exp_digits = 2; + if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3; + + size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits); + char exp_char = fspecs.upper ? 'E' : 'e'; + auto write = [=](iterator it) { + if (sign) *it++ = detail::sign(sign); + // Insert a decimal point after the first digit and add an exponent. + it = write_significand(it, significand, significand_size, 1, + decimal_point); + if (num_zeros > 0) it = detail::fill_n(it, num_zeros, zero); + *it++ = static_cast(exp_char); + return write_exponent(output_exp, it); + }; + return specs.width > 0 ? 
write_padded(out, specs, size, write) + : base_iterator(out, write(reserve(out, size))); + } + + int exp = f.exponent + significand_size; + if (f.exponent >= 0) { + // 1234e5 -> 123400000[.0+] + size += to_unsigned(f.exponent); + int num_zeros = fspecs.precision - exp; + abort_fuzzing_if(num_zeros > 5000); + if (fspecs.showpoint) { + ++size; + if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 1; + if (num_zeros > 0) size += to_unsigned(num_zeros); + } + auto grouping = Grouping(loc, fspecs.locale); + size += to_unsigned(grouping.count_separators(exp)); + return write_padded(out, specs, size, [&](iterator it) { + if (sign) *it++ = detail::sign(sign); + it = write_significand(it, significand, significand_size, + f.exponent, grouping); + if (!fspecs.showpoint) return it; + *it++ = decimal_point; + return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it; + }); + } else if (exp > 0) { + // 1234e-2 -> 12.34[0+] + int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0; + size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0); + auto grouping = Grouping(loc, fspecs.locale); + size += to_unsigned(grouping.count_separators(significand_size)); + return write_padded(out, specs, size, [&](iterator it) { + if (sign) *it++ = detail::sign(sign); + it = write_significand(it, significand, significand_size, exp, + decimal_point, grouping); + return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it; + }); + } + // 1234e-6 -> 0.001234 + int num_zeros = -exp; + if (significand_size == 0 && fspecs.precision >= 0 && + fspecs.precision < num_zeros) { + num_zeros = fspecs.precision; + } + bool pointy = num_zeros != 0 || significand_size != 0 || fspecs.showpoint; + size += 1 + (pointy ? 
1 : 0) + to_unsigned(num_zeros); + return write_padded(out, specs, size, [&](iterator it) { + if (sign) *it++ = detail::sign(sign); + *it++ = zero; + if (!pointy) return it; + *it++ = decimal_point; + it = detail::fill_n(it, num_zeros, zero); + return write_significand(it, significand, significand_size); + }); +} + +template class fallback_digit_grouping { + public: + constexpr fallback_digit_grouping(locale_ref, bool) {} + + constexpr bool has_separator() const { return false; } + + constexpr int count_separators(int) const { return 0; } + + template + constexpr Out apply(Out out, basic_string_view) const { + return out; + } +}; + +template +FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP& f, + const basic_format_specs& specs, + float_specs fspecs, locale_ref loc) + -> OutputIt { + if (is_constant_evaluated()) { + return do_write_float>(out, f, specs, fspecs, + loc); + } else { + return do_write_float(out, f, specs, fspecs, loc); + } +} + +template constexpr bool isnan(T value) { + return !(value >= value); // std::isnan doesn't support __float128. +} + +template +struct has_isfinite : std::false_type {}; + +template +struct has_isfinite> + : std::true_type {}; + +template ::value&& + has_isfinite::value)> +FMT_CONSTEXPR20 bool isfinite(T value) { + constexpr T inf = T(std::numeric_limits::infinity()); + if (is_constant_evaluated()) + return !detail::isnan(value) && value < inf && value > -inf; + return std::isfinite(value); +} +template ::value)> +FMT_CONSTEXPR bool isfinite(T value) { + T inf = T(std::numeric_limits::infinity()); + // std::isfinite doesn't support __float128. 
+ return !detail::isnan(value) && value < inf && value > -inf; +} + +template ::value)> +FMT_INLINE FMT_CONSTEXPR bool signbit(T value) { + if (is_constant_evaluated()) { +#ifdef __cpp_if_constexpr + if constexpr (std::numeric_limits::is_iec559) { + auto bits = detail::bit_cast(static_cast(value)); + return (bits >> (num_bits() - 1)) != 0; + } +#endif + } + return std::signbit(static_cast(value)); +} + +enum class round_direction { unknown, up, down }; + +// Given the divisor (normally a power of 10), the remainder = v % divisor for +// some number v and the error, returns whether v should be rounded up, down, or +// whether the rounding direction can't be determined due to error. +// error should be less than divisor / 2. +FMT_CONSTEXPR inline round_direction get_round_direction(uint64_t divisor, + uint64_t remainder, + uint64_t error) { + FMT_ASSERT(remainder < divisor, ""); // divisor - remainder won't overflow. + FMT_ASSERT(error < divisor, ""); // divisor - error won't overflow. + FMT_ASSERT(error < divisor - error, ""); // error * 2 won't overflow. + // Round down if (remainder + error) * 2 <= divisor. + if (remainder <= divisor - remainder && error * 2 <= divisor - remainder * 2) + return round_direction::down; + // Round up if (remainder - error) * 2 >= divisor. + if (remainder >= error && + remainder - error >= divisor - (remainder - error)) { + return round_direction::up; + } + return round_direction::unknown; +} + +namespace digits { +enum result { + more, // Generate more digits. + done, // Done generating digits. + error // Digit generation cancelled due to an error. 
+}; +} + +struct gen_digits_handler { + char* buf; + int size; + int precision; + int exp10; + bool fixed; + + FMT_CONSTEXPR digits::result on_digit(char digit, uint64_t divisor, + uint64_t remainder, uint64_t error, + bool integral) { + FMT_ASSERT(remainder < divisor, ""); + buf[size++] = digit; + if (!integral && error >= remainder) return digits::error; + if (size < precision) return digits::more; + if (!integral) { + // Check if error * 2 < divisor with overflow prevention. + // The check is not needed for the integral part because error = 1 + // and divisor > (1 << 32) there. + if (error >= divisor || error >= divisor - error) return digits::error; + } else { + FMT_ASSERT(error == 1 && divisor > 2, ""); + } + auto dir = get_round_direction(divisor, remainder, error); + if (dir != round_direction::up) + return dir == round_direction::down ? digits::done : digits::error; + ++buf[size - 1]; + for (int i = size - 1; i > 0 && buf[i] > '9'; --i) { + buf[i] = '0'; + ++buf[i - 1]; + } + if (buf[0] > '9') { + buf[0] = '1'; + if (fixed) + buf[size++] = '0'; + else + ++exp10; + } + return digits::done; + } +}; + +inline FMT_CONSTEXPR20 void adjust_precision(int& precision, int exp10) { + // Adjust fixed precision by exponent because it is relative to decimal + // point. + if (exp10 > 0 && precision > max_value() - exp10) + FMT_THROW(format_error("number is too big")); + precision += exp10; +} + +// Generates output using the Grisu digit-gen algorithm. +// error: the size of the region (lower, upper) outside of which numbers +// definitely do not round to value (Delta in Grisu3). +FMT_INLINE FMT_CONSTEXPR20 auto grisu_gen_digits(fp value, uint64_t error, + int& exp, + gen_digits_handler& handler) + -> digits::result { + const fp one(1ULL << -value.e, value.e); + // The integral part of scaled value (p1 in Grisu) = value / one. 
It cannot be + // zero because it contains a product of two 64-bit numbers with MSB set (due + // to normalization) - 1, shifted right by at most 60 bits. + auto integral = static_cast(value.f >> -one.e); + FMT_ASSERT(integral != 0, ""); + FMT_ASSERT(integral == value.f >> -one.e, ""); + // The fractional part of scaled value (p2 in Grisu) c = value % one. + uint64_t fractional = value.f & (one.f - 1); + exp = count_digits(integral); // kappa in Grisu. + // Non-fixed formats require at least one digit and no precision adjustment. + if (handler.fixed) { + adjust_precision(handler.precision, exp + handler.exp10); + // Check if precision is satisfied just by leading zeros, e.g. + // format("{:.2f}", 0.001) gives "0.00" without generating any digits. + if (handler.precision <= 0) { + if (handler.precision < 0) return digits::done; + // Divide by 10 to prevent overflow. + uint64_t divisor = data::power_of_10_64[exp - 1] << -one.e; + auto dir = get_round_direction(divisor, value.f / 10, error * 10); + if (dir == round_direction::unknown) return digits::error; + handler.buf[handler.size++] = dir == round_direction::up ? '1' : '0'; + return digits::done; + } + } + // Generate digits for the integral part. This can produce up to 10 digits. + do { + uint32_t digit = 0; + auto divmod_integral = [&](uint32_t divisor) { + digit = integral / divisor; + integral %= divisor; + }; + // This optimization by Milo Yip reduces the number of integer divisions by + // one per iteration. 
+ switch (exp) { + case 10: + divmod_integral(1000000000); + break; + case 9: + divmod_integral(100000000); + break; + case 8: + divmod_integral(10000000); + break; + case 7: + divmod_integral(1000000); + break; + case 6: + divmod_integral(100000); + break; + case 5: + divmod_integral(10000); + break; + case 4: + divmod_integral(1000); + break; + case 3: + divmod_integral(100); + break; + case 2: + divmod_integral(10); + break; + case 1: + digit = integral; + integral = 0; + break; + default: + FMT_ASSERT(false, "invalid number of digits"); + } + --exp; + auto remainder = (static_cast(integral) << -one.e) + fractional; + auto result = handler.on_digit(static_cast('0' + digit), + data::power_of_10_64[exp] << -one.e, + remainder, error, true); + if (result != digits::more) return result; + } while (exp > 0); + // Generate digits for the fractional part. + for (;;) { + fractional *= 10; + error *= 10; + char digit = static_cast('0' + (fractional >> -one.e)); + fractional &= one.f - 1; + --exp; + auto result = handler.on_digit(digit, one.f, fractional, error, false); + if (result != digits::more) return result; + } +} + +class bigint { + private: + // A bigint is stored as an array of bigits (big digits), with bigit at index + // 0 being the least significant one. 
+ using bigit = uint32_t; + using double_bigit = uint64_t; + enum { bigits_capacity = 32 }; + basic_memory_buffer bigits_; + int exp_; + + FMT_CONSTEXPR20 bigit operator[](int index) const { + return bigits_[to_unsigned(index)]; + } + FMT_CONSTEXPR20 bigit& operator[](int index) { + return bigits_[to_unsigned(index)]; + } + + static constexpr const int bigit_bits = num_bits(); + + friend struct formatter; + + FMT_CONSTEXPR20 void subtract_bigits(int index, bigit other, bigit& borrow) { + auto result = static_cast((*this)[index]) - other - borrow; + (*this)[index] = static_cast(result); + borrow = static_cast(result >> (bigit_bits * 2 - 1)); + } + + FMT_CONSTEXPR20 void remove_leading_zeros() { + int num_bigits = static_cast(bigits_.size()) - 1; + while (num_bigits > 0 && (*this)[num_bigits] == 0) --num_bigits; + bigits_.resize(to_unsigned(num_bigits + 1)); + } + + // Computes *this -= other assuming aligned bigints and *this >= other. + FMT_CONSTEXPR20 void subtract_aligned(const bigint& other) { + FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints"); + FMT_ASSERT(compare(*this, other) >= 0, ""); + bigit borrow = 0; + int i = other.exp_ - exp_; + for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j) + subtract_bigits(i, other.bigits_[j], borrow); + while (borrow > 0) subtract_bigits(i, 0, borrow); + remove_leading_zeros(); + } + + FMT_CONSTEXPR20 void multiply(uint32_t value) { + const double_bigit wide_value = value; + bigit carry = 0; + for (size_t i = 0, n = bigits_.size(); i < n; ++i) { + double_bigit result = bigits_[i] * wide_value + carry; + bigits_[i] = static_cast(result); + carry = static_cast(result >> bigit_bits); + } + if (carry != 0) bigits_.push_back(carry); + } + + template ::value || + std::is_same::value)> + FMT_CONSTEXPR20 void multiply(UInt value) { + using half_uint = + conditional_t::value, uint64_t, uint32_t>; + const int shift = num_bits() - bigit_bits; + const UInt lower = static_cast(value); + const UInt upper = value >> 
num_bits(); + UInt carry = 0; + for (size_t i = 0, n = bigits_.size(); i < n; ++i) { + UInt result = lower * bigits_[i] + static_cast(carry); + carry = (upper * bigits_[i] << shift) + (result >> bigit_bits) + + (carry >> bigit_bits); + bigits_[i] = static_cast(result); + } + while (carry != 0) { + bigits_.push_back(static_cast(carry)); + carry >>= bigit_bits; + } + } + + template ::value || + std::is_same::value)> + FMT_CONSTEXPR20 void assign(UInt n) { + size_t num_bigits = 0; + do { + bigits_[num_bigits++] = static_cast(n); + n >>= bigit_bits; + } while (n != 0); + bigits_.resize(num_bigits); + exp_ = 0; + } + + public: + FMT_CONSTEXPR20 bigint() : exp_(0) {} + explicit bigint(uint64_t n) { assign(n); } + + bigint(const bigint&) = delete; + void operator=(const bigint&) = delete; + + FMT_CONSTEXPR20 void assign(const bigint& other) { + auto size = other.bigits_.size(); + bigits_.resize(size); + auto data = other.bigits_.data(); + std::copy(data, data + size, make_checked(bigits_.data(), size)); + exp_ = other.exp_; + } + + template FMT_CONSTEXPR20 void operator=(Int n) { + FMT_ASSERT(n > 0, ""); + assign(uint64_or_128_t(n)); + } + + FMT_CONSTEXPR20 int num_bigits() const { + return static_cast(bigits_.size()) + exp_; + } + + FMT_NOINLINE FMT_CONSTEXPR20 bigint& operator<<=(int shift) { + FMT_ASSERT(shift >= 0, ""); + exp_ += shift / bigit_bits; + shift %= bigit_bits; + if (shift == 0) return *this; + bigit carry = 0; + for (size_t i = 0, n = bigits_.size(); i < n; ++i) { + bigit c = bigits_[i] >> (bigit_bits - shift); + bigits_[i] = (bigits_[i] << shift) + carry; + carry = c; + } + if (carry != 0) bigits_.push_back(carry); + return *this; + } + + template FMT_CONSTEXPR20 bigint& operator*=(Int value) { + FMT_ASSERT(value > 0, ""); + multiply(uint32_or_64_or_128_t(value)); + return *this; + } + + friend FMT_CONSTEXPR20 int compare(const bigint& lhs, const bigint& rhs) { + int num_lhs_bigits = lhs.num_bigits(), num_rhs_bigits = rhs.num_bigits(); + if 
(num_lhs_bigits != num_rhs_bigits) + return num_lhs_bigits > num_rhs_bigits ? 1 : -1; + int i = static_cast(lhs.bigits_.size()) - 1; + int j = static_cast(rhs.bigits_.size()) - 1; + int end = i - j; + if (end < 0) end = 0; + for (; i >= end; --i, --j) { + bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j]; + if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 1 : -1; + } + if (i != j) return i > j ? 1 : -1; + return 0; + } + + // Returns compare(lhs1 + lhs2, rhs). + friend FMT_CONSTEXPR20 int add_compare(const bigint& lhs1, const bigint& lhs2, + const bigint& rhs) { + auto minimum = [](int a, int b) { return a < b ? a : b; }; + auto maximum = [](int a, int b) { return a > b ? a : b; }; + int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits()); + int num_rhs_bigits = rhs.num_bigits(); + if (max_lhs_bigits + 1 < num_rhs_bigits) return -1; + if (max_lhs_bigits > num_rhs_bigits) return 1; + auto get_bigit = [](const bigint& n, int i) -> bigit { + return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0; + }; + double_bigit borrow = 0; + int min_exp = minimum(minimum(lhs1.exp_, lhs2.exp_), rhs.exp_); + for (int i = num_rhs_bigits - 1; i >= min_exp; --i) { + double_bigit sum = + static_cast(get_bigit(lhs1, i)) + get_bigit(lhs2, i); + bigit rhs_bigit = get_bigit(rhs, i); + if (sum > rhs_bigit + borrow) return 1; + borrow = rhs_bigit + borrow - sum; + if (borrow > 1) return -1; + borrow <<= bigit_bits; + } + return borrow != 0 ? -1 : 0; + } + + // Assigns pow(10, exp) to this bigint. + FMT_CONSTEXPR20 void assign_pow10(int exp) { + FMT_ASSERT(exp >= 0, ""); + if (exp == 0) return *this = 1; + // Find the top bit. + int bitmask = 1; + while (exp >= bitmask) bitmask <<= 1; + bitmask >>= 1; + // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by + // repeated squaring and multiplication. 
+ *this = 5; + bitmask >>= 1; + while (bitmask != 0) { + square(); + if ((exp & bitmask) != 0) *this *= 5; + bitmask >>= 1; + } + *this <<= exp; // Multiply by pow(2, exp) by shifting. + } + + FMT_CONSTEXPR20 void square() { + int num_bigits = static_cast(bigits_.size()); + int num_result_bigits = 2 * num_bigits; + basic_memory_buffer n(std::move(bigits_)); + bigits_.resize(to_unsigned(num_result_bigits)); + auto sum = uint128_t(); + for (int bigit_index = 0; bigit_index < num_bigits; ++bigit_index) { + // Compute bigit at position bigit_index of the result by adding + // cross-product terms n[i] * n[j] such that i + j == bigit_index. + for (int i = 0, j = bigit_index; j >= 0; ++i, --j) { + // Most terms are multiplied twice which can be optimized in the future. + sum += static_cast(n[i]) * n[j]; + } + (*this)[bigit_index] = static_cast(sum); + sum >>= num_bits(); // Compute the carry. + } + // Do the same for the top half. + for (int bigit_index = num_bigits; bigit_index < num_result_bigits; + ++bigit_index) { + for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;) + sum += static_cast(n[i++]) * n[j--]; + (*this)[bigit_index] = static_cast(sum); + sum >>= num_bits(); + } + remove_leading_zeros(); + exp_ *= 2; + } + + // If this bigint has a bigger exponent than other, adds trailing zero to make + // exponents equal. This simplifies some operations such as subtraction. + FMT_CONSTEXPR20 void align(const bigint& other) { + int exp_difference = exp_ - other.exp_; + if (exp_difference <= 0) return; + int num_bigits = static_cast(bigits_.size()); + bigits_.resize(to_unsigned(num_bigits + exp_difference)); + for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j) + bigits_[j] = bigits_[i]; + std::uninitialized_fill_n(bigits_.data(), exp_difference, 0); + exp_ -= exp_difference; + } + + // Divides this bignum by divisor, assigning the remainder to this and + // returning the quotient. 
+ FMT_CONSTEXPR20 int divmod_assign(const bigint& divisor) { + FMT_ASSERT(this != &divisor, ""); + if (compare(*this, divisor) < 0) return 0; + FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, ""); + align(divisor); + int quotient = 0; + do { + subtract_aligned(divisor); + ++quotient; + } while (compare(*this, divisor) >= 0); + return quotient; + } +}; + +// format_dragon flags. +enum dragon { + predecessor_closer = 1, + fixup = 2, // Run fixup to correct exp10 which can be off by one. + fixed = 4, +}; + +// Formats a floating-point number using a variation of the Fixed-Precision +// Positive Floating-Point Printout ((FPP)^2) algorithm by Steele & White: +// https://fmt.dev/papers/p372-steele.pdf. +FMT_CONSTEXPR20 inline void format_dragon(basic_fp value, + unsigned flags, int num_digits, + buffer& buf, int& exp10) { + bigint numerator; // 2 * R in (FPP)^2. + bigint denominator; // 2 * S in (FPP)^2. + // lower and upper are differences between value and corresponding boundaries. + bigint lower; // (M^- in (FPP)^2). + bigint upper_store; // upper's value if different from lower. + bigint* upper = nullptr; // (M^+ in (FPP)^2). + // Shift numerator and denominator by an extra bit or two (if lower boundary + // is closer) to make lower and upper integers. This eliminates multiplication + // by 2 during later computations. + bool is_predecessor_closer = (flags & dragon::predecessor_closer) != 0; + int shift = is_predecessor_closer ? 
2 : 1; + if (value.e >= 0) { + numerator = value.f; + numerator <<= value.e + shift; + lower = 1; + lower <<= value.e; + if (is_predecessor_closer) { + upper_store = 1; + upper_store <<= value.e + 1; + upper = &upper_store; + } + denominator.assign_pow10(exp10); + denominator <<= shift; + } else if (exp10 < 0) { + numerator.assign_pow10(-exp10); + lower.assign(numerator); + if (is_predecessor_closer) { + upper_store.assign(numerator); + upper_store <<= 1; + upper = &upper_store; + } + numerator *= value.f; + numerator <<= shift; + denominator = 1; + denominator <<= shift - value.e; + } else { + numerator = value.f; + numerator <<= shift; + denominator.assign_pow10(exp10); + denominator <<= shift - value.e; + lower = 1; + if (is_predecessor_closer) { + upper_store = 1ULL << 1; + upper = &upper_store; + } + } + int even = static_cast((value.f & 1) == 0); + if (!upper) upper = &lower; + if ((flags & dragon::fixup) != 0) { + if (add_compare(numerator, *upper, denominator) + even <= 0) { + --exp10; + numerator *= 10; + if (num_digits < 0) { + lower *= 10; + if (upper != &lower) *upper *= 10; + } + } + if ((flags & dragon::fixed) != 0) adjust_precision(num_digits, exp10 + 1); + } + // Invariant: value == (numerator / denominator) * pow(10, exp10). + if (num_digits < 0) { + // Generate the shortest representation. + num_digits = 0; + char* data = buf.data(); + for (;;) { + int digit = numerator.divmod_assign(denominator); + bool low = compare(numerator, lower) - even < 0; // numerator <[=] lower. + // numerator + upper >[=] pow10: + bool high = add_compare(numerator, *upper, denominator) + even > 0; + data[num_digits++] = static_cast('0' + digit); + if (low || high) { + if (!low) { + ++data[num_digits - 1]; + } else if (high) { + int result = add_compare(numerator, numerator, denominator); + // Round half to even. 
+ if (result > 0 || (result == 0 && (digit % 2) != 0)) + ++data[num_digits - 1]; + } + buf.try_resize(to_unsigned(num_digits)); + exp10 -= num_digits - 1; + return; + } + numerator *= 10; + lower *= 10; + if (upper != &lower) *upper *= 10; + } + } + // Generate the given number of digits. + exp10 -= num_digits - 1; + if (num_digits == 0) { + denominator *= 10; + auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0'; + buf.push_back(digit); + return; + } + buf.try_resize(to_unsigned(num_digits)); + for (int i = 0; i < num_digits - 1; ++i) { + int digit = numerator.divmod_assign(denominator); + buf[i] = static_cast('0' + digit); + numerator *= 10; + } + int digit = numerator.divmod_assign(denominator); + auto result = add_compare(numerator, numerator, denominator); + if (result > 0 || (result == 0 && (digit % 2) != 0)) { + if (digit == 9) { + const auto overflow = '0' + 10; + buf[num_digits - 1] = overflow; + // Propagate the carry. + for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) { + buf[i] = '0'; + ++buf[i - 1]; + } + if (buf[0] == overflow) { + buf[0] = '1'; + ++exp10; + } + return; + } + ++digit; + } + buf[num_digits - 1] = static_cast('0' + digit); +} + +template +FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs, + buffer& buf) -> int { + // float is passed as double to reduce the number of instantiations. + static_assert(!std::is_same::value, ""); + FMT_ASSERT(value >= 0, "value is negative"); + auto converted_value = convert_float(value); + + const bool fixed = specs.format == float_format::fixed; + if (value <= 0) { // <= instead of == to silence a warning. 
+ if (precision <= 0 || !fixed) { + buf.push_back('0'); + return 0; + } + buf.try_resize(to_unsigned(precision)); + fill_n(buf.data(), precision, '0'); + return -precision; + } + + int exp = 0; + bool use_dragon = true; + unsigned dragon_flags = 0; + if (!is_fast_float()) { + const auto inv_log2_10 = 0.3010299956639812; // 1 / log2(10) + using info = dragonbox::float_info; + const auto f = basic_fp(converted_value); + // Compute exp, an approximate power of 10, such that + // 10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1). + // This is based on log10(value) == log2(value) / log2(10) and approximation + // of log2(value) by e + num_fraction_bits idea from double-conversion. + exp = static_cast( + std::ceil((f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10)); + dragon_flags = dragon::fixup; + } else if (!is_constant_evaluated() && precision < 0) { + // Use Dragonbox for the shortest format. + if (specs.binary32) { + auto dec = dragonbox::to_decimal(static_cast(value)); + write(buffer_appender(buf), dec.significand); + return dec.exponent; + } + auto dec = dragonbox::to_decimal(static_cast(value)); + write(buffer_appender(buf), dec.significand); + return dec.exponent; + } else { + // Use Grisu + Dragon4 for the given precision: + // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf. + const int min_exp = -60; // alpha in Grisu. + int cached_exp10 = 0; // K in Grisu. 
+ fp normalized = normalize(fp(converted_value)); + const auto cached_pow = get_cached_power( + min_exp - (normalized.e + fp::num_significand_bits), cached_exp10); + normalized = normalized * cached_pow; + gen_digits_handler handler{buf.data(), 0, precision, -cached_exp10, fixed}; + if (grisu_gen_digits(normalized, 1, exp, handler) != digits::error && + !is_constant_evaluated()) { + exp += handler.exp10; + buf.try_resize(to_unsigned(handler.size)); + use_dragon = false; + } else { + exp += handler.size - cached_exp10 - 1; + precision = handler.precision; + } + } + if (use_dragon) { + auto f = basic_fp(); + bool is_predecessor_closer = specs.binary32 + ? f.assign(static_cast(value)) + : f.assign(converted_value); + if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer; + if (fixed) dragon_flags |= dragon::fixed; + // Limit precision to the maximum possible number of significant digits in + // an IEEE754 double because we don't need to generate zeros. + const int max_double_digits = 767; + if (precision > max_double_digits) precision = max_double_digits; + format_dragon(f, dragon_flags, precision, buf, exp); + } + if (!fixed && !specs.showpoint) { + // Remove trailing zeros. + auto num_digits = buf.size(); + while (num_digits > 0 && buf[num_digits - 1] == '0') { + --num_digits; + ++exp; + } + buf.try_resize(num_digits); + } + return exp; +} +template +FMT_CONSTEXPR20 auto write_float(OutputIt out, T value, + basic_format_specs specs, locale_ref loc) + -> OutputIt { + float_specs fspecs = parse_float_type_spec(specs); + fspecs.sign = specs.sign; + if (detail::signbit(value)) { // value < 0 is false for NaN so use signbit. 
+ fspecs.sign = sign::minus; + value = -value; + } else if (fspecs.sign == sign::minus) { + fspecs.sign = sign::none; + } + + if (!detail::isfinite(value)) + return write_nonfinite(out, detail::isnan(value), specs, fspecs); + + if (specs.align == align::numeric && fspecs.sign) { + auto it = reserve(out, 1); + *it++ = detail::sign(fspecs.sign); + out = base_iterator(out, it); + fspecs.sign = sign::none; + if (specs.width != 0) --specs.width; + } + + memory_buffer buffer; + if (fspecs.format == float_format::hex) { + if (fspecs.sign) buffer.push_back(detail::sign(fspecs.sign)); + snprintf_float(convert_float(value), specs.precision, fspecs, buffer); + return write_bytes(out, {buffer.data(), buffer.size()}, + specs); + } + int precision = specs.precision >= 0 || specs.type == presentation_type::none + ? specs.precision + : 6; + if (fspecs.format == float_format::exp) { + if (precision == max_value()) + throw_format_error("number is too big"); + else + ++precision; + } else if (fspecs.format != float_format::fixed && precision == 0) { + precision = 1; + } + if (const_check(std::is_same())) fspecs.binary32 = true; + int exp = format_float(convert_float(value), precision, fspecs, buffer); + fspecs.precision = precision; + auto f = big_decimal_fp{buffer.data(), static_cast(buffer.size()), exp}; + return write_float(out, f, specs, fspecs, loc); +} + +template ::value)> +FMT_CONSTEXPR20 auto write(OutputIt out, T value, + basic_format_specs specs, locale_ref loc = {}) + -> OutputIt { + if (const_check(!is_supported_floating_point(value))) return out; + return specs.localized && write_loc(out, value, specs, loc) + ? 
out + : write_float(out, value, specs, loc); +} + +template ::value)> +FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt { + if (is_constant_evaluated()) + return write(out, value, basic_format_specs()); + if (const_check(!is_supported_floating_point(value))) return out; + + auto fspecs = float_specs(); + if (detail::signbit(value)) { + fspecs.sign = sign::minus; + value = -value; + } + + constexpr auto specs = basic_format_specs(); + using floaty = conditional_t::value, double, T>; + using uint = typename dragonbox::float_info::carrier_uint; + uint mask = exponent_mask(); + if ((bit_cast(value) & mask) == mask) + return write_nonfinite(out, std::isnan(value), specs, fspecs); + + auto dec = dragonbox::to_decimal(static_cast(value)); + return write_float(out, dec, specs, fspecs, {}); +} + +template ::value && + !is_fast_float::value)> +inline auto write(OutputIt out, T value) -> OutputIt { + return write(out, value, basic_format_specs()); +} + +template +auto write(OutputIt out, monostate, basic_format_specs = {}, + locale_ref = {}) -> OutputIt { + FMT_ASSERT(false, ""); + return out; +} + +template +FMT_CONSTEXPR auto write(OutputIt out, basic_string_view value) + -> OutputIt { + auto it = reserve(out, value.size()); + it = copy_str_noinline(value.begin(), value.end(), it); + return base_iterator(out, it); +} + +template ::value)> +constexpr auto write(OutputIt out, const T& value) -> OutputIt { + return write(out, to_string_view(value)); +} + +// FMT_ENABLE_IF() condition separated to workaround an MSVC bug. 
+template < + typename Char, typename OutputIt, typename T, + bool check = + std::is_enum::value && !std::is_same::value && + mapped_type_constant>::value != + type::custom_type, + FMT_ENABLE_IF(check)> +FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt { + return write(out, static_cast>(value)); +} + +template ::value)> +FMT_CONSTEXPR auto write(OutputIt out, T value, + const basic_format_specs& specs = {}, + locale_ref = {}) -> OutputIt { + return specs.type != presentation_type::none && + specs.type != presentation_type::string + ? write(out, value ? 1 : 0, specs, {}) + : write_bytes(out, value ? "true" : "false", specs); +} + +template +FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt { + auto it = reserve(out, 1); + *it++ = value; + return base_iterator(out, it); +} + +template +FMT_CONSTEXPR_CHAR_TRAITS auto write(OutputIt out, const Char* value) + -> OutputIt { + if (!value) { + throw_format_error("string pointer is null"); + } else { + out = write(out, basic_string_view(value)); + } + return out; +} + +template ::value)> +auto write(OutputIt out, const T* value, + const basic_format_specs& specs = {}, locale_ref = {}) + -> OutputIt { + check_pointer_type_spec(specs.type, error_handler()); + return write_ptr(out, bit_cast(value), &specs); +} + +// A write overload that handles implicit conversions. 
+template > +FMT_CONSTEXPR auto write(OutputIt out, const T& value) -> enable_if_t< + std::is_class::value && !is_string::value && + !is_floating_point::value && !std::is_same::value && + !std::is_same().map(value))>::value, + OutputIt> { + return write(out, arg_mapper().map(value)); +} + +template > +FMT_CONSTEXPR auto write(OutputIt out, const T& value) + -> enable_if_t::value == type::custom_type, + OutputIt> { + using formatter_type = + conditional_t::value, + typename Context::template formatter_type, + fallback_formatter>; + auto ctx = Context(out, {}, {}); + return formatter_type().format(value, ctx); +} + +// An argument visitor that formats the argument and writes it via the output +// iterator. It's a class and not a generic lambda for compatibility with C++11. +template struct default_arg_formatter { + using iterator = buffer_appender; + using context = buffer_context; + + iterator out; + basic_format_args args; + locale_ref loc; + + template auto operator()(T value) -> iterator { + return write(out, value); + } + auto operator()(typename basic_format_arg::handle h) -> iterator { + basic_format_parse_context parse_ctx({}); + context format_ctx(out, args, loc); + h.format(parse_ctx, format_ctx); + return format_ctx.out(); + } +}; + +template struct arg_formatter { + using iterator = buffer_appender; + using context = buffer_context; + + iterator out; + const basic_format_specs& specs; + locale_ref locale; + + template + FMT_CONSTEXPR FMT_INLINE auto operator()(T value) -> iterator { + return detail::write(out, value, specs, locale); + } + auto operator()(typename basic_format_arg::handle) -> iterator { + // User-defined types are handled separately because they require access + // to the parse context. 
+ return out; + } +}; + +template struct custom_formatter { + basic_format_parse_context& parse_ctx; + buffer_context& ctx; + + void operator()( + typename basic_format_arg>::handle h) const { + h.format(parse_ctx, ctx); + } + template void operator()(T) const {} +}; + +template class width_checker { + public: + explicit FMT_CONSTEXPR width_checker(ErrorHandler& eh) : handler_(eh) {} + + template ::value)> + FMT_CONSTEXPR auto operator()(T value) -> unsigned long long { + if (is_negative(value)) handler_.on_error("negative width"); + return static_cast(value); + } + + template ::value)> + FMT_CONSTEXPR auto operator()(T) -> unsigned long long { + handler_.on_error("width is not integer"); + return 0; + } + + private: + ErrorHandler& handler_; +}; + +template class precision_checker { + public: + explicit FMT_CONSTEXPR precision_checker(ErrorHandler& eh) : handler_(eh) {} + + template ::value)> + FMT_CONSTEXPR auto operator()(T value) -> unsigned long long { + if (is_negative(value)) handler_.on_error("negative precision"); + return static_cast(value); + } + + template ::value)> + FMT_CONSTEXPR auto operator()(T) -> unsigned long long { + handler_.on_error("precision is not integer"); + return 0; + } + + private: + ErrorHandler& handler_; +}; + +template