1 files changed, 54 insertions, 46 deletions
diff --git a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp
index 39e5525c3b..7463f7e20b 100644
--- a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp
+++ b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2023 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -27,15 +27,15 @@
 /**
  * @brief Compute the integer linear interpolation of two color endpoints.
  *
- * @param decode_mode   The ASTC profile (linear or sRGB)
+ * @param u8_mask       The mask for lanes using decode_unorm8 rather than decode_f16.
  * @param color0        The endpoint0 color.
  * @param color1        The endpoint1 color.
- * @param weights        The interpolation weight (between 0 and 64).
+ * @param weights       The interpolation weight (between 0 and 64).
  *
  * @return The interpolated color.
  */
 static vint4 lerp_color_int(
-	astcenc_profile decode_mode,
+	vmask4 u8_mask,
 	vint4 color0,
 	vint4 color1,
 	vint4 weights
@@ -43,24 +43,18 @@ static vint4 lerp_color_int(
 	vint4 weight1 = weights;
 	vint4 weight0 = vint4(64) - weight1;
 
-	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-	{
-		color0 = asr<8>(color0);
-		color1 = asr<8>(color1);
-	}
-
 	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
 	color = asr<6>(color);
 
-	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-	{
-		color = color * vint4(257);
-	}
+	// For decode_unorm8 values force the codec to bit replicate. This allows the
+	// rest of the codec to assume the full 0xFFFF range for everything and ignore
+	// the decode_mode setting
+	vint4 color_u8 = asr<8>(color) * vint4(257);
+	color = select(color, color_u8, u8_mask);
 
 	return color;
 }
 
-
 /**
  * @brief Convert integer color value into a float value for the decoder.
  *
@@ -104,10 +98,10 @@ void unpack_weights(
 	if (!is_dual_plane)
 	{
 		// Build full 64-entry weight lookup table
-		vint4 tab0(reinterpret_cast<const int*>(scb.weights +  0));
-		vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16));
-		vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32));
-		vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48));
+		vint4 tab0 = vint4::load(scb.weights +  0);
+		vint4 tab1 = vint4::load(scb.weights + 16);
+		vint4 tab2 = vint4::load(scb.weights + 32);
+		vint4 tab3 = vint4::load(scb.weights + 48);
 
 		vint tab0p, tab1p, tab2p, tab3p;
 		vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
@@ -134,14 +128,14 @@ void unpack_weights(
 	{
 		// Build a 32-entry weight lookup table per plane
 		// Plane 1
-		vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights +  0));
-		vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16));
+		vint4 tab0_plane1 = vint4::load(scb.weights +  0);
+		vint4 tab1_plane1 = vint4::load(scb.weights + 16);
 		vint tab0_plane1p, tab1_plane1p;
 		vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
 
 		// Plane 2
-		vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32));
-		vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48));
+		vint4 tab0_plane2 = vint4::load(scb.weights + 32);
+		vint4 tab1_plane2 = vint4::load(scb.weights + 48);
 		vint tab0_plane2p, tab1_plane2p;
 		vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
 
@@ -229,12 +223,13 @@ void decompress_symbolic_block(
 		{
 			vint4 colori(scb.constant_color);
 
-			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
-			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
-			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
-			{
-				colori = asr<8>(colori) * 257;
-			}
+			// Determine the UNORM8 rounding on the decode
+			vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
+
+			// The real decoder would just use the top 8 bits, but we rescale
+			// in to a 16-bit value that rounds correctly.
+			vint4 colori_u8 = asr<8>(colori) * 257;
+			colori = select(colori, colori_u8, u8_mask);
 
 			vint4 colorf16 = unorm16_to_sf16(colori);
 			color = float16_to_float(colorf16);
@@ -289,6 +284,8 @@ void decompress_symbolic_block(
 	int plane2_component = scb.plane2_component;
 	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
 
+	vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
+
 	for (int i = 0; i < partition_count; i++)
 	{
 		// Decode the color endpoints for this partition
@@ -310,7 +307,7 @@ void decompress_symbolic_block(
 		{
 			int tix = pi.texels_of_partition[i][j];
 			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
-			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
+			vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
 			vfloat4 colorf = decode_texel(color, lns_mask);
 
 			blk.data_r[tix] = colorf.lane<0>();
@@ -365,12 +362,14 @@ float compute_symbolic_block_difference_2plane(
 	                       rgb_lns, a_lns,
 	                       ep0, ep1);
 
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
 	// Unpack and compute error for each texel in the partition
 	unsigned int texel_count = bsd.texel_count;
 	for (unsigned int i = 0; i < texel_count; i++)
 	{
 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
-		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
+		vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
 
 		vfloat4 color = int_to_float(colori);
 		vfloat4 oldColor = blk.texel(i);
@@ -444,6 +443,8 @@ float compute_symbolic_block_difference_1plane(
 	int plane1_weights[BLOCK_MAX_TEXELS];
 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
 
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
 	vfloat4 summa = vfloat4::zero();
 	for (unsigned int i = 0; i < partition_count; i++)
 	{
@@ -464,7 +465,7 @@ float compute_symbolic_block_difference_1plane(
 		for (unsigned int j = 0; j < texel_count; j++)
 		{
 			unsigned int tix = pi.texels_of_partition[i][j];
-			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
+			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
 			                              vint4(plane1_weights[tix]));
 
 			vfloat4 color = int_to_float(colori);
@@ -532,7 +533,7 @@ float compute_symbolic_block_difference_1plane_1partition(
 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
 
 	// Unquantize and undecimate the weights
-	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
 	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
 
 	// Decode the color endpoints for this partition
@@ -547,19 +548,12 @@ float compute_symbolic_block_difference_1plane_1partition(
 	                       rgb_lns, a_lns,
 	                       ep0, ep1);
 
-
-	// Pre-shift sRGB so things round correctly
-	if (config.profile == ASTCENC_PRF_LDR_SRGB)
-	{
-		ep0 = asr<8>(ep0);
-		ep1 = asr<8>(ep1);
-	}
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
 
 	// Unpack and compute error for each texel in the partition
 	vfloatacc summav = vfloatacc::zero();
 
 	vint lane_id = vint::lane_id();
-	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
 
 	unsigned int texel_count = bsd.texel_count;
 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
@@ -578,11 +572,25 @@ float compute_symbolic_block_difference_1plane_1partition(
 		vint ep0_b = vint(ep0.lane<2>()) * weight0;
 		vint ep0_a = vint(ep0.lane<3>()) * weight0;
 
-		// Shift so things round correctly
-		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
-		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
-		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
-		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
+		// Combine contributions
+		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
+		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
+		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
+		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
+
+		// If using a U8 decode mode bit replicate top 8 bits
+		// so rest of codec can assume 0xFFFF max range everywhere
+		vint colori_r8 = asr<8>(colori_r) * vint(257);
+		colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
+
+		vint colori_g8 = asr<8>(colori_g) * vint(257);
+		colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
+
+		vint colori_b8 = asr<8>(colori_b) * vint(257);
+		colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
+
+		vint colori_a8 = asr<8>(colori_a) * vint(257);
+		colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
 
 		// Compute color diff
 		vfloat color_r = int_to_float(colori_r);