Diffstat (limited to 'thirdparty/astcenc/astcenc_vecmathlib_neon_4.h')
-rw-r--r--  thirdparty/astcenc/astcenc_vecmathlib_neon_4.h  |  49
1 file changed, 38 insertions, 11 deletions
diff --git a/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
index e742eae6cb..42545e7562 100644
--- a/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
+++ b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
-// Copyright 2019-2022 Arm Limited
+// Copyright 2019-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -38,6 +38,7 @@
#endif
#include <cstdio>
+#include <cstring>
// ============================================================================
// vfloat4 data type
@@ -270,6 +271,16 @@ struct vint4
}
/**
+ * @brief Factory that returns a vector loaded from unaligned memory.
+ */
+ static ASTCENC_SIMD_INLINE vint4 load(const uint8_t* p)
+ {
+ vint4 data;
+ std::memcpy(&data.m, p, 4 * sizeof(int));
+ return data;
+ }
+
+ /**
* @brief Factory that returns a vector loaded from 16B aligned memory.
*/
static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
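Note: the new load() factory goes through std::memcpy rather than a reinterpreting pointer cast, which sidesteps alignment and strict-aliasing undefined behaviour while still compiling to a single unaligned vector load on AArch64. A minimal usage sketch, assuming the astcenc_vecmathlib.h umbrella header is on the include path (the function name is hypothetical):

    #include <cstdint>
    #include "astcenc_vecmathlib.h"

    // Hypothetical caller: fetch four packed int32 values from a byte
    // stream that carries no alignment guarantee.
    vint4 fetch_texel(const uint8_t* src)
    {
        return vint4::load(src);  // memcpy-based unaligned load
    }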
@@ -348,9 +359,9 @@ struct vmask4
/**
* @brief Get the scalar from a single lane.
*/
- template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const
+ template <int32_t l> ASTCENC_SIMD_INLINE bool lane() const
{
- return vgetq_lane_u32(m, l);
+ return vgetq_lane_u32(m, l) != 0;
}
/**
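Note: with lane() returning bool instead of the raw 0/0xFFFFFFFF lane word, callers can branch on a mask lane directly. A small sketch of the intended call pattern (the helper name is hypothetical):

    #include "astcenc_vecmathlib.h"

    // True when lane 0 of the comparison mask is set; no comparison
    // against an all-ones constant is needed any more.
    bool first_lane_nonzero(vint4 a)
    {
        vmask4 m = a != vint4::zero();
        return m.lane<0>();
    }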
@@ -585,6 +596,14 @@ ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
}
/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint4 a, uint8_t* p)
+{
+ std::memcpy(p, &a.m, sizeof(int) * 4);
+}
+
+/**
* @brief Store lowest N (vector width) bytes into an unaligned address.
*/
ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
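Note: the byte-pointer store overload mirrors the unaligned load() factory above; std::memcpy is the portable spelling of an unaligned 16-byte write. A hedged round-trip sketch (function name hypothetical):

    #include <cstdint>
    #include "astcenc_vecmathlib.h"

    // Store four lanes into an unaligned byte buffer and read them back.
    void roundtrip()
    {
        uint8_t buffer[16];
        vint4 v(10, 20, 30, 40);
        store(v, buffer);               // unaligned byte-pointer store
        vint4 w = vint4::load(buffer);  // w now holds 10, 20, 30, 40
    }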
@@ -849,7 +868,7 @@ ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
*/
ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
{
- a = round(a);
+ a = a + vfloat4(0.5f);
return vint4(vcvtq_s32_f32(a.m));
}
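Note: vcvtq_s32_f32 truncates toward zero, so adding 0.5 first implements round-to-nearest with one cheap add, where the old code paid for a separate round() (round-to-nearest-even) before the convert. The two only differ on exact .5 ties, and the add form assumes non-negative inputs; both appear acceptable for how this helper is used here. A worked illustration (values are examples only):

    #include "astcenc_vecmathlib.h"

    // 1.49 + 0.5 = 1.99 -> truncates to 1
    // 1.50 + 0.5 = 2.00 -> truncates to 2 (ties round up, not to even)
    vint4 example()
    {
        return float_to_int_rtn(vfloat4(1.49f, 1.5f, 2.5f, 7.99f));
        // lanes: 1, 2, 3, 8
    }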
@@ -1028,30 +1047,38 @@ ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
}
/**
+ * @brief Store a single vector lane to an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_lane(uint8_t* base, int data)
+{
+ std::memcpy(base, &data, sizeof(int));
+}
+
+/**
* @brief Store a vector, skipping masked lanes.
*
* All masked lanes must be at the end of vector, after all non-masked lanes.
*/
-ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
+ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint4 data, vmask4 mask)
{
if (mask.lane<3>())
{
store(data, base);
}
else if (mask.lane<2>())
{
- base[0] = data.lane<0>();
- base[1] = data.lane<1>();
- base[2] = data.lane<2>();
+ store_lane(base + 0, data.lane<0>());
+ store_lane(base + 4, data.lane<1>());
+ store_lane(base + 8, data.lane<2>());
}
else if (mask.lane<1>())
{
- base[0] = data.lane<0>();
- base[1] = data.lane<1>();
+ store_lane(base + 0, data.lane<0>());
+ store_lane(base + 4, data.lane<1>());
}
else if (mask.lane<0>())
{
- base[0] = data.lane<0>();
+ store_lane(base + 0, data.lane<0>());
}
}
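Note: switching the base pointer to uint8_t* means every partial write now goes through store_lane()'s 4-byte std::memcpy, so masked stores are safe at any alignment; the documented precondition that active lanes are contiguous from lane 0 still holds. A usage sketch (names and values are illustrative):

    #include <cstdint>
    #include "astcenc_vecmathlib.h"

    // Write only the first three lanes; lane 3 of the mask is inactive,
    // so bytes 12..15 of dst are left untouched.
    void write_three_lanes(uint8_t* dst)
    {
        vint4 data(1, 2, 3, 4);
        vmask4 mask(true, true, true, false);
        store_lanes_masked(dst, data, mask);  // writes 12 bytes
    }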