// Copyright 2019 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include // std::isfinite #include "hwy/base.h" #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/convert_test.cc" #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" #include "hwy/nanobenchmark.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { namespace { template size_t DeduceN(Simd) { return N; } template struct TestRebind { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const Rebind dto; const size_t N = Lanes(d); HWY_ASSERT(N <= MaxLanes(d)); const size_t NTo = Lanes(dto); if (NTo != N) { HWY_ABORT("u%zu -> u%zu: lanes %zu %zu pow2 %d %d cap %zu %zu\n", 8 * sizeof(T), 8 * sizeof(ToT), N, NTo, d.Pow2(), dto.Pow2(), DeduceN(d), DeduceN(dto)); } } }; // Lane count remains the same when we rebind to smaller/equal/larger types. 
HWY_NOINLINE void TestAllRebind() { #if HWY_HAVE_INTEGER64 ForShrinkableVectors, 3>()(uint64_t()); #endif // HWY_HAVE_INTEGER64 ForShrinkableVectors, 2>()(uint32_t()); ForShrinkableVectors, 1>()(uint16_t()); ForPartialVectors>()(uint8_t()); ForExtendableVectors, 1>()(uint8_t()); ForExtendableVectors, 2>()(uint8_t()); #if HWY_HAVE_INTEGER64 ForExtendableVectors, 3>()(uint8_t()); #endif // HWY_HAVE_INTEGER64 } template struct TestPromoteTo { template HWY_NOINLINE void operator()(T /*unused*/, D from_d) { static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); const Rebind to_d; const size_t N = Lanes(from_d); auto from = AllocateAligned(N); auto expected = AllocateAligned(N); HWY_ASSERT(from && expected); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size expected[i] = from[i]; } HWY_ASSERT_VEC_EQ(to_d, expected.get(), PromoteTo(to_d, Load(from_d, from.get()))); } } }; HWY_NOINLINE void TestAllPromoteTo() { const ForPromoteVectors, 1> to_u16div2; to_u16div2(uint8_t()); const ForPromoteVectors, 2> to_u32div4; to_u32div4(uint8_t()); const ForPromoteVectors, 1> to_u32div2; to_u32div2(uint16_t()); const ForPromoteVectors, 1> to_i16div2; to_i16div2(uint8_t()); to_i16div2(int8_t()); const ForPromoteVectors, 1> to_i32div2; to_i32div2(uint16_t()); to_i32div2(int16_t()); const ForPromoteVectors, 2> to_i32div4; to_i32div4(uint8_t()); to_i32div4(int8_t()); // Must test f16/bf16 separately because we can only load/store/convert them. 
#if HWY_HAVE_INTEGER64 const ForPromoteVectors, 1> to_u64div2; to_u64div2(uint32_t()); const ForPromoteVectors, 1> to_i64div2; to_i64div2(int32_t()); to_i64div2(uint32_t()); const ForPromoteVectors, 2> to_u64div4; to_u64div4(uint16_t()); const ForPromoteVectors, 2> to_i64div4; to_i64div4(int16_t()); to_i64div4(uint16_t()); const ForPromoteVectors, 3> to_u64div8; to_u64div8(uint8_t()); const ForPromoteVectors, 3> to_i64div8; to_i64div8(int8_t()); to_i64div8(uint8_t()); #endif #if HWY_HAVE_FLOAT64 const ForPromoteVectors, 1> to_f64div2; to_f64div2(int32_t()); to_f64div2(uint32_t()); to_f64div2(float()); #endif } template struct TestPromoteUpperLowerTo { template HWY_NOINLINE void operator()(T /*unused*/, D from_d) { static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); const Repartition to_d; const size_t N = Lanes(from_d); auto from = AllocateAligned(N); auto expected = AllocateAligned(N / 2); HWY_ASSERT(from && expected); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size } for (size_t i = 0; i < N / 2; ++i) { expected[i] = from[N / 2 + i]; } HWY_ASSERT_VEC_EQ(to_d, expected.get(), PromoteUpperTo(to_d, Load(from_d, from.get()))); for (size_t i = 0; i < N / 2; ++i) { expected[i] = from[i]; } HWY_ASSERT_VEC_EQ(to_d, expected.get(), PromoteLowerTo(to_d, Load(from_d, from.get()))); } } }; HWY_NOINLINE void TestAllPromoteUpperLowerTo() { const ForShrinkableVectors, 1> to_u16div2; to_u16div2(uint8_t()); const ForShrinkableVectors, 1> to_u32div2; to_u32div2(uint16_t()); const ForShrinkableVectors, 1> to_i16div2; to_i16div2(uint8_t()); to_i16div2(int8_t()); const ForShrinkableVectors, 1> to_i32div2; to_i32div2(uint16_t()); to_i32div2(int16_t()); // Must test f16/bf16 separately because we can only load/store/convert them. 
#if HWY_HAVE_INTEGER64 const ForShrinkableVectors, 1> to_u64div2; to_u64div2(uint32_t()); const ForShrinkableVectors, 1> to_i64div2; to_i64div2(int32_t()); to_i64div2(uint32_t()); #endif // HWY_HAVE_INTEGER64 #if HWY_HAVE_FLOAT64 const ForShrinkableVectors, 1> to_f64div2; to_f64div2(int32_t()); to_f64div2(uint32_t()); to_f64div2(float()); #endif // HWY_HAVE_FLOAT64 } template struct TestPromoteOddEvenTo { static HWY_INLINE ToT CastValueToWide(hwy::FloatTag /* to_type_tag */, hwy::FloatTag /* from_type_tag */, hwy::float16_t val) { return static_cast(F32FromF16(val)); } static HWY_INLINE ToT CastValueToWide(hwy::FloatTag /* to_type_tag */, hwy::SpecialTag /* from_type_tag */, hwy::bfloat16_t val) { return static_cast(F32FromBF16(val)); } template static HWY_INLINE ToT CastValueToWide(hwy::SignedTag /* to_type_tag */, hwy::FloatTag /* from_type_tag */, T val) { const T kMinInRangeVal = ConvertScalarTo(LimitsMin()); const T kMinOutOfRangePosVal = ConvertScalarTo(-kMinInRangeVal); if (val < kMinInRangeVal) { return LimitsMin(); } else if (val >= kMinOutOfRangePosVal) { return LimitsMax(); } else { return static_cast(val); } } template static HWY_INLINE ToT CastValueToWide(hwy::UnsignedTag /* to_type_tag */, hwy::FloatTag /* from_type_tag */, T val) { const T kMinOutOfRangePosVal = ConvertScalarTo(-ConvertScalarTo(LimitsMin>()) * ConvertScalarTo(2)); if (val < ConvertScalarTo(0)) { return ToT{0}; } else if (val >= kMinOutOfRangePosVal) { return LimitsMax(); } else { return static_cast(val); } } template static HWY_INLINE ToT CastValueToWide(ToTypeTag /* to_type_tag */, FromTypeTag /* from_type_tag */, T val) { return static_cast(val); } template static HWY_INLINE ToT CastValueToWide(T val) { using FromT = RemoveCvRef; return CastValueToWide(hwy::TypeTag(), hwy::TypeTag(), static_cast(val)); } template HWY_NOINLINE void operator()(T /*unused*/, D from_d) { static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); const Repartition to_d; const size_t N = 
Lanes(from_d); HWY_ASSERT(N >= 2); auto from = AllocateAligned(N); auto expected = AllocateAligned(N / 2); HWY_ASSERT(from && expected); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { from[i] = RandomFiniteValue(&rng); } #if HWY_TARGET != HWY_SCALAR for (size_t i = 0; i < N / 2; ++i) { expected[i] = CastValueToWide(from[i * 2 + 1]); } HWY_ASSERT_VEC_EQ(to_d, expected.get(), PromoteOddTo(to_d, Load(from_d, from.get()))); #endif for (size_t i = 0; i < N / 2; ++i) { expected[i] = CastValueToWide(from[i * 2]); } HWY_ASSERT_VEC_EQ(to_d, expected.get(), PromoteEvenTo(to_d, Load(from_d, from.get()))); } } }; HWY_NOINLINE void TestAllPromoteOddEvenTo() { const ForShrinkableVectors, 1> to_u16div2; to_u16div2(uint8_t()); const ForShrinkableVectors, 1> to_u32div2; to_u32div2(uint16_t()); const ForShrinkableVectors, 1> to_i16div2; to_i16div2(uint8_t()); to_i16div2(int8_t()); const ForShrinkableVectors, 1> to_i32div2; to_i32div2(uint16_t()); to_i32div2(int16_t()); const ForShrinkableVectors, 1> to_f32div2; to_f32div2(hwy::float16_t()); to_f32div2(hwy::bfloat16_t()); #if HWY_HAVE_INTEGER64 const ForShrinkableVectors, 1> to_u64div2; to_u64div2(uint32_t()); to_u64div2(float()); const ForShrinkableVectors, 1> to_i64div2; to_i64div2(int32_t()); to_i64div2(uint32_t()); to_i64div2(float()); #endif // HWY_HAVE_INTEGER64 #if HWY_HAVE_FLOAT64 const ForShrinkableVectors, 1> to_f64div2; to_f64div2(int32_t()); to_f64div2(uint32_t()); to_f64div2(float()); #endif // HWY_HAVE_FLOAT64 // The following are not supported by the underlying PromoteTo: // to_u16div2(int8_t()); // to_u32div2(int16_t()); // to_u64div2(int32_t()); } template bool IsFinite(T t) { return std::isfinite(t); } // Wrapper avoids calling std::isfinite for integer types (ambiguous). 
template bool IsFinite(T /*unused*/) { return true; } template AlignedFreeUniquePtr F16TestCases(D d, size_t& padded) { const float test_cases[] = { // +/- 1 1.0f, -1.0f, // +/- 0 0.0f, -0.0f, // near 0 0.25f, -0.25f, // +/- integer 4.0f, -32.0f, // positive near limit 65472.0f, 65504.0f, // negative near limit -65472.0f, -65504.0f, // positive +/- delta 2.00390625f, 3.99609375f, // negative +/- delta -2.00390625f, -3.99609375f, // No infinity/NaN - implementation-defined due to Arm. }; constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); const size_t N = Lanes(d); HWY_ASSERT(N != 0); padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors auto in = AllocateAligned(padded); auto expected = AllocateAligned(padded); HWY_ASSERT(in && expected); size_t i = 0; for (; i < kNumTestCases; ++i) { // Ensure the value can be exactly represented as binary16. in[i] = F32FromF16(F16FromF32(test_cases[i])); } for (; i < padded; ++i) { in[i] = 0.0f; } return in; } // This minimal interface is always supported, even if !HWY_HAVE_FLOAT16. struct TestF16 { template HWY_NOINLINE void operator()(TF32 /*t*/, DF32 df32) { size_t padded; const size_t N = Lanes(df32); // same count for f16 HWY_ASSERT(N != 0); auto in = F16TestCases(df32, padded); using TF16 = hwy::float16_t; const Rebind df16; #if HWY_TARGET != HWY_SCALAR const Twice df16t; #endif const RebindToUnsigned du16; // Extra Load/Store to ensure they are usable. auto temp16 = AllocateAligned(N); HWY_ASSERT(temp16); // Extra Zero/BitCast to ensure they are usable. Neg is tested in // arithmetic_test. 
const Vec v0_u16 = BitCast(du16, Zero(df16)); #if HWY_TARGET == HWY_SCALAR const Vec v0 = BitCast(df32, ZipLower(v0_u16, v0_u16)); #else const Vec v0 = BitCast(df32, ZeroExtendVector(Twice(), v0_u16)); #endif for (size_t i = 0; i < padded; i += N) { const Vec loaded = Or(Load(df32, &in[i]), v0); const Vec v16 = DemoteTo(df16, loaded); Store(v16, df16, temp16.get()); HWY_ASSERT_VEC_EQ(df32, loaded, PromoteTo(df32, Load(df16, temp16.get()))); #if HWY_TARGET == HWY_SCALAR const Vec v16L = v16; #else const Vec v16L = Combine(df16t, Zero(df16), v16); #endif HWY_ASSERT_VEC_EQ(df32, loaded, PromoteLowerTo(df32, v16L)); #if HWY_TARGET != HWY_SCALAR const Vec v16H = Combine(df16t, v16, Zero(df16)); HWY_ASSERT_VEC_EQ(df32, loaded, PromoteUpperTo(df32, v16H)); #endif } } }; HWY_NOINLINE void TestAllF16() { ForDemoteVectors()(float()); } // This minimal interface is always supported, even if !HWY_HAVE_FLOAT16. struct TestF16FromF64 { template HWY_NOINLINE void operator()(TF64 /*t*/, DF64 df64) { #if HWY_HAVE_FLOAT64 size_t padded; const size_t N = Lanes(df64); // same count for f16 and f32 HWY_ASSERT(N != 0); const Rebind df16; const Rebind df32; const RebindToUnsigned du64; using VF16 = Vec; using VF32 = Vec; using VF64 = Vec; using VU64 = Vec; auto f32_in = F16TestCases(df32, padded); const VU64 u64_zero = Set(du64, static_cast(Unpredictable1() - 1)); const VF64 f64_zero = BitCast(df64, u64_zero); const VF16 f16_zero = ResizeBitCast(df16, u64_zero); for (size_t i = 0; i < padded; i += N) { const VF32 vf32 = Load(df32, f32_in.get() + i); const VF16 vf16 = Or(DemoteTo(df16, vf32), f16_zero); const VF64 vf64 = Or(PromoteTo(df64, vf32), f64_zero); HWY_ASSERT_VEC_EQ(df16, vf16, DemoteTo(df16, vf64)); HWY_ASSERT_VEC_EQ(df64, vf64, PromoteTo(df64, vf16)); } #else (void)df64; #endif } }; HWY_NOINLINE void TestAllF16FromF64() { #if HWY_HAVE_FLOAT64 ForDemoteVectors()(double()); #endif } template AlignedFreeUniquePtr BF16TestCases( D d, size_t& padded, AlignedFreeUniquePtr& expected) 
{ const float test_cases[] = { // +/- 1 1.0f, -1.0f, // +/- 0 0.0f, -0.0f, // near 0 0.25f, -0.25f, // +/- integer 4.0f, -32.0f, // positive near limit 3.389531389251535E38f, 1.99384199368e+38f, // negative near limit -3.389531389251535E38f, -1.99384199368e+38f, // positive +/- delta 2.015625f, 3.984375f, // negative +/- delta -2.015625f, -3.984375f, // The above have all excess mantissa bits zero, such that // PromoteTo(DemoteTo) matches the input. Also test round to nearest even: 1.0039063f, // only below is set 1.0117188f, // LSB and below are set 1.9921875f, // all bits except below are set 1.9960938f, // all bits and below are set }; constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); const size_t N = Lanes(d); HWY_ASSERT(N != 0); padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors auto in = AllocateAligned(padded); expected = AllocateAligned(padded); HWY_ASSERT(in && expected); size_t i = 0; for (; i < kNumTestCases; ++i) { in[i] = test_cases[i] * static_cast(hwy::Unpredictable1()); expected[i] = hwy::ConvertScalarTo( hwy::ConvertScalarTo(in[i])); } for (; i < padded; ++i) { in[i] = expected[i] = 0.0f; } return in; } struct TestBF16 { template HWY_NOINLINE void operator()(TF32 /*t*/, DF32 df32) { size_t padded; AlignedFreeUniquePtr expected; const auto in = BF16TestCases(df32, padded, expected); using TBF16 = bfloat16_t; #if HWY_TARGET == HWY_SCALAR const Rebind dbf16; // avoid 4/2 = 2 lanes #else const Repartition dbf16; #endif const Half dbf16_half; using VF = Vec; using VBF16 = Vec; using VBF16H = Vec; const size_t N = Lanes(df32); HWY_ASSERT(Lanes(dbf16_half) == N); auto temp16 = AllocateAligned(N); HWY_ASSERT(temp16); for (size_t i = 0; i < padded; i += N) { const VF vin = Load(df32, &in[i]); const VF vexp = Load(df32, &expected[i]); const VBF16H v16 = DemoteTo(dbf16_half, vin); Store(v16, dbf16_half, temp16.get()); const VBF16H v16_loaded = Load(dbf16_half, temp16.get()); HWY_ASSERT_VEC_EQ(df32, vexp, 
PromoteTo(df32, v16_loaded)); #if HWY_TARGET == HWY_SCALAR const VBF16 v16L = v16_loaded; #else const VBF16 v16L = Combine(dbf16, Zero(dbf16_half), v16_loaded); #endif HWY_ASSERT_VEC_EQ(df32, vexp, PromoteLowerTo(df32, v16L)); #if HWY_TARGET != HWY_SCALAR const VBF16 v16H = Combine(dbf16, v16_loaded, Zero(dbf16_half)); HWY_ASSERT_VEC_EQ(df32, vexp, PromoteUpperTo(df32, v16H)); #endif } } }; HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors()(float()); } struct TestConvertU8 { template HWY_NOINLINE void operator()(T /*unused*/, const D du32) { const Rebind du8; const auto wrap = Set(du32, 0xFF); HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap))); HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F), U8FromU32(And(Iota(du32, 0x7F), wrap))); } }; HWY_NOINLINE void TestAllConvertU8() { ForDemoteVectors()(uint32_t()); } class TestIntFromFloat { template static HWY_NOINLINE void TestHuge(TF /*unused*/, const DF df) { using TI = MakeSigned; const Rebind di; // Huge positive HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax()), ConvertTo(di, Set(df, HighestValue()))); // Huge negative HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin()), ConvertTo(di, Set(df, LowestValue()))); } template static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) { using TI = MakeSigned; const Rebind di; constexpr size_t kBits = sizeof(TF) * 8; // Powers of two, plus offsets to set some mantissa bits. const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)}; for (int sign = 0; sign < 2; ++sign) { for (size_t shift = 0; shift < kBits - 1; ++shift) { for (int64_t ofs : ofs_table) { const int64_t mag = (int64_t{1} << shift) + ofs; const int64_t val = sign ? mag : -mag; const TF val_f = ConvertScalarTo(val); // Convert expected value to account for loss of precision. 
HWY_ASSERT_VEC_EQ( di, Set(di, static_cast(ConvertScalarTo(val_f))), ConvertTo(di, Set(df, val_f))); } } } } template static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) { using TI = MakeSigned; const Rebind di; const size_t N = Lanes(df); // TF does not have enough precision to represent TI. const double min = static_cast(LimitsMin()); const double max = static_cast(LimitsMax()); // Also check random values. auto from = AllocateAligned(N); auto expected = AllocateAligned(N); HWY_ASSERT(from && expected); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { for (size_t i = 0; i < N; ++i) { do { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size } while (!ScalarIsFinite(from[i])); if (from[i] >= max) { expected[i] = LimitsMax(); } else if (from[i] <= min) { expected[i] = LimitsMin(); } else { expected[i] = static_cast(from[i]); } } HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Load(df, from.get()))); } } public: template HWY_NOINLINE void operator()(TF tf, const DF df) { using TI = MakeSigned; const Rebind di; const size_t N = Lanes(df); // Integer positive HWY_ASSERT_VEC_EQ(di, Iota(di, 4), ConvertTo(di, Iota(df, 4.0))); // Integer negative HWY_ASSERT_VEC_EQ(di, Iota(di, -static_cast(N)), ConvertTo(di, Iota(df, -ConvertScalarTo(N)))); // Above positive HWY_ASSERT_VEC_EQ(di, Iota(di, 2), ConvertTo(di, Iota(df, 2.1))); // Below positive HWY_ASSERT_VEC_EQ(di, Iota(di, 3), ConvertTo(di, Iota(df, 3.9))); const double neg = -static_cast(N + 1); const double eps = ConvertScalarTo(Epsilon()) * static_cast(N); // Above negative HWY_ASSERT_VEC_EQ(di, Iota(di, -static_cast(N)), ConvertTo(di, Iota(df, ConvertScalarTo(neg + eps)))); // Below negative HWY_ASSERT_VEC_EQ(di, Iota(di, -static_cast(N + 1)), ConvertTo(di, Iota(df, ConvertScalarTo(neg - eps)))); TestHuge(tf, df); TestPowers(tf, df); TestRandom(tf, df); } }; HWY_NOINLINE void TestAllIntFromFloat() { ForFloatTypes(ForPartialVectors()); } struct 
TestMaskedIntFromFloat { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TI = MakeSigned; const Rebind di; const size_t N = Lanes(df); auto expected = AllocateAligned(N); auto bool_lanes = AllocateAligned(N); HWY_ASSERT(expected && bool_lanes); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); } const auto mask_i = Load(di, bool_lanes.get()); const auto mask = RebindMask(di, Gt(mask_i, Zero(di))); // This requires a test different to that in TestMaskedFloatFromInt and // TestMaskedFloatFromUint, due to differences in saturation handling // between ConvertTo() and static_cast<> HWY_ASSERT_VEC_EQ(di, IfThenElseZero(mask, Set(di, 1)), MaskedConvertTo(mask, di, Set(df, 1))); } } }; struct TestMaskedFloatFromInt { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TI = MakeSigned; const RebindToSigned di; const size_t N = Lanes(df); auto from = AllocateAligned(N); auto expected = AllocateAligned(N); auto bool_lanes = AllocateAligned(N); HWY_ASSERT(from && expected && bool_lanes); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size bool_lanes[i] = (Random32(&rng) & 1024) ? 
TI(1) : TI(0); if (bool_lanes[i]) { expected[i] = ConvertScalarTo(from[i]); } else { expected[i] = ConvertScalarTo(0); } } const auto mask_i = Load(di, bool_lanes.get()); const auto mask = RebindMask(df, Gt(mask_i, Zero(di))); const auto v1 = Load(di, from.get()); // Float from int HWY_ASSERT_VEC_EQ(df, expected.get(), MaskedConvertTo(mask, df, v1)); } } }; struct TestMaskedFloatFromUint { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TI = MakeUnsigned; const RebindToUnsigned di; const size_t N = Lanes(df); auto from = AllocateAligned(N); auto expected = AllocateAligned(N); auto bool_lanes = AllocateAligned(N); HWY_ASSERT(from && expected && bool_lanes); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); if (bool_lanes[i]) { expected[i] = ConvertScalarTo(from[i]); } else { expected[i] = ConvertScalarTo(0); } } const auto mask_i = Load(di, bool_lanes.get()); const auto mask = RebindMask(df, Gt(mask_i, Zero(di))); const auto v1 = Load(di, from.get()); // Float from int HWY_ASSERT_VEC_EQ(df, expected.get(), MaskedConvertTo(mask, df, v1)); } } }; HWY_NOINLINE void TestAllMaskedConvertTo() { ForFloatTypes(ForPartialVectors()); ForFloatTypes(ForPartialVectors()); ForFloatTypes(ForPartialVectors()); } class TestUintFromFloat { template static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) { using TU = MakeUnsigned; const Rebind du; constexpr size_t kBits = sizeof(TU) * 8; // Powers of two, plus offsets to set some mantissa bits. 
const uint64_t ofs_table[3] = {0ULL, 3ULL << (kBits / 2), 1ULL << (kBits - 15)}; for (int sign = 0; sign < 2; ++sign) { for (size_t shift = 0; shift < kBits - 1; ++shift) { for (uint64_t ofs : ofs_table) { const uint64_t mag = (uint64_t{1} << shift) + ofs; const TF flt_mag = static_cast(mag); const TF flt_val = static_cast(sign ? -flt_mag : flt_mag); const TU expected_result = sign ? TU{0} : static_cast(mag); HWY_ASSERT_VEC_EQ(du, Set(du, expected_result), ConvertTo(du, Set(df, flt_val))); } } } } template static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) { using TU = MakeUnsigned; const Rebind du; const size_t N = Lanes(df); // If LimitsMax() can be exactly represented in TF, // kSmallestOutOfTURangePosVal is equal to LimitsMax(). // Otherwise, if LimitsMax() cannot be exactly represented in TF, // kSmallestOutOfTURangePosVal is equal to LimitsMax() + 1, which can // be exactly represented in TF. constexpr TF kSmallestOutOfTURangePosVal = (sizeof(TU) * 8 <= static_cast(MantissaBits()) + 1) ? static_cast(LimitsMax()) : static_cast(static_cast(TU{1} << (sizeof(TU) * 8 - 1)) * ConvertScalarTo(2)); constexpr uint64_t kRandBitsMask = static_cast(LimitsMax>()); // Also check random values. auto from_pos = AllocateAligned(N); auto from_neg = AllocateAligned(N); auto expected = AllocateAligned(N); HWY_ASSERT(from_pos && from_neg && expected); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { for (size_t i = 0; i < N; ++i) { do { const TU bits = static_cast(rng() & kRandBitsMask); CopyBytes(&bits, &from_pos[i]); } while (!std::isfinite(from_pos[i])); from_neg[i] = static_cast(-from_pos[i]); expected[i] = (from_pos[i] < kSmallestOutOfTURangePosVal) ? 
static_cast(from_pos[i]) : LimitsMax(); } HWY_ASSERT_VEC_EQ(du, expected.get(), ConvertTo(du, Load(df, from_pos.get()))); HWY_ASSERT_VEC_EQ(du, Zero(du), ConvertTo(du, Load(df, from_neg.get()))); } } public: template HWY_NOINLINE void operator()(TF tf, const DF df) { using TU = MakeUnsigned; const Rebind du; const size_t N = Lanes(df); // Integer positive HWY_ASSERT_VEC_EQ(du, Iota(du, 4), ConvertTo(du, Iota(df, 4.0))); // Integer negative HWY_ASSERT_VEC_EQ(du, Zero(du), ConvertTo(du, Iota(df, -ConvertScalarTo(N)))); // Above positive HWY_ASSERT_VEC_EQ(du, Iota(du, 2), ConvertTo(du, Iota(df, 2.001))); // Below positive HWY_ASSERT_VEC_EQ(du, Iota(du, 3), ConvertTo(du, Iota(df, 3.9999))); const TF eps = static_cast(0.0001); // Above negative HWY_ASSERT_VEC_EQ( du, Zero(du), ConvertTo(du, Iota(df, -ConvertScalarTo(N + 1) + eps))); // Below negative HWY_ASSERT_VEC_EQ( du, Zero(du), ConvertTo(du, Iota(df, -ConvertScalarTo(N + 1) - eps))); TestPowers(tf, df); TestRandom(tf, df); } }; HWY_NOINLINE void TestAllUintFromFloat() { // std::isfinite does not support float16_t. 
ForFloat3264Types(ForPartialVectors()); } struct TestFloatFromInt { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TI = MakeSigned; const RebindToSigned di; const size_t N = Lanes(df); // Integer positive HWY_ASSERT_VEC_EQ(df, Iota(df, 4.0), ConvertTo(df, Iota(di, 4))); // Integer negative HWY_ASSERT_VEC_EQ(df, Iota(df, -ConvertScalarTo(N)), ConvertTo(df, Iota(di, -static_cast(N)))); // Max positive HWY_ASSERT_VEC_EQ(df, Set(df, ConvertScalarTo(LimitsMax())), ConvertTo(df, Set(di, LimitsMax()))); // Min negative HWY_ASSERT_VEC_EQ(df, Set(df, ConvertScalarTo(LimitsMin())), ConvertTo(df, Set(di, LimitsMin()))); } }; HWY_NOINLINE void TestAllFloatFromInt() { ForFloatTypes(ForPartialVectors()); } struct TestFloatFromUint { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TU = MakeUnsigned; const RebindToUnsigned du; // Integer positive HWY_ASSERT_VEC_EQ(df, Iota(df, 4.0), ConvertTo(df, Iota(du, 4))); HWY_ASSERT_VEC_EQ(df, Set(df, ConvertScalarTo(32767)), ConvertTo(df, Set(du, 32767))); // 2^16-1 if (sizeof(TF) > 4) { HWY_ASSERT_VEC_EQ(df, Iota(df, 4294967295.0), ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1 } // Max positive HWY_ASSERT_VEC_EQ(df, Set(df, ConvertScalarTo(LimitsMax())), ConvertTo(df, Set(du, LimitsMax()))); // Zero HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du))); } }; HWY_NOINLINE void TestAllFloatFromUint() { ForFloatTypes(ForPartialVectors()); } #undef HWY_F2I_INLINE #if HWY_TARGET == HWY_RVV // Workaround for incorrect rounding mode. 
// NOTE(review): this region looks whitespace-collapsed, and the contents of
// angle brackets (template parameter/argument lists) appear to be missing -
// TODO confirm against the upstream file before editing the code itself.
//
// HWY_F2I_INLINE is defined as either HWY_NOINLINE or HWY_INLINE; the #if
// condition that selects between them precedes this point. It qualifies the
// DoF2IConvVec helpers of TestNonFiniteF2IConvertTo.
//
// TestNonFiniteF2IConvertTo verifies float-to-integer conversions for inputs
// that may be NaN, infinite or outside the integer range. Its private helpers
// each come in three overloads (presumably selected by the relative lane
// sizes of DF and TTo - the constraints are not visible here):
//   DoF2IConvVec:        PromoteTo / ConvertTo / DemoteTo of the float vector.
//   DoF2IConvMask:       PromoteMaskTo / RebindMask / DemoteMaskTo of a mask.
//   DoF2IConvMsbMaskVec: BitCast of the input, which two of the overloads
//                        additionally pass through PromoteTo / DemoteTo; the
//                        result feeds IfNegativeThenElse in VerifyNonFiniteF2I.
// VerifyNonFiniteF2I(df, v, filename, line) begins by deriving
// kMinOutOfRangePosVal from the negated LimitsMin() multiplied by 1 or 2,
// depending on IsSigned().
#define HWY_F2I_INLINE HWY_NOINLINE #else #define HWY_F2I_INLINE HWY_INLINE #endif template class TestNonFiniteF2IConvertTo { private: static_assert(IsIntegerLaneType() && IsSame>(), "TTo must be an integer type"); template static HWY_F2I_INLINE VFromD> DoF2IConvVec(DF df, VFromD v) { return PromoteTo(Rebind(), v); } template static HWY_F2I_INLINE VFromD> DoF2IConvVec(DF df, VFromD v) { return ConvertTo(Rebind(), v); } template static HWY_F2I_INLINE VFromD> DoF2IConvVec(DF df, VFromD v) { return DemoteTo(Rebind(), v); } template static HWY_INLINE Mask> DoF2IConvMask(DF df, Mask m) { return PromoteMaskTo(Rebind(), df, m); } template static HWY_INLINE Mask> DoF2IConvMask(DF df, Mask m) { return RebindMask(Rebind(), m); } template static HWY_INLINE Mask> DoF2IConvMask(DF df, Mask m) { return DemoteMaskTo(Rebind(), df, m); } template static HWY_INLINE Vec, DF>> DoF2IConvMsbMaskVec( DF /*df*/, Vec v) { return PromoteTo(Rebind, DF>(), BitCast(RebindToSigned(), v)); } template static HWY_INLINE Vec, DF>> DoF2IConvMsbMaskVec( DF /*df*/, Vec v) { return BitCast(Rebind, DF>(), v); } template static HWY_INLINE Vec, DF>> DoF2IConvMsbMaskVec( DF /*df*/, Vec v) { return DemoteTo(Rebind, DF>(), BitCast(RebindToSigned(), v)); } template static HWY_NOINLINE void VerifyNonFiniteF2I(DF df, const VecArg> v, const char* filename, const int line) { using TF = TFromD; using TU = MakeUnsigned; using TTo_I = MakeSigned; const TF kMinOutOfRangePosVal = ConvertScalarTo((-ConvertScalarTo(LimitsMin())) * ConvertScalarTo(IsSigned() ? 
// (Continuation of VerifyNonFiniteF2I.) v2 is v ORed with a zero derived from
// Unpredictable1() - 1 (hence the name non_elided_zero). A lane counts as in
// range when it is not NaN and Abs() of its value is less than
// kMinOutOfRangePosVal. The expected vector takes DoF2IConvVec of the
// in-range lanes and, for every other lane, LimitsMin() or LimitsMax() as
// chosen by IfNegativeThenElse, ORed with is_nan_vmask. Both
// DoF2IConvVec(df, v) and DoF2IConvVec(df, v2) are ORed with is_nan_vmask
// before AssertVecEqual, so the comparison presumably ignores whatever value
// NaN lanes convert to. The public operator() starts after that.
1 : 2)); HWY_ASSERT(ConvertScalarTo(kMinOutOfRangePosVal) > 0.0); const Rebind d_to; const RebindToSigned di_to; const RebindToUnsigned du; const auto non_elided_zero = BitCast(df, Set(du, static_cast(Unpredictable1() - 1))); const auto v2 = Or(non_elided_zero, v); const auto is_nan_mask = IsNaN(v2); const auto is_in_range_mask = AndNot(is_nan_mask, Lt(Abs(IfThenZeroElse(is_nan_mask, v2)), Set(df, kMinOutOfRangePosVal))); const auto is_nan_vmask = VecFromMask(d_to, DoF2IConvMask(df, is_nan_mask)); const auto expected_in_range = DoF2IConvVec(df, IfThenElseZero(is_in_range_mask, v2)); const auto expected_out_of_range = Or(is_nan_vmask, BitCast(d_to, IfNegativeThenElse( DoF2IConvMsbMaskVec(df, v2), BitCast(di_to, Set(d_to, LimitsMin())), BitCast(di_to, Set(d_to, LimitsMax()))))); const auto expected = IfThenElse(DoF2IConvMask(df, is_in_range_mask), expected_in_range, expected_out_of_range); AssertVecEqual(d_to, expected, Or(DoF2IConvVec(df, v), is_nan_vmask), filename, line); AssertVecEqual(d_to, expected, Or(DoF2IConvVec(df, v2), is_nan_vmask), filename, line); } public: template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TI = MakeSigned; using TU = MakeUnsigned; const RebindToSigned di; // TODO(janwas): workaround for QEMU 7.2 crash on vfwcvt_rtz_x_f_v: // target/riscv/translate.c:213 in void decode_save_opc(DisasContext *): // ctx->insn_start != NULL. 
// Body of operator(): on HWY_RVV, or HWY_EMU128 on RISC-V, it returns early
// when sizeof(TTo) > sizeof(TF). Otherwise it calls VerifyNonFiniteF2I on the
// vectors named pos_nan, neg_nan, pos_inf and neg_inf (built from integer bit
// patterns via BitCast; neg_inf is Neg(pos_inf)), on an Iota sequence that
// starts at Unpredictable1(), and on OddEven mixes of those (only when N > 1
// and the target is not HWY_SCALAR). It then runs AdjustedReps(1000) rounds
// on lanes filled from rng() bit patterns, again alone, ORed with pos_inf,
// and OddEven-mixed with the special values.
#if HWY_TARGET == HWY_RVV || (HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128) if (sizeof(TTo) > sizeof(TF)) { return; } #endif const auto pos_nan = BitCast(df, Set(di, LimitsMax())); const auto neg_nan = BitCast(df, Set(di, static_cast(-1))); const auto pos_inf = BitCast(df, Set(di, static_cast(ExponentMask()))); const auto neg_inf = Neg(pos_inf); VerifyNonFiniteF2I(df, pos_nan, __FILE__, __LINE__); VerifyNonFiniteF2I(df, neg_nan, __FILE__, __LINE__); VerifyNonFiniteF2I(df, pos_inf, __FILE__, __LINE__); VerifyNonFiniteF2I(df, neg_inf, __FILE__, __LINE__); const TI non_elided_one = static_cast(Unpredictable1()); const auto iota1 = Iota(df, ConvertScalarTo(non_elided_one)); VerifyNonFiniteF2I(df, iota1, __FILE__, __LINE__); const size_t N = Lanes(df); #if HWY_TARGET != HWY_SCALAR if (N > 1) { VerifyNonFiniteF2I(df, OddEven(pos_nan, iota1), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(iota1, pos_nan), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(neg_nan, iota1), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(iota1, neg_nan), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(pos_inf, iota1), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(iota1, pos_inf), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(neg_inf, iota1), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(iota1, neg_inf), __FILE__, __LINE__); } #endif auto in_lanes = AllocateAligned(N); HWY_ASSERT(in_lanes); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { for (size_t i = 0; i < N; ++i) { in_lanes[i] = BitCastScalar(static_cast(rng())); } const auto v = Load(df, in_lanes.get()); VerifyNonFiniteF2I(df, v, __FILE__, __LINE__); VerifyNonFiniteF2I(df, Or(v, pos_inf), __FILE__, __LINE__); #if HWY_TARGET != HWY_SCALAR if (N > 1) { VerifyNonFiniteF2I(df, OddEven(pos_nan, v), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(v, pos_nan), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(neg_nan, v), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(v, 
// The next line first finishes the random-input loop and closes
// TestNonFiniteF2IConvertTo. It then holds:
//   TestAllNonFiniteF2IConvertTo: invokes test functors (presumably
//     instantiations of TestNonFiniteF2IConvertTo - TODO confirm) through
//     ForPartialVectors with float16_t (if HWY_HAVE_FLOAT16), float, and
//     double (if HWY_HAVE_FLOAT64); through ForPromoteVectors with float (if
//     HWY_HAVE_INTEGER64); and through ForDemoteVectors with double (if
//     HWY_HAVE_FLOAT64).
//   TestI32F64: compares PromoteTo(df, ...) of int32_t Iota/Set vectors with
//     the same values created directly in df, including LimitsMax() and
//     LimitsMin().
//   TestAllI32F64: runs a ForDemoteVectors functor (presumably TestI32F64 -
//     TODO confirm) with double() if HWY_HAVE_FLOAT64.
//   TestF2IPromoteTo (start): float-to-integer PromoteTo test. On HWY_RVV, or
//     HWY_EMU128 on RISC-V, its operator() returns immediately; otherwise it
//     first checks that 1, 0 and -1 convert to ToT(1), zero, and ToT(-1) or
//     ToT(0) depending on IsSigned().
neg_nan), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(pos_inf, v), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(v, pos_inf), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(neg_inf, v), __FILE__, __LINE__); VerifyNonFiniteF2I(df, OddEven(v, neg_inf), __FILE__, __LINE__); } #endif } } }; HWY_NOINLINE void TestAllNonFiniteF2IConvertTo() { #if HWY_HAVE_FLOAT16 ForPartialVectors>()(hwy::float16_t()); ForPartialVectors>()(hwy::float16_t()); #endif ForPartialVectors>()(float()); ForPartialVectors>()(float()); #if HWY_HAVE_FLOAT64 ForPartialVectors>()(double()); ForPartialVectors>()(double()); #endif #if HWY_HAVE_INTEGER64 ForPromoteVectors>()(float()); ForPromoteVectors>()(float()); #endif #if HWY_HAVE_FLOAT64 ForDemoteVectors>()(double()); ForDemoteVectors>()(double()); #endif } struct TestI32F64 { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { using TI = int32_t; const Rebind di; const size_t N = Lanes(df); // Integer positive HWY_ASSERT_VEC_EQ(df, Iota(df, 4.0), PromoteTo(df, Iota(di, 4))); // Integer negative HWY_ASSERT_VEC_EQ(df, Iota(df, -ConvertScalarTo(N)), PromoteTo(df, Iota(di, -static_cast(N)))); // Above positive HWY_ASSERT_VEC_EQ(df, Iota(df, 2.0), PromoteTo(df, Iota(di, 2))); // Below positive HWY_ASSERT_VEC_EQ(df, Iota(df, 4.0), PromoteTo(df, Iota(di, 4))); // Above negative HWY_ASSERT_VEC_EQ(df, Iota(df, ConvertScalarTo(-4.0)), PromoteTo(df, Iota(di, -4))); // Below negative HWY_ASSERT_VEC_EQ(df, Iota(df, -2.0), PromoteTo(df, Iota(di, -2))); // Max positive int HWY_ASSERT_VEC_EQ(df, Set(df, TF{LimitsMax()}), PromoteTo(df, Set(di, LimitsMax()))); // Min negative int HWY_ASSERT_VEC_EQ(df, Set(df, TF{LimitsMin()}), PromoteTo(df, Set(di, LimitsMin()))); } }; HWY_NOINLINE void TestAllI32F64() { #if HWY_HAVE_FLOAT64 ForDemoteVectors()(double()); #endif } template struct TestF2IPromoteTo { template HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { const Rebind d_to; // TODO(janwas): workaround for QEMU 7.2 crash 
on vfwcvt_rtz_x_f_v: // target/riscv/translate.c:213 in void decode_save_opc(DisasContext *): // ctx->insn_start != NULL. #if HWY_TARGET == HWY_RVV || (HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128) return; #endif HWY_ASSERT_VEC_EQ(d_to, Set(d_to, ToT(1)), PromoteTo(d_to, Set(df, TF{1}))); HWY_ASSERT_VEC_EQ(d_to, Zero(d_to), PromoteTo(d_to, Zero(df))); HWY_ASSERT_VEC_EQ(d_to, Set(d_to, IsSigned() ? ToT(-1) : ToT(0)), PromoteTo(d_to, Set(df, TF{-1}))); constexpr size_t kNumOfNonSignBitsInToT = sizeof(ToT) * 8 - static_cast(IsSigned()); // kSmallestInToTRangeVal is the smallest value of TF that is within the // range of ToT. constexpr TF kSmallestInToTRangeVal = static_cast(LimitsMin()); // If LimitsMax() can be exactly represented in TF, // kSmallestOutOfToTRangePosVal is equal to LimitsMax(). // Otherwise, if LimitsMax() cannot be exactly represented in TF, // kSmallestOutOfToTRangePosVal is equal to LimitsMax() + 1, which can // be exactly represented in TF. constexpr TF kSmallestOutOfToTRangePosVal = (kNumOfNonSignBitsInToT <= static_cast(MantissaBits()) + 1) ? 
// (Still inside TestF2IPromoteTo::operator().) The rest of the
// kSmallestOutOfToTRangePosVal initializer comes first, followed by checks
// that this value and twice this value convert to LimitsMax(), and that -2
// times this value converts to LimitsMin(). The loop then runs
// AdjustedReps(1000) rounds: each lane draws random bits (masked with
// kRandBitsMask) until std::isfinite() holds, stores the negated value in
// in_neg, and expects static_cast of the input when it compares inside the
// bounds, otherwise LimitsMax() (positive) or LimitsMin() (negative).
static_cast(LimitsMax()) : static_cast( static_cast(ToT{1} << (kNumOfNonSignBitsInToT - 1)) * TF{2}); HWY_ASSERT_VEC_EQ(d_to, Set(d_to, LimitsMax()), PromoteTo(d_to, Set(df, kSmallestOutOfToTRangePosVal))); HWY_ASSERT_VEC_EQ( d_to, Set(d_to, LimitsMax()), PromoteTo(d_to, Set(df, kSmallestOutOfToTRangePosVal * TF{2}))); HWY_ASSERT_VEC_EQ( d_to, Set(d_to, LimitsMin()), PromoteTo(d_to, Set(df, kSmallestOutOfToTRangePosVal * TF{-2}))); const size_t N = Lanes(df); auto in_pos = AllocateAligned(N); auto in_neg = AllocateAligned(N); auto expected_pos_to_int = AllocateAligned(N); auto expected_neg_to_int = AllocateAligned(N); HWY_ASSERT(in_pos && in_neg && expected_pos_to_int && expected_neg_to_int); using FromTU = MakeUnsigned; constexpr uint64_t kRandBitsMask = static_cast(LimitsMax>()); RandomState rng; for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { for (size_t i = 0; i < N; ++i) { do { const FromTU bits = static_cast(rng() & kRandBitsMask); CopyBytes(&bits, &in_pos[i]); // not same size } while (!std::isfinite(in_pos[i])); const TF pos_val = in_pos[i]; const TF neg_val = static_cast(-pos_val); in_neg[i] = neg_val; expected_pos_to_int[i] = (pos_val < kSmallestOutOfToTRangePosVal) ? static_cast(pos_val) : LimitsMax(); expected_neg_to_int[i] = (neg_val > kSmallestInToTRangeVal) ? 
// The next line finishes expected_neg_to_int and compares PromoteTo of the
// in_pos and in_neg arrays against the expected arrays, closing
// TestF2IPromoteTo. It is followed by:
//   TestAllF2IPromoteTo: if HWY_HAVE_INTEGER64, runs the ForPromoteVectors
//     functors named to_i64div2 and to_u64div2 with float().
static_cast(neg_val) : LimitsMin(); } HWY_ASSERT_VEC_EQ(d_to, expected_pos_to_int.get(), PromoteTo(d_to, Load(df, in_pos.get()))); HWY_ASSERT_VEC_EQ(d_to, expected_neg_to_int.get(), PromoteTo(d_to, Load(df, in_neg.get()))); } } }; HWY_NOINLINE void TestAllF2IPromoteTo() { #if HWY_HAVE_INTEGER64 const ForPromoteVectors, 1> to_i64div2; to_i64div2(float()); const ForPromoteVectors, 1> to_u64div2; to_u64div2(float()); #endif } template struct TestF2IPromoteUpperLowerTo { template HWY_NOINLINE void operator()(T /*unused*/, D from_d) { static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); const Repartition to_d; // TODO(janwas): workaround for QEMU 7.2 crash on vfwcvt_rtz_x_f_v: // target/riscv/translate.c:213 in void decode_save_opc(DisasContext *): // ctx->insn_start != NULL. #if HWY_TARGET == HWY_RVV || (HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128) return; #endif const size_t N = Lanes(from_d); auto from = AllocateAligned(N); auto expected = AllocateAligned(N / 2); HWY_ASSERT(from && expected); using TU = MakeUnsigned; constexpr int kNumOfMantBits = MantissaBits(); constexpr TU kMaxBiasedExp = static_cast(MaxExponentField()); constexpr TU kExponentBias = kMaxBiasedExp >> 1; constexpr TU kMaxInToTRangeBiasedExpBits = static_cast(HWY_MIN(kExponentBias + sizeof(ToT) * 8 - static_cast(IsSigned()) - 1u, kMaxBiasedExp - 1) << kNumOfMantBits); constexpr TU kMinOutOfToTRangeBiasedExpBits = static_cast( kMaxInToTRangeBiasedExpBits + (TU{1} << kNumOfMantBits)); constexpr TU kMaxFiniteBiasedExpBits = static_cast