// Buffer alignment, in bytes, used by this code: equal to alignof(uint64_t),
// which is the same alignment the word-wise routines below assert on the
// data pointers they are given.
static constexpr size_t min_buffer_align{alignof(uint64_t)};
31 template<
size_t DstExtent,
size_t SrcExtent>
32 static void add(std::span<std::byte, DstExtent> dst, std::span<std::byte const, SrcExtent> src)
noexcept {
33 assert(dst.size() == src.size());
34 assert(dst.size() %
sizeof(uint64_t) == 0);
35 assert(
reinterpret_cast<uintptr_t
>(dst.data()) %
alignof(uint64_t) == 0);
36 assert(
reinterpret_cast<uintptr_t
>(src.data()) %
alignof(uint64_t) == 0);
38 std::span<uint64_t> dst64{
reinterpret_cast<uint64_t *
>(dst.data()), dst.size() /
sizeof(uint64_t)};
39 std::span<uint64_t const> src64{
reinterpret_cast<uint64_t
const *
>(src.data()), src.size() /
sizeof(uint64_t)};
41 if constexpr (!Bits::needs_padding) {
42 static_assert(Bits::bits_per_element == 16 || Bits::bits_per_element == 32,
43 "Only 16 and 32 bit elements are implemented for non-padded data");
45 static constexpr uint64_t mask_group_1 = Bits::bits_per_element == 16
46 ? 0xffff0000ffff0000ull
47 : 0xffffffff00000000ull;
48 static constexpr uint64_t mask_group_2 = ~mask_group_1;
50 for (
size_t ix = 0; ix < dst64.size(); ++ix) {
51 auto const ai = little_endian(dst64[ix]);
52 auto const bi = little_endian(src64[ix]);
54 auto const ai1 = ai & mask_group_1;
55 auto const ai2 = ai & mask_group_2;
56 auto const bi1 = bi & mask_group_1;
57 auto const bi2 = bi & mask_group_2;
59 uint64_t oi1 = (ai1 + bi1) & mask_group_1;
60 uint64_t oi2 = (ai2 + bi2) & mask_group_2;
61 dst64[ix] = little_endian(oi1 | oi2);
64 for (
size_t ix = 0; ix < dst64.size(); ++ix) {
65 auto const ai = little_endian(dst64[ix]);
66 auto const bi = little_endian(src64[ix]);
67 dst64[ix] = little_endian((ai + bi) & Bits::data_mask);
72 template<
size_t DstExtent,
size_t SrcExtent>
73 static void sub(std::span<std::byte, DstExtent> dst, std::span<std::byte const, SrcExtent> src)
noexcept {
74 assert(dst.size() == src.size());
75 assert(dst.size() %
sizeof(uint64_t) == 0);
76 assert(
reinterpret_cast<uintptr_t
>(dst.data()) %
alignof(uint64_t) == 0);
77 assert(
reinterpret_cast<uintptr_t
>(src.data()) %
alignof(uint64_t) == 0);
79 std::span<uint64_t> dst64{
reinterpret_cast<uint64_t *
>(dst.data()), dst.size() /
sizeof(uint64_t)};
80 std::span<uint64_t const> src64{
reinterpret_cast<uint64_t
const *
>(src.data()), src.size() /
sizeof(uint64_t)};
82 if constexpr (!Bits::needs_padding) {
83 static_assert(Bits::bits_per_element == 16 || Bits::bits_per_element == 32,
84 "Only 16 and 32 bit elements are implemented for non-padded data");
86 static constexpr uint64_t mask_group_1 = Bits::bits_per_element == 16
87 ? 0xffff0000ffff0000ull
88 : 0xffffffff00000000ull;
89 static constexpr uint64_t mask_group_2 = ~mask_group_1;
91 for (
size_t ix = 0; ix < dst64.size(); ++ix) {
92 auto const ai = little_endian(dst64[ix]);
93 auto const bi = little_endian(src64[ix]);
95 auto const ai1 = ai & mask_group_1;
96 auto const ai2 = ai & mask_group_2;
97 auto const bi1 = bi & mask_group_1;
98 auto const bi2 = bi & mask_group_2;
100 uint64_t oi1 = (ai1 + (mask_group_2 - bi1)) & mask_group_1;
101 uint64_t oi2 = (ai2 + (mask_group_1 - bi2)) & mask_group_2;
102 dst64[ix] = little_endian(oi1 | oi2);
105 for (
size_t ix = 0; ix < dst64.size(); ++ix) {
106 auto const ai = little_endian(dst64[ix]);
107 auto const bi = little_endian(src64[ix]);
108 dst64[ix] = little_endian((ai + ((~Bits::data_mask - bi) & Bits::data_mask)) & Bits::data_mask);
// Scans `data` as a sequence of 64-bit words and tests every word, after it
// has been passed through little_endian(), for any bit set outside
// Bits::data_mask.
//
// Only participates in overload resolution when Bits::needs_padding is true
// (see the requires-clause).
//
// Preconditions (checked with assert only):
//   * data.size() is a multiple of sizeof(uint64_t),
//   * data.data() is aligned to alignof(uint64_t).
//
// NOTE(review): the statement executed when an offending word is found and
// the final return are not visible in this excerpt. Presumably the function
// returns false on the first word with a stray bit and true otherwise;
// confirm against the full file.
113 template<
size_t Extent>
114 static bool check_padding_bits(std::span<std::byte const, Extent> data)
noexcept requires (Bits::needs_padding) {
// The buffer must consist of whole 64-bit words.
115 assert(data.size() %
sizeof(uint64_t) == 0);
// The buffer must be suitably aligned to be read as uint64_t.
116 assert(
reinterpret_cast<uintptr_t
>(data.data()) %
alignof(uint64_t) == 0);
// Read-only view of the bytes as data.size() / sizeof(uint64_t) words.
118 std::span<uint64_t const> data64{
reinterpret_cast<uint64_t
const *
>(data.data()), data.size() /
sizeof(uint64_t)};
// Each word is copied by value; the buffer is never written here.
119 for (
auto const val : data64) {
// True when at least one bit outside Bits::data_mask is set in this word.
120 if ((little_endian(val) & ~Bits::data_mask) != 0) {
127 template<
size_t Extent>
128 static void clear_padding_bits(std::span<std::byte, Extent> data)
noexcept requires (Bits::needs_padding) {
129 assert(data.size() %
sizeof(uint64_t) == 0);
130 assert(
reinterpret_cast<uintptr_t
>(data.data()) %
alignof(uint64_t) == 0);
132 if constexpr (Bits::needs_padding) {
133 std::span<uint64_t> data64{
reinterpret_cast<uint64_t *
>(data.data()), data.size() /
sizeof(uint64_t)};
134 for (
auto &val : data64) {
135 val = little_endian(little_endian(val) & Bits::data_mask);