          /// Alignment constant, in bytes, equal to `alignof(uint64_t)`.
          /// The member functions of this class assert that the buffers they receive are
          /// aligned to `alignof(uint64_t)` before viewing them as 64-bit words.
          /// NOTE(review): presumably callers use this constant when allocating those
          /// buffers - confirm against the call sites.
29        static constexpr size_t min_buffer_align = 
alignof(uint64_t);
 
   31        template<
size_t DstExtent, 
size_t SrcExtent>
 
   32        static void add(std::span<std::byte, DstExtent> dst, std::span<std::byte const, SrcExtent> src) 
noexcept {
 
   33            assert(dst.size() == src.size());
 
   34            assert(dst.size() % 
sizeof(uint64_t) == 0);
 
   35            assert(
reinterpret_cast<uintptr_t
>(dst.data()) % 
alignof(uint64_t) == 0);
 
   36            assert(
reinterpret_cast<uintptr_t
>(src.data()) % 
alignof(uint64_t) == 0);
 
   38            std::span<uint64_t> dst64{
reinterpret_cast<uint64_t *
>(dst.data()), dst.size() / 
sizeof(uint64_t)};
 
   39            std::span<uint64_t const> src64{
reinterpret_cast<uint64_t 
const *
>(src.data()), src.size() / 
sizeof(uint64_t)};
 
   41            if constexpr (!Bits::needs_padding) {
 
   42                static_assert(Bits::bits_per_element == 16 || Bits::bits_per_element == 32,
 
   43                              "Only 16 and 32 bit elements are implemented for non-padded data");
 
   45                static constexpr uint64_t mask_group_1 = Bits::bits_per_element == 16
 
   46                                                                 ? 0xffff0000ffff0000ull
 
   47                                                                 : 0xffffffff00000000ull;
 
   48                static constexpr uint64_t mask_group_2 = ~mask_group_1;
 
   50                for (
size_t ix = 0; ix < dst64.size(); ++ix) {
 
   51                    auto const ai = little_endian(dst64[ix]);
 
   52                    auto const bi = little_endian(src64[ix]);
 
   54                    auto const ai1 = ai & mask_group_1;
 
   55                    auto const ai2 = ai & mask_group_2;
 
   56                    auto const bi1 = bi & mask_group_1;
 
   57                    auto const bi2 = bi & mask_group_2;
 
   59                    uint64_t oi1 = (ai1 + bi1) & mask_group_1;
 
   60                    uint64_t oi2 = (ai2 + bi2) & mask_group_2;
 
   61                    dst64[ix] = little_endian(oi1 | oi2);
 
   64                for (
size_t ix = 0; ix < dst64.size(); ++ix) {
 
   65                    auto const ai = little_endian(dst64[ix]);
 
   66                    auto const bi = little_endian(src64[ix]);
 
   67                    dst64[ix] = little_endian((ai + bi) & Bits::data_mask);
 
   72        template<
size_t DstExtent, 
size_t SrcExtent>
 
   73        static void sub(std::span<std::byte, DstExtent> dst, std::span<std::byte const, SrcExtent> src) 
noexcept {
 
   74            assert(dst.size() == src.size());
 
   75            assert(dst.size() % 
sizeof(uint64_t) == 0);
 
   76            assert(
reinterpret_cast<uintptr_t
>(dst.data()) % 
alignof(uint64_t) == 0);
 
   77            assert(
reinterpret_cast<uintptr_t
>(src.data()) % 
alignof(uint64_t) == 0);
 
   79            std::span<uint64_t> dst64{
reinterpret_cast<uint64_t *
>(dst.data()), dst.size() / 
sizeof(uint64_t)};
 
   80            std::span<uint64_t const> src64{
reinterpret_cast<uint64_t 
const *
>(src.data()), src.size() / 
sizeof(uint64_t)};
 
   82            if constexpr (!Bits::needs_padding) {
 
   83                static_assert(Bits::bits_per_element == 16 || Bits::bits_per_element == 32,
 
   84                              "Only 16 and 32 bit elements are implemented for non-padded data");
 
   86                static constexpr uint64_t mask_group_1 = Bits::bits_per_element == 16
 
   87                                                                 ? 0xffff0000ffff0000ull
 
   88                                                                 : 0xffffffff00000000ull;
 
   89                static constexpr uint64_t mask_group_2 = ~mask_group_1;
 
   91                for (
size_t ix = 0; ix < dst64.size(); ++ix) {
 
   92                    auto const ai = little_endian(dst64[ix]);
 
   93                    auto const bi = little_endian(src64[ix]);
 
   95                    auto const ai1 = ai & mask_group_1;
 
   96                    auto const ai2 = ai & mask_group_2;
 
   97                    auto const bi1 = bi & mask_group_1;
 
   98                    auto const bi2 = bi & mask_group_2;
 
  100                    uint64_t oi1 = (ai1 + (mask_group_2 - bi1)) & mask_group_1;
 
  101                    uint64_t oi2 = (ai2 + (mask_group_1 - bi2)) & mask_group_2;
 
  102                    dst64[ix] = little_endian(oi1 | oi2);
 
  105                for (
size_t ix = 0; ix < dst64.size(); ++ix) {
 
  106                    auto const ai = little_endian(dst64[ix]);
 
  107                    auto const bi = little_endian(src64[ix]);
 
  108                    dst64[ix] = little_endian((ai + ((~Bits::data_mask - bi) & Bits::data_mask)) & Bits::data_mask);
 
             /// Inspects `data` as a sequence of 64-bit words and tests every word, after
             /// passing it through `little_endian`, for bits set outside `Bits::data_mask`.
             ///
             /// Only available when `Bits::needs_padding` is true (requires-clause).
             ///
             /// Preconditions (enforced with `assert` only): the size of `data` is a multiple
             /// of `sizeof(uint64_t)` and its data pointer is aligned to `alignof(uint64_t)`.
             ///
             /// NOTE(review): the statements that produce the return value are not part of
             /// this excerpt. Presumably the function returns false as soon as a word with
             /// a bit outside the mask is found and true otherwise - confirm.
             ///
             /// @param data buffer to inspect; only read
  113        template<
size_t Extent>
 
  114        static bool check_padding_bits(std::span<std::byte const, Extent> data) 
noexcept requires (Bits::needs_padding) {
 
  115            assert(data.size() % 
sizeof(uint64_t) == 0);
 
  116            assert(
reinterpret_cast<uintptr_t
>(data.data()) % 
alignof(uint64_t) == 0);
 
                 // View the byte buffer as 64-bit words.
  118            std::span<uint64_t const> data64{
reinterpret_cast<uint64_t 
const *
>(data.data()), data.size() / 
sizeof(uint64_t)};
 
  119            for (
auto const val : data64) {
 
                     // True when at least one bit outside Bits::data_mask is set in this word.
  120                if ((little_endian(val) & ~Bits::data_mask) != 0) {
 
             /// Clears, in place, every bit of `data` that lies outside `Bits::data_mask`.
             /// The buffer is processed as 64-bit words; each word is passed through
             /// `little_endian`, masked with `Bits::data_mask`, passed through
             /// `little_endian` again and written back.
             ///
             /// Only available when `Bits::needs_padding` is true (requires-clause).
             ///
             /// Preconditions (enforced with `assert` only): the size of `data` is a multiple
             /// of `sizeof(uint64_t)` and its data pointer is aligned to `alignof(uint64_t)`.
             ///
             /// @param data buffer to modify
  127        template<
size_t Extent>
 
  128        static void clear_padding_bits(std::span<std::byte, Extent> data) 
noexcept requires (Bits::needs_padding) {
 
  129            assert(data.size() % 
sizeof(uint64_t) == 0);
 
  130            assert(
reinterpret_cast<uintptr_t
>(data.data()) % 
alignof(uint64_t) == 0);
 
                 // NOTE(review): this condition repeats the requires-clause above, so it is
                 // always true whenever the function can be called.
  132            if constexpr (Bits::needs_padding) {
 
                     // View the byte buffer as mutable 64-bit words.
  133                std::span<uint64_t> data64{
reinterpret_cast<uint64_t *
>(data.data()), data.size() / 
sizeof(uint64_t)};
 
  134                for (
auto &val : data64) {
 
                         // Keep only the bits selected by Bits::data_mask.
  135                    val = little_endian(little_endian(val) & Bits::data_mask);