From 0efc230381835ca3e5be560fd65eda023faf2b37 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 9 Apr 2020 02:37:51 -0300 Subject: astc: OutputBitStream style changes and make it constexpr --- src/video_core/textures/astc.cpp | 58 ++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 062b4f252..20706bda0 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -40,17 +40,17 @@ constexpr u32 Popcnt(u32 n) { class InputBitStream { public: explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) - : m_CurByte(ptr), m_NextBit(start_offset % 8) {} + : cur_byte(ptr), next_bit(start_offset % 8) {} std::size_t GetBitsRead() const { return m_BitsRead; } u32 ReadBit() { - u32 bit = *m_CurByte >> m_NextBit++; - while (m_NextBit >= 8) { - m_NextBit -= 8; - m_CurByte++; + u32 bit = *cur_byte >> next_bit++; + while (next_bit >= 8) { + next_bit -= 8; + cur_byte++; } m_BitsRead++; @@ -75,64 +75,58 @@ public: } private: - const u8* m_CurByte; - std::size_t m_NextBit = 0; + const u8* cur_byte; + std::size_t next_bit = 0; std::size_t m_BitsRead = 0; }; class OutputBitStream { public: - explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0) - : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} + constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0) + : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {} - ~OutputBitStream() = default; - - s32 GetBitsWritten() const { - return m_BitsWritten; + constexpr std::size_t GetBitsWritten() const { + return bits_written; } - void WriteBitsR(u32 val, u32 nBits) { + constexpr void WriteBitsR(u32 val, u32 nBits) { for (u32 i = 0; i < nBits; i++) { WriteBit((val >> (nBits - i - 1)) & 1); } } - void WriteBits(u32 val, u32 nBits) { + constexpr void WriteBits(u32 val, u32 nBits) { for (u32 i = 0; i < nBits; i++) { WriteBit((val >> i) & 1); } } private: - void WriteBit(s32 b) { - - if (done) + constexpr void WriteBit(bool b) { + if (bits_written >= num_bits) { return; + } - const u32 mask = 1 << m_NextBit++; + const u32 mask = 1 << next_bit++; // clear the bit - *m_CurByte &= static_cast(~mask); + *cur_byte &= static_cast(~mask); // Write the bit, if necessary if (b) - *m_CurByte |= static_cast(mask); + *cur_byte |= static_cast(mask); // Next byte? - if (m_NextBit >= 8) { - m_CurByte += 1; - m_NextBit = 0; + if (next_bit >= 8) { + cur_byte += 1; + next_bit = 0; } - - done = done || ++m_BitsWritten >= m_NumBits; } - s32 m_BitsWritten = 0; - const s32 m_NumBits; - u8* m_CurByte; - s32 m_NextBit = 0; - - bool done = false; + u8* cur_byte; + std::size_t num_bits; + std::size_t bits_written = 0; + std::size_t next_bit = 0; }; template -- cgit v1.2.3 From d22a6892506395d7348f42c21288ab5ac9d7be5a Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 9 Apr 2020 02:54:05 -0300 Subject: astc: Make InputBitStream constexpr --- src/video_core/textures/astc.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 20706bda0..91b4c5d7e 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -39,25 +39,25 @@ constexpr u32 Popcnt(u32 n) { class InputBitStream { public: - explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) - : cur_byte(ptr), next_bit(start_offset % 8) {} + constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0) + : cur_byte{ptr}, next_bit{start_offset % 8} {} - std::size_t GetBitsRead() const { - return m_BitsRead; + constexpr std::size_t GetBitsRead() const { + return bits_read; } - u32 ReadBit() { - u32 bit = *cur_byte >> next_bit++; + constexpr bool ReadBit() { + const bool bit = (*cur_byte >> next_bit++) & 1; while (next_bit >= 8) { next_bit -= 8; cur_byte++; } - m_BitsRead++; - return bit & 1; + bits_read++; + return bit; } - u32 ReadBits(std::size_t nBits) { + constexpr u32 ReadBits(std::size_t nBits) { u32 ret = 0; for (std::size_t i = 0; i < nBits; ++i) { ret |= (ReadBit() & 1) << i; @@ -66,7 +66,7 @@ public: } template - u32 ReadBits() { + constexpr u32 ReadBits() { u32 ret = 0; for (std::size_t i = 0; i < nBits; ++i) { ret |= (ReadBit() & 1) << i; @@ -77,7 +77,7 @@ public: private: const u8* cur_byte; std::size_t next_bit = 0; - std::size_t m_BitsRead = 0; + std::size_t bits_read = 0; }; class OutputBitStream { -- cgit v1.2.3 From 6b4d4473be7ec428ae3501534c6f5e19bfda35ee Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 9 Apr 2020 03:35:07 -0300 Subject: astc: Move Replicate to a constexpr LUT when possible --- src/video_core/textures/astc.cpp | 46 +++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 91b4c5d7e..0985cb578 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -628,12 +628,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)] // is the same as [(numBits - 1):0] and repeats all the way down. template -static IntType Replicate(IntType val, u32 numBits, u32 toBit) { - if (numBits == 0) +static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) { + if (numBits == 0) { return 0; - if (toBit == 0) + } + if (toBit == 0) { return 0; - IntType v = val & static_cast((1 << numBits) - 1); + } + const IntType v = val & static_cast((1 << numBits) - 1); IntType res = v; u32 reslen = numBits; while (reslen < toBit) { @@ -650,6 +652,34 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) { return res; } +static constexpr std::size_t NumReplicateEntries(u32 num_bits) { + return std::size_t(1) << num_bits; +} + +template +static constexpr auto MakeReplicateTable() { + std::array table{}; + for (IntType value = 0; value < static_cast(std::size(table)); ++value) { + table[value] = Replicate(value, num_bits, to_bit); + } + return table; +} + +static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateByteTo16(std::size_t value) { + return REPLICATE_BYTE_TO_16_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateBitTo7(std::size_t value) { + return REPLICATE_BIT_TO_7_TABLE[value]; +} + +static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable(); +static constexpr u32 ReplicateBitTo9(std::size_t value) { + return REPLICATE_BIT_TO_9_TABLE[value]; +} + class Pixel { protected: using ChannelType = s16; @@ -833,7 +863,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP u32 A = 0, B = 0, C = 0, D = 0; // A is just the lsb replicated 9 times. - A = Replicate(bitval & 1, 1, 9); + A = ReplicateBitTo9(bitval & 1); switch (val.encoding) { // Replicate bits @@ -956,7 +986,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { u32 bitval = val.bit_value; u32 bitlen = val.num_bits; - u32 A = Replicate(bitval & 1, 1, 7); + u32 A = ReplicateBitTo7(bitval & 1); u32 B = 0, C = 0, D = 0; u32 result = 0; @@ -1562,9 +1592,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 Pixel p; for (u32 c = 0; c < 4; c++) { u32 C0 = endpos32s[partition][0].Component(c); - C0 = Replicate(C0, 8, 16); + C0 = ReplicateByteTo16(C0); u32 C1 = endpos32s[partition][1].Component(c); - C1 = Replicate(C1, 8, 16); + C1 = ReplicateByteTo16(C1); u32 plane = 0; if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) { -- cgit v1.2.3 From 5de130beea5abaafd4d6d80e8b06e63d23a98c97 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 9 Apr 2020 03:58:25 -0300 Subject: astc: Implement a fast precompiled alternative for Replicate --- src/video_core/textures/astc.cpp | 59 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 0985cb578..55f9aa0e4 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -680,6 +680,61 @@ static constexpr u32 ReplicateBitTo9(std::size_t value) { return REPLICATE_BIT_TO_9_TABLE[value]; } +static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable(); +/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback +/// to the runtime implementation +static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + case 8: + return REPLICATE_8_BIT_TO_8_TABLE[value]; + default: + return Replicate(value, num_bits, 8); + } +} + +static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable(); +static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + default: + return Replicate(value, num_bits, 6); + } +} + class Pixel { protected: using ChannelType = s16; @@ -868,7 +923,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP switch (val.encoding) { // Replicate bits case IntegerEncoding::JustBits: - out[outIdx++] = Replicate(bitval, bitlen, 8); + out[outIdx++] = FastReplicateTo8(bitval, bitlen); break; // Use algorithm in C.2.13 @@ -992,7 +1047,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { u32 result = 0; switch (val.encoding) { case IntegerEncoding::JustBits: - result = Replicate(bitval, bitlen, 6); + result = FastReplicateTo6(bitval, bitlen); break; case IntegerEncoding::Trit: { -- cgit v1.2.3 From bd2c1ab8a01794013bdec6fff56e956d5e07172a Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 9 Apr 2020 05:27:57 -0300 Subject: astc: Use boost's static_vector to avoid heap allocations --- src/video_core/textures/astc.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 55f9aa0e4..f62d5c987 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -20,6 +20,8 @@ #include #include +#include + #include "common/common_types.h" #include "video_core/textures/astc.h" @@ -189,9 +191,13 @@ struct IntegerEncodedValue { u32 trit_value; }; }; +using IntegerEncodedVector = boost::container::static_vector< + IntegerEncodedValue, 64, + boost::container::static_vector_options< + boost::container::inplace_alignment, + boost::container::throw_on_overflow>::type>; -static void DecodeTritBlock(InputBitStream& bits, std::vector& result, - u32 nBitsPerValue) { +static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { // Implement the algorithm in section C.2.12 u32 m[5]; u32 t[5]; @@ -249,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector& result, +static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) { // Implement the algorithm in section C.2.12 u32 m[3]; @@ -337,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues(); // Fills result with the values that are encoded in the given // bitstream. We must know beforehand what the maximum possible // value is, and how many values we're decoding. -static void DecodeIntegerSequence(std::vector& result, InputBitStream& bits, - u32 maxRange, u32 nValues) { +static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange, + u32 nValues) { // Determine encoding parameters IntegerEncodedValue val = EncodingsValues[maxRange]; @@ -895,8 +901,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP } // We now have enough to decode our integer sequence. - std::vector decodedColorValues; - decodedColorValues.reserve(32); + IntegerEncodedVector decodedColorValues; InputBitStream colorStream(data); DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues); @@ -1126,7 +1131,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) { return result; } -static void UnquantizeTexelWeights(u32 out[2][144], const std::vector& weights, +static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights, const TexelWeightParams& params, const u32 blockWidth, const u32 blockHeight) { u32 weightIdx = 0; @@ -1624,8 +1629,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 static_cast((1 << (weightParams.GetPackedBitSize() % 8)) - 1); memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); - std::vector texelWeightValues; - texelWeightValues.reserve(64); + IntegerEncodedVector texelWeightValues; InputBitStream weightStream(texelWeightData); -- cgit v1.2.3 From 6bf5d2b0113b824c6babccc1e44c42a65917af5f Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 9 Apr 2020 05:47:34 -0300 Subject: astc: Hard code bit depth changes to 8 and use fast replicate --- src/video_core/textures/astc.cpp | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index f62d5c987..365bde2f1 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -759,10 +759,10 @@ public: // significant bits when going from larger to smaller bit depth // or by repeating the most significant bits when going from // smaller to larger bit depths. - void ChangeBitDepth(const u8 (&depth)[4]) { + void ChangeBitDepth() { for (u32 i = 0; i < 4; i++) { - Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]); - m_BitDepth[i] = depth[i]; + Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]); + m_BitDepth[i] = 8; } } @@ -774,28 +774,23 @@ public: // Changes the bit depth of a single component. See the comment // above for how we do this. - static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) { - assert(newDepth <= 8); + static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) { assert(oldDepth <= 8); - if (oldDepth == newDepth) { + if (oldDepth == 8) { // Do nothing return val; - } else if (oldDepth == 0 && newDepth != 0) { - return static_cast((1 << newDepth) - 1); - } else if (newDepth > oldDepth) { - return Replicate(val, oldDepth, newDepth); + } else if (oldDepth == 0) { + return static_cast((1 << 8) - 1); + } else if (8 > oldDepth) { + return static_cast(FastReplicateTo8(static_cast(val), oldDepth)); } else { // oldDepth > newDepth - if (newDepth == 0) { - return 0xFF; - } else { - u8 bitsWasted = static_cast(oldDepth - newDepth); - u16 v = static_cast(val); - v = static_cast((v + (1 << (bitsWasted - 1))) >> bitsWasted); - v = ::std::min(::std::max(0, v), static_cast((1 << newDepth) - 1)); - return static_cast(v); - } + const u8 bitsWasted = static_cast(oldDepth - 8); + u16 v = static_cast(val); + v = static_cast((v + (1 << (bitsWasted - 1))) >> bitsWasted); + v = ::std::min(::std::max(0, v), static_cast((1 << 8) - 1)); + return static_cast(v); } assert(false && "We shouldn't get here."); @@ -845,8 +840,7 @@ public: // up in the most-significant byte. u32 Pack() const { Pixel eightBit(*this); - const u8 eightBitDepth[4] = {8, 8, 8, 8}; - eightBit.ChangeBitDepth(eightBitDepth); + eightBit.ChangeBitDepth(); u32 r = 0; r |= eightBit.A(); -- cgit v1.2.3