From 0efc230381835ca3e5be560fd65eda023faf2b37 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 9 Apr 2020 02:37:51 -0300
Subject: astc: OutputBitStream style changes and make it constexpr

---
 src/video_core/textures/astc.cpp | 58 ++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

(limited to 'src/video_core')
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 062b4f252..20706bda0 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -40,17 +40,17 @@ constexpr u32 Popcnt(u32 n) {
 class InputBitStream {
 public:
     explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
-        : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+        : cur_byte(ptr), next_bit(start_offset % 8) {}
 
     std::size_t GetBitsRead() const {
         return m_BitsRead;
     }
 
     u32 ReadBit() {
-        u32 bit = *m_CurByte >> m_NextBit++;
-        while (m_NextBit >= 8) {
-            m_NextBit -= 8;
-            m_CurByte++;
+        u32 bit = *cur_byte >> next_bit++;
+        while (next_bit >= 8) {
+            next_bit -= 8;
+            cur_byte++;
         }
 
         m_BitsRead++;
@@ -75,64 +75,58 @@ public:
     }
 
 private:
-    const u8* m_CurByte;
-    std::size_t m_NextBit = 0;
+    const u8* cur_byte;
+    std::size_t next_bit = 0;
     std::size_t m_BitsRead = 0;
 };
 
 class OutputBitStream {
 public:
-    explicit OutputBitStream(u8* ptr, s32 nBits = 0, s32 start_offset = 0)
-        : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+    constexpr explicit OutputBitStream(u8* ptr, std::size_t bits = 0, std::size_t start_offset = 0)
+        : cur_byte{ptr}, num_bits{bits}, next_bit{start_offset % 8} {}
 
-    ~OutputBitStream() = default;
-
-    s32 GetBitsWritten() const {
-        return m_BitsWritten;
+    constexpr std::size_t GetBitsWritten() const {
+        return bits_written;
     }
 
-    void WriteBitsR(u32 val, u32 nBits) {
+    constexpr void WriteBitsR(u32 val, u32 nBits) {
         for (u32 i = 0; i < nBits; i++) {
             WriteBit((val >> (nBits - i - 1)) & 1);
         }
     }
 
-    void WriteBits(u32 val, u32 nBits) {
+    constexpr void WriteBits(u32 val, u32 nBits) {
         for (u32 i = 0; i < nBits; i++) {
             WriteBit((val >> i) & 1);
         }
     }
 
 private:
-    void WriteBit(s32 b) {
-
-        if (done)
+    constexpr void WriteBit(bool b) {
+        if (bits_written >= num_bits) {
             return;
+        }
 
-        const u32 mask = 1 << m_NextBit++;
+        const u32 mask = 1 << next_bit++;
 
         // clear the bit
-        *m_CurByte &= static_cast<u8>(~mask);
+        *cur_byte &= static_cast<u8>(~mask);
 
         // Write the bit, if necessary
         if (b)
-            *m_CurByte |= static_cast<u8>(mask);
+            *cur_byte |= static_cast<u8>(mask);
 
         // Next byte?
-        if (m_NextBit >= 8) {
-            m_CurByte += 1;
-            m_NextBit = 0;
+        if (next_bit >= 8) {
+            cur_byte += 1;
+            next_bit = 0;
         }
-
-        done = done || ++m_BitsWritten >= m_NumBits;
     }
 
-    s32 m_BitsWritten = 0;
-    const s32 m_NumBits;
-    u8* m_CurByte;
-    s32 m_NextBit = 0;
-
-    bool done = false;
+    u8* cur_byte;
+    std::size_t num_bits;
+    std::size_t bits_written = 0;
+    std::size_t next_bit = 0;
 };
 
 template <typename IntType>
-- 
cgit v1.2.3


From d22a6892506395d7348f42c21288ab5ac9d7be5a Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 9 Apr 2020 02:54:05 -0300
Subject: astc: Make InputBitStream constexpr

---
 src/video_core/textures/astc.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 20706bda0..91b4c5d7e 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -39,25 +39,25 @@ constexpr u32 Popcnt(u32 n) {
 
 class InputBitStream {
 public:
-    explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
-        : cur_byte(ptr), next_bit(start_offset % 8) {}
+    constexpr explicit InputBitStream(const u8* ptr, std::size_t start_offset = 0)
+        : cur_byte{ptr}, next_bit{start_offset % 8} {}
 
-    std::size_t GetBitsRead() const {
-        return m_BitsRead;
+    constexpr std::size_t GetBitsRead() const {
+        return bits_read;
     }
 
-    u32 ReadBit() {
-        u32 bit = *cur_byte >> next_bit++;
+    constexpr bool ReadBit() {
+        const bool bit = (*cur_byte >> next_bit++) & 1;
         while (next_bit >= 8) {
             next_bit -= 8;
             cur_byte++;
         }
 
-        m_BitsRead++;
-        return bit & 1;
+        bits_read++;
+        return bit;
     }
 
-    u32 ReadBits(std::size_t nBits) {
+    constexpr u32 ReadBits(std::size_t nBits) {
         u32 ret = 0;
         for (std::size_t i = 0; i < nBits; ++i) {
             ret |= (ReadBit() & 1) << i;
@@ -66,7 +66,7 @@ public:
     }
 
     template <std::size_t nBits>
-    u32 ReadBits() {
+    constexpr u32 ReadBits() {
         u32 ret = 0;
         for (std::size_t i = 0; i < nBits; ++i) {
             ret |= (ReadBit() & 1) << i;
@@ -77,7 +77,7 @@ public:
 private:
     const u8* cur_byte;
     std::size_t next_bit = 0;
-    std::size_t m_BitsRead = 0;
+    std::size_t bits_read = 0;
 };
 
 class OutputBitStream {
-- 
cgit v1.2.3


From 6b4d4473be7ec428ae3501534c6f5e19bfda35ee Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 9 Apr 2020 03:35:07 -0300
Subject: astc: Move Replicate to a constexpr LUT when possible

---
 src/video_core/textures/astc.cpp | 46 +++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 8 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 91b4c5d7e..0985cb578 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -628,12 +628,14 @@ static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) {
 // Replicates low numBits such that [(toBit - 1):(toBit - 1 - fromBit)]
 // is the same as [(numBits - 1):0] and repeats all the way down.
 template <typename IntType>
-static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
-    if (numBits == 0)
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+    if (numBits == 0) {
         return 0;
-    if (toBit == 0)
+    }
+    if (toBit == 0) {
         return 0;
-    IntType v = val & static_cast<IntType>((1 << numBits) - 1);
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1);
     IntType res = v;
     u32 reslen = numBits;
     while (reslen < toBit) {
@@ -650,6 +652,34 @@ static IntType Replicate(IntType val, u32 numBits, u32 toBit) {
     return res;
 }
 
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
+    }
+    return table;
+}
+
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) {
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) {
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+
 class Pixel {
 protected:
     using ChannelType = s16;
@@ -833,7 +863,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
 
         u32 A = 0, B = 0, C = 0, D = 0;
         // A is just the lsb replicated 9 times.
-        A = Replicate(bitval & 1, 1, 9);
+        A = ReplicateBitTo9(bitval & 1);
 
         switch (val.encoding) {
         // Replicate bits
@@ -956,7 +986,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     u32 bitval = val.bit_value;
     u32 bitlen = val.num_bits;
 
-    u32 A = Replicate(bitval & 1, 1, 7);
+    u32 A = ReplicateBitTo7(bitval & 1);
     u32 B = 0, C = 0, D = 0;
 
     u32 result = 0;
@@ -1562,9 +1592,9 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
             Pixel p;
             for (u32 c = 0; c < 4; c++) {
                 u32 C0 = endpos32s[partition][0].Component(c);
-                C0 = Replicate(C0, 8, 16);
+                C0 = ReplicateByteTo16(C0);
                 u32 C1 = endpos32s[partition][1].Component(c);
-                C1 = Replicate(C1, 8, 16);
+                C1 = ReplicateByteTo16(C1);
 
                 u32 plane = 0;
                 if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
-- 
cgit v1.2.3


From 5de130beea5abaafd4d6d80e8b06e63d23a98c97 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 9 Apr 2020 03:58:25 -0300
Subject: astc: Implement a fast precompiled alternative for Replicate

---
 src/video_core/textures/astc.cpp | 59 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 0985cb578..55f9aa0e4 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -680,6 +680,61 @@ static constexpr u32 ReplicateBitTo9(std::size_t value) {
     return REPLICATE_BIT_TO_9_TABLE[value];
 }
 
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
+/// to the runtime implementation
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) {
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 6);
+    }
+}
+
 class Pixel {
 protected:
     using ChannelType = s16;
@@ -868,7 +923,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
         switch (val.encoding) {
         // Replicate bits
         case IntegerEncoding::JustBits:
-            out[outIdx++] = Replicate(bitval, bitlen, 8);
+            out[outIdx++] = FastReplicateTo8(bitval, bitlen);
             break;
 
         // Use algorithm in C.2.13
@@ -992,7 +1047,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     u32 result = 0;
     switch (val.encoding) {
     case IntegerEncoding::JustBits:
-        result = Replicate(bitval, bitlen, 6);
+        result = FastReplicateTo6(bitval, bitlen);
         break;
 
     case IntegerEncoding::Trit: {
-- 
cgit v1.2.3


From bd2c1ab8a01794013bdec6fff56e956d5e07172a Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 9 Apr 2020 05:27:57 -0300
Subject: astc: Use boost's static_vector to avoid heap allocations

---
 src/video_core/textures/astc.cpp | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 55f9aa0e4..f62d5c987 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -20,6 +20,8 @@
 #include <cstring>
 #include <vector>
 
+#include <boost/container/static_vector.hpp>
+
 #include "common/common_types.h"
 
 #include "video_core/textures/astc.h"
@@ -189,9 +191,13 @@ struct IntegerEncodedValue {
         u32 trit_value;
     };
 };
+using IntegerEncodedVector = boost::container::static_vector<
+    IntegerEncodedValue, 64,
+    boost::container::static_vector_options<
+        boost::container::inplace_alignment<alignof(IntegerEncodedValue)>,
+        boost::container::throw_on_overflow<false>>::type>;
 
-static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
-                            u32 nBitsPerValue) {
+static void DecodeTritBlock(InputBitStream& bits, IntegerEncodedVector& result, u32 nBitsPerValue) {
     // Implement the algorithm in section C.2.12
     u32 m[5];
     u32 t[5];
@@ -249,7 +255,7 @@ static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValu
     }
 }
 
-static void DecodeQus32Block(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
+static void DecodeQus32Block(InputBitStream& bits, IntegerEncodedVector& result,
                              u32 nBitsPerValue) {
     // Implement the algorithm in section C.2.12
     u32 m[3];
@@ -337,8 +343,8 @@ static constexpr std::array EncodingsValues = MakeEncodedValues();
 // Fills result with the values that are encoded in the given
 // bitstream. We must know beforehand what the maximum possible
 // value is, and how many values we're decoding.
-static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, InputBitStream& bits,
-                                  u32 maxRange, u32 nValues) {
+static void DecodeIntegerSequence(IntegerEncodedVector& result, InputBitStream& bits, u32 maxRange,
+                                  u32 nValues) {
     // Determine encoding parameters
     IntegerEncodedValue val = EncodingsValues[maxRange];
 
@@ -895,8 +901,7 @@ static void DecodeColorValues(u32* out, u8* data, const u32* modes, const u32 nP
     }
 
     // We now have enough to decode our integer sequence.
-    std::vector<IntegerEncodedValue> decodedColorValues;
-    decodedColorValues.reserve(32);
+    IntegerEncodedVector decodedColorValues;
 
     InputBitStream colorStream(data);
     DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
@@ -1126,7 +1131,7 @@ static u32 UnquantizeTexelWeight(const IntegerEncodedValue& val) {
     return result;
 }
 
-static void UnquantizeTexelWeights(u32 out[2][144], const std::vector<IntegerEncodedValue>& weights,
+static void UnquantizeTexelWeights(u32 out[2][144], const IntegerEncodedVector& weights,
                                    const TexelWeightParams& params, const u32 blockWidth,
                                    const u32 blockHeight) {
     u32 weightIdx = 0;
@@ -1624,8 +1629,7 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32
         static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1);
     memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
 
-    std::vector<IntegerEncodedValue> texelWeightValues;
-    texelWeightValues.reserve(64);
+    IntegerEncodedVector texelWeightValues;
 
     InputBitStream weightStream(texelWeightData);
 
-- 
cgit v1.2.3


From 6bf5d2b0113b824c6babccc1e44c42a65917af5f Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 9 Apr 2020 05:47:34 -0300
Subject: astc: Hard code bit depth changes to 8 and use fast replicate

---
 src/video_core/textures/astc.cpp | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

(limited to 'src/video_core')

diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index f62d5c987..365bde2f1 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -759,10 +759,10 @@ public:
     // significant bits when going from larger to smaller bit depth
     // or by repeating the most significant bits when going from
     // smaller to larger bit depths.
-    void ChangeBitDepth(const u8 (&depth)[4]) {
+    void ChangeBitDepth() {
         for (u32 i = 0; i < 4; i++) {
-            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
-            m_BitDepth[i] = depth[i];
+            Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i]);
+            m_BitDepth[i] = 8;
         }
     }
 
@@ -774,28 +774,23 @@ public:
 
     // Changes the bit depth of a single component. See the comment
     // above for how we do this.
-    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth, u8 newDepth) {
-        assert(newDepth <= 8);
+    static ChannelType ChangeBitDepth(Pixel::ChannelType val, u8 oldDepth) {
         assert(oldDepth <= 8);
 
-        if (oldDepth == newDepth) {
+        if (oldDepth == 8) {
             // Do nothing
             return val;
-        } else if (oldDepth == 0 && newDepth != 0) {
-            return static_cast<ChannelType>((1 << newDepth) - 1);
-        } else if (newDepth > oldDepth) {
-            return Replicate(val, oldDepth, newDepth);
+        } else if (oldDepth == 0) {
+            return static_cast<ChannelType>((1 << 8) - 1);
+        } else if (8 > oldDepth) {
+            return static_cast<ChannelType>(FastReplicateTo8(static_cast<u32>(val), oldDepth));
         } else {
             // oldDepth > newDepth
-            if (newDepth == 0) {
-                return 0xFF;
-            } else {
-                u8 bitsWasted = static_cast<u8>(oldDepth - newDepth);
-                u16 v = static_cast<u16>(val);
-                v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
-                v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << newDepth) - 1));
-                return static_cast<u8>(v);
-            }
+            const u8 bitsWasted = static_cast<u8>(oldDepth - 8);
+            u16 v = static_cast<u16>(val);
+            v = static_cast<u16>((v + (1 << (bitsWasted - 1))) >> bitsWasted);
+            v = ::std::min<u16>(::std::max<u16>(0, v), static_cast<u16>((1 << 8) - 1));
+            return static_cast<u8>(v);
         }
 
         assert(false && "We shouldn't get here.");
@@ -845,8 +840,7 @@ public:
     // up in the most-significant byte.
     u32 Pack() const {
         Pixel eightBit(*this);
-        const u8 eightBitDepth[4] = {8, 8, 8, 8};
-        eightBit.ChangeBitDepth(eightBitDepth);
+        eightBit.ChangeBitDepth();
 
         u32 r = 0;
         r |= eightBit.A();
-- 
cgit v1.2.3