30 files changed, 663 insertions, 182 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index ac4ec36de..f5ae57039 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -5,6 +5,8 @@ add_library(video_core STATIC
     debug_utils/debug_utils.h
     engines/fermi_2d.cpp
     engines/fermi_2d.h
+    engines/kepler_memory.cpp
+    engines/kepler_memory.h
     engines/maxwell_3d.cpp
     engines/maxwell_3d.h
     engines/maxwell_compute.cpp
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 2625ddfdc..f1aa6091b 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -14,6 +14,7 @@
 #include "core/tracer/recorder.h"
 #include "video_core/command_processor.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
@@ -69,6 +70,9 @@ void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) {
         case EngineID::MAXWELL_DMA_COPY_A:
             maxwell_dma->WriteReg(method, value);
             break;
+        case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+            kepler_memory->WriteReg(method, value);
+            break;
         default:
             UNIMPLEMENTED_MSG("Unimplemented engine");
         }
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index dcf9ef8b9..021b83eaa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -26,7 +26,7 @@ public:
     void WriteReg(u32 method, u32 value);
 
     struct Regs {
-        static constexpr size_t NUM_REGS = 0x258;
+        static constexpr std::size_t NUM_REGS = 0x258;
 
         struct Surface {
             RenderTargetFormat format;
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
new file mode 100644
index 000000000..66ae6332d
--- /dev/null
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -0,0 +1,45 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/logging/log.h"
+#include "core/memory.h"
+#include "video_core/engines/kepler_memory.h"
+
+namespace Tegra::Engines {
+
+KeplerMemory::KeplerMemory(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+KeplerMemory::~KeplerMemory() = default;
+
+void KeplerMemory::WriteReg(u32 method, u32 value) {
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid KeplerMemory register, increase the size of the Regs structure");
+
+    regs.reg_array[method] = value;
+
+    switch (method) {
+    case KEPLERMEMORY_REG_INDEX(exec): {
+        state.write_offset = 0;
+        break;
+    }
+    case KEPLERMEMORY_REG_INDEX(data): {
+        ProcessData(value);
+        break;
+    }
+    }
+}
+
+void KeplerMemory::ProcessData(u32 data) {
+    ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
+    ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
+
+    GPUVAddr address = regs.dest.Address();
+    VAddr dest_address =
+        *memory_manager.GpuToCpuAddress(address + state.write_offset * sizeof(u32));
+
+    Memory::Write32(dest_address, data);
+
+    state.write_offset++;
+}
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
new file mode 100644
index 000000000..b0d0078cf
--- /dev/null
+++ b/src/video_core/engines/kepler_memory.h
@@ -0,0 +1,90 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra::Engines {
+
+#define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
+    (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
+
+class KeplerMemory final {
+public:
+    KeplerMemory(MemoryManager& memory_manager);
+    ~KeplerMemory();
+
+    /// Write the value to the register identified by method.
+    void WriteReg(u32 method, u32 value);
+
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x7F;
+
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0x60);
+
+                u32 line_length_in;
+                u32 line_count;
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 pitch;
+                    u32 block_dimensions;
+                    u32 width;
+                    u32 height;
+                    u32 depth;
+                    u32 z;
+                    u32 x;
+                    u32 y;
+
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } dest;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec;
+
+                u32 data;
+
+                INSERT_PADDING_WORDS(0x11);
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+    struct {
+        u32 write_offset = 0;
+    } state{};
+
+private:
+    MemoryManager& memory_manager;
+
+    void ProcessData(u32 data);
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(line_length_in, 0x60);
+ASSERT_REG_POSITION(line_count, 0x61);
+ASSERT_REG_POSITION(dest, 0x62);
+ASSERT_REG_POSITION(exec, 0x6C);
+ASSERT_REG_POSITION(data, 0x6D);
+#undef ASSERT_REG_POSITION
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 329079ddd..8afd26fe9 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -248,8 +248,8 @@ void Maxwell3D::DrawArrays() {
 
 void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
     // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
-    auto& shader = state.shader_stages[static_cast<size_t>(stage)];
-    auto& bind_data = regs.cb_bind[static_cast<size_t>(stage)];
+    auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
+    auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)];
 
     auto& buffer = shader.const_buffers[bind_data.index];
 
@@ -316,14 +316,14 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
 std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderStage stage) const {
     std::vector<Texture::FullTextureInfo> textures;
 
-    auto& fragment_shader = state.shader_stages[static_cast<size_t>(stage)];
+    auto& fragment_shader = state.shader_stages[static_cast<std::size_t>(stage)];
     auto& tex_info_buffer = fragment_shader.const_buffers[regs.tex_cb_index];
     ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
 
     GPUVAddr tex_info_buffer_end = tex_info_buffer.address + tex_info_buffer.size;
 
     // Offset into the texture constbuffer where the texture info begins.
-    static constexpr size_t TextureInfoOffset = 0x20;
+    static constexpr std::size_t TextureInfoOffset = 0x20;
 
     for (GPUVAddr current_texture = tex_info_buffer.address + TextureInfoOffset;
          current_texture < tex_info_buffer_end; current_texture += sizeof(Texture::TextureHandle)) {
@@ -360,8 +360,9 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
     return textures;
 }
 
-Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, size_t offset) const {
-    auto& shader = state.shader_stages[static_cast<size_t>(stage)];
+Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
+                                                    std::size_t offset) const {
+    auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
     ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index d3be900a4..b81b0723d 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -34,17 +34,17 @@ public:
     /// Register structure of the Maxwell3D engine.
     /// TODO(Subv): This structure will need to be made bigger as more registers are discovered.
     struct Regs {
-        static constexpr size_t NUM_REGS = 0xE00;
-
-        static constexpr size_t NumRenderTargets = 8;
-        static constexpr size_t NumViewports = 16;
-        static constexpr size_t NumCBData = 16;
-        static constexpr size_t NumVertexArrays = 32;
-        static constexpr size_t NumVertexAttributes = 32;
-        static constexpr size_t MaxShaderProgram = 6;
-        static constexpr size_t MaxShaderStage = 5;
+        static constexpr std::size_t NUM_REGS = 0xE00;
+
+        static constexpr std::size_t NumRenderTargets = 8;
+        static constexpr std::size_t NumViewports = 16;
+        static constexpr std::size_t NumCBData = 16;
+        static constexpr std::size_t NumVertexArrays = 32;
+        static constexpr std::size_t NumVertexAttributes = 32;
+        static constexpr std::size_t MaxShaderProgram = 6;
+        static constexpr std::size_t MaxShaderStage = 5;
         // Maximum number of const buffers per shader stage.
-        static constexpr size_t MaxConstBuffers = 18;
+        static constexpr std::size_t MaxConstBuffers = 18;
 
         enum class QueryMode : u32 {
             Write = 0,
@@ -443,9 +443,9 @@ public:
             }
         };
 
-        bool IsShaderConfigEnabled(size_t index) const {
+        bool IsShaderConfigEnabled(std::size_t index) const {
             // The VertexB is always enabled.
-            if (index == static_cast<size_t>(Regs::ShaderProgram::VertexB)) {
+            if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) {
                 return true;
             }
             return shader_config[index].enable != 0;
@@ -571,7 +571,7 @@ public:
                         BitField<25, 3, u32> map_7;
                     };
 
-                    u32 GetMap(size_t index) const {
+                    u32 GetMap(std::size_t index) const {
                         const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3,
                                                                      map_4, map_5, map_6, map_7};
                         ASSERT(index < maps.size());
@@ -925,7 +925,7 @@ public:
     std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;
 
     /// Returns the texture information for a specific texture in a specific shader stage.
-    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, size_t offset) const;
+    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const;
 
 private:
     VideoCore::RasterizerInterface& rasterizer;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index c24d33d5c..aa7481b8c 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -50,7 +50,7 @@ void MaxwellDMA::HandleCopy() {
     ASSERT(regs.dst_params.pos_y == 0);
 
     if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
-        size_t copy_size = regs.x_count;
+        std::size_t copy_size = regs.x_count;
 
         // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
         // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count).
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 7882f16e0..311ccb616 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -23,7 +23,7 @@ public:
     void WriteReg(u32 method, u32 value);
 
     struct Regs {
-        static constexpr size_t NUM_REGS = 0x1D6;
+        static constexpr std::size_t NUM_REGS = 0x1D6;
 
         struct Parameters {
             union {
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 9176a8dbc..6e555ea03 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -20,10 +20,10 @@ namespace Tegra::Shader {
 
 struct Register {
     /// Number of registers
-    static constexpr size_t NumRegisters = 256;
+    static constexpr std::size_t NumRegisters = 256;
 
     /// Register 255 is special cased to always be 0
-    static constexpr size_t ZeroIndex = 255;
+    static constexpr std::size_t ZeroIndex = 255;
 
     enum class Size : u64 {
         Byte = 0,
@@ -67,6 +67,13 @@ private:
     u64 value{};
 };
 
+enum class AttributeSize : u64 {
+    Word = 0,
+    DoubleWord = 1,
+    TripleWord = 2,
+    QuadWord = 3,
+};
+
 union Attribute {
     Attribute() = default;
 
@@ -87,9 +94,10 @@ union Attribute {
     };
 
     union {
+        BitField<20, 10, u64> immediate;
         BitField<22, 2, u64> element;
         BitField<24, 6, Index> index;
-        BitField<47, 3, u64> size;
+        BitField<47, 3, AttributeSize> size;
     } fmt20;
 
     union {
@@ -254,6 +262,15 @@ enum class TextureQueryType : u64 {
     BorderColor = 22,
 };
 
+enum class TextureProcessMode : u64 {
+    None = 0,
+    LZ = 1,  // Unknown, appears to be the same as none.
+    LB = 2,  // Load Bias.
+    LL = 3,  // Load LOD (LevelOfDetail)
+    LBA = 6, // Load Bias. The A is unknown, does not appear to differ with LB
+    LLA = 7  // Load LOD. The A is unknown, does not appear to differ with LL
+};
+
 enum class IpaInterpMode : u64 { Linear = 0, Perspective = 1, Flat = 2, Sc = 3 };
 enum class IpaSampleMode : u64 { Default = 0, Centroid = 1, Offset = 2 };
 
@@ -424,6 +441,45 @@ union Instruction {
     } bfe;
 
     union {
+        BitField<48, 3, u64> pred48;
+
+        union {
+            BitField<20, 20, u64> entry_a;
+            BitField<39, 5, u64> entry_b;
+            BitField<45, 1, u64> neg;
+            BitField<46, 1, u64> uses_cc;
+        } imm;
+
+        union {
+            BitField<20, 14, u64> cb_index;
+            BitField<34, 5, u64> cb_offset;
+            BitField<56, 1, u64> neg;
+            BitField<57, 1, u64> uses_cc;
+        } hi;
+
+        union {
+            BitField<20, 14, u64> cb_index;
+            BitField<34, 5, u64> cb_offset;
+            BitField<39, 5, u64> entry_a;
+            BitField<45, 1, u64> neg;
+            BitField<46, 1, u64> uses_cc;
+        } rz;
+
+        union {
+            BitField<39, 5, u64> entry_a;
+            BitField<45, 1, u64> neg;
+            BitField<46, 1, u64> uses_cc;
+        } r1;
+
+        union {
+            BitField<28, 8, u64> entry_a;
+            BitField<37, 1, u64> neg;
+            BitField<38, 1, u64> uses_cc;
+        } r2;
+
+    } lea;
+
+    union {
         BitField<0, 5, FlowCondition> cond;
     } flow;
 
@@ -478,6 +534,18 @@ union Instruction {
     } psetp;
 
     union {
+        BitField<12, 3, u64> pred12;
+        BitField<15, 1, u64> neg_pred12;
+        BitField<24, 2, PredOperation> cond;
+        BitField<29, 3, u64> pred29;
+        BitField<32, 1, u64> neg_pred29;
+        BitField<39, 3, u64> pred39;
+        BitField<42, 1, u64> neg_pred39;
+        BitField<44, 1, u64> bf;
+        BitField<45, 2, PredOperation> op;
+    } pset;
+
+    union {
         BitField<39, 3, u64> pred39;
         BitField<42, 1, u64> neg_pred;
         BitField<43, 1, u64> neg_a;
@@ -522,8 +590,9 @@ union Instruction {
         BitField<28, 1, u64> array;
         BitField<29, 2, TextureType> texture_type;
         BitField<31, 4, u64> component_mask;
+        BitField<55, 3, TextureProcessMode> process_mode;
 
-        bool IsComponentEnabled(size_t component) const {
+        bool IsComponentEnabled(std::size_t component) const {
             return ((1ull << component) & component_mask) != 0;
         }
     } tex;
@@ -538,7 +607,7 @@ union Instruction {
         BitField<29, 2, TextureType> texture_type;
         BitField<31, 4, u64> component_mask;
 
-        bool IsComponentEnabled(size_t component) const {
+        bool IsComponentEnabled(std::size_t component) const {
             return ((1ull << component) & component_mask) != 0;
         }
     } tmml;
@@ -585,7 +654,7 @@ union Instruction {
             return gpr28.Value() != Register::ZeroIndex;
         }
 
-        bool IsComponentEnabled(size_t component) const {
+        bool IsComponentEnabled(std::size_t component) const {
             static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{
                 {},
                 {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc},
@@ -593,7 +662,7 @@ union Instruction {
                 {0x7, 0xb, 0xd, 0xe, 0xf},
             }};
 
-            size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U};
+            std::size_t index{gpr0.Value() != Register::ZeroIndex ? 1U : 0U};
             index |= gpr28.Value() != Register::ZeroIndex ? 2 : 0;
 
             u32 mask = mask_lut[index][component_mask_selector];
@@ -726,6 +795,11 @@ public:
         ISCADD_C, // Scale and Add
         ISCADD_R,
         ISCADD_IMM,
+        LEA_R1,
+        LEA_R2,
+        LEA_RZ,
+        LEA_IMM,
+        LEA_HI,
         POPC_C,
         POPC_R,
         POPC_IMM,
@@ -784,6 +858,7 @@ public:
         ISET_C,
         ISET_IMM,
         PSETP,
+        PSET,
         XMAD_IMM,
         XMAD_CR,
         XMAD_RC,
@@ -807,6 +882,7 @@ public:
         IntegerSet,
         IntegerSetPredicate,
         PredicateSetPredicate,
+        PredicateSetRegister,
         Conversion,
         Xmad,
         Unknown,
@@ -871,7 +947,7 @@ public:
 private:
     struct Detail {
     private:
-        static constexpr size_t opcode_bitsize = 16;
+        static constexpr std::size_t opcode_bitsize = 16;
 
         /**
          * Generates the mask and the expected value after masking from a given bitstring.
@@ -880,8 +956,8 @@ private:
          */
         static auto GetMaskAndExpect(const char* const bitstring) {
             u16 mask = 0, expect = 0;
-            for (size_t i = 0; i < opcode_bitsize; i++) {
-                const size_t bit_position = opcode_bitsize - i - 1;
+            for (std::size_t i = 0; i < opcode_bitsize; i++) {
+                const std::size_t bit_position = opcode_bitsize - i - 1;
                 switch (bitstring[i]) {
                 case '0':
                     mask |= 1 << bit_position;
@@ -958,6 +1034,11 @@ private:
             INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"),
             INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"),
             INST("0011100-10100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"),
+            INST("0101101111011---", Id::LEA_R2, Type::ArithmeticInteger, "LEA_R2"),
+            INST("0101101111010---", Id::LEA_R1, Type::ArithmeticInteger, "LEA_R1"),
+            INST("001101101101----", Id::LEA_IMM, Type::ArithmeticInteger, "LEA_IMM"),
+            INST("010010111101----", Id::LEA_RZ, Type::ArithmeticInteger, "LEA_RZ"),
+            INST("00011000--------", Id::LEA_HI, Type::ArithmeticInteger, "LEA_HI"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
             INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
             INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),
@@ -1012,6 +1093,7 @@ private:
             INST("010110110101----", Id::ISET_R, Type::IntegerSet, "ISET_R"),
             INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"),
             INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"),
+            INST("0101000010001---", Id::PSET, Type::PredicateSetRegister, "PSET"),
             INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"),
             INST("0011011-00------", Id::XMAD_IMM, Type::Xmad, "XMAD_IMM"),
             INST("0100111---------", Id::XMAD_CR, Type::Xmad, "XMAD_CR"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 86a809f86..baa8b63b7 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
@@ -27,6 +28,7 @@ GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
     fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
     maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(*memory_manager);
 }
 
 GPU::~GPU() = default;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 589a59b4f..5cc1e19ca 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -42,6 +42,7 @@ enum class RenderTargetFormat : u32 {
     R32_UINT = 0xE4,
     R32_FLOAT = 0xE5,
     B5G6R5_UNORM = 0xE8,
+    BGR5A1_UNORM = 0xE9,
     RG8_UNORM = 0xEA,
     RG8_SNORM = 0xEB,
     R16_UNORM = 0xEE,
@@ -102,6 +103,7 @@ class Fermi2D;
 class Maxwell3D;
 class MaxwellCompute;
 class MaxwellDMA;
+class KeplerMemory;
 } // namespace Engines
 
 enum class EngineID {
@@ -146,6 +148,8 @@ private:
     std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
     /// DMA engine
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
+    /// Inline memory engine
+    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
 };
 
 } // namespace Tegra
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h
index 7d836b816..cee0baaf3 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro_interpreter.h
@@ -152,7 +152,7 @@ private:
     boost::optional<u32>
         delayed_pc; ///< Program counter to execute at after the delay slot is executed.
 
-    static constexpr size_t NumMacroRegisters = 8;
+    static constexpr std::size_t NumMacroRegisters = 8;
 
     /// General purpose macro registers.
     std::array<u32, NumMacroRegisters> registers = {};
diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp
index be17a2b9c..0df3725c2 100644
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -19,6 +19,7 @@ void RendererBase::RefreshBaseSettings() {
     UpdateCurrentFramebufferLayout();
 
     renderer_settings.use_framelimiter = Settings::values.use_frame_limit;
+    renderer_settings.set_background_color = true;
 }
 
 void RendererBase::UpdateCurrentFramebufferLayout() {
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index 2a357f9d0..2cd0738ff 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -19,6 +19,7 @@ namespace VideoCore {
 
 struct RendererSettings {
     std::atomic_bool use_framelimiter{false};
+    std::atomic_bool set_background_color{false};
 };
 
 class RendererBase : NonCopyable {
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 0b5d18bcb..578aca789 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -12,10 +12,10 @@
 
 namespace OpenGL {
 
-OGLBufferCache::OGLBufferCache(size_t size) : stream_buffer(GL_ARRAY_BUFFER, size) {}
+OGLBufferCache::OGLBufferCache(std::size_t size) : stream_buffer(GL_ARRAY_BUFFER, size) {}
 
-GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, size_t size, size_t alignment,
-                                      bool cache) {
+GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size,
+                                      std::size_t alignment, bool cache) {
     auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
     const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
 
@@ -53,7 +53,8 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, size_t size, siz
     return uploaded_offset;
 }
 
-GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, size_t size, size_t alignment) {
+GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                          std::size_t alignment) {
     AlignBuffer(alignment);
     std::memcpy(buffer_ptr, raw_pointer, size);
     GLintptr uploaded_offset = buffer_offset;
@@ -63,7 +64,7 @@ GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, size_t size,
     return uploaded_offset;
 }
 
-void OGLBufferCache::Map(size_t max_size) {
+void OGLBufferCache::Map(std::size_t max_size) {
     bool invalidate;
     std::tie(buffer_ptr, buffer_offset_base, invalidate) =
         stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4);
@@ -81,10 +82,10 @@ GLuint OGLBufferCache::GetHandle() const {
     return stream_buffer.GetHandle();
 }
 
-void OGLBufferCache::AlignBuffer(size_t alignment) {
+void OGLBufferCache::AlignBuffer(std::size_t alignment) {
     // Align the offset, not the mapped pointer
     GLintptr offset_aligned =
-        static_cast<GLintptr>(Common::AlignUp(static_cast<size_t>(buffer_offset), alignment));
+        static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment));
     buffer_ptr += offset_aligned - buffer_offset;
     buffer_offset = offset_aligned;
 }
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 6da862902..6c18461f4 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -19,32 +19,32 @@ struct CachedBufferEntry final {
         return addr;
     }
 
-    size_t GetSizeInBytes() const {
+    std::size_t GetSizeInBytes() const {
         return size;
     }
 
     VAddr addr;
-    size_t size;
+    std::size_t size;
     GLintptr offset;
-    size_t alignment;
+    std::size_t alignment;
 };
 
 class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
 public:
-    explicit OGLBufferCache(size_t size);
+    explicit OGLBufferCache(std::size_t size);
 
-    GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, size_t size, size_t alignment = 4,
+    GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                           bool cache = true);
 
-    GLintptr UploadHostMemory(const void* raw_pointer, size_t size, size_t alignment = 4);
+    GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
 
-    void Map(size_t max_size);
+    void Map(std::size_t max_size);
     void Unmap();
 
     GLuint GetHandle() const;
 
 protected:
-    void AlignBuffer(size_t alignment);
+    void AlignBuffer(std::size_t alignment);
 
 private:
     OGLStreamBuffer stream_buffer;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index c59f3af1b..274c2dbcf 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <array>
 #include <memory>
 #include <string>
 #include <string_view>
@@ -45,7 +46,7 @@ MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100,
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo& info)
     : emu_window{window}, screen_info{info}, buffer_cache(STREAM_BUFFER_SIZE) {
     // Create sampler objects
-    for (size_t i = 0; i < texture_samplers.size(); ++i) {
+    for (std::size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
         state.texture_units[i].sampler = texture_samplers[i].sampler.handle;
     }
@@ -58,6 +59,8 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
 
         if (extension == "GL_ARB_direct_state_access") {
             has_ARB_direct_state_access = true;
+        } else if (extension == "GL_ARB_multi_bind") {
+            has_ARB_multi_bind = true;
         } else if (extension == "GL_ARB_separate_shader_objects") {
             has_ARB_separate_shader_objects = true;
         } else if (extension == "GL_ARB_vertex_attrib_binding") {
@@ -178,7 +181,7 @@ void RasterizerOpenGL::SetupShaders() {
     u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage;
     u32 current_texture_bindpoint = 0;
 
-    for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
+    for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
         const auto& shader_config = gpu.regs.shader_config[index];
         const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
 
@@ -187,12 +190,12 @@ void RasterizerOpenGL::SetupShaders() {
             continue;
         }
 
-        const size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
+        const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu.state.shader_stages[stage]);
         const GLintptr offset = buffer_cache.UploadHostMemory(
-            &ubo, sizeof(ubo), static_cast<size_t>(uniform_buffer_alignment));
+            &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
 
         // Bind the buffer
         glBindBufferRange(GL_UNIFORM_BUFFER, stage, buffer_cache.GetHandle(), offset, sizeof(ubo));
@@ -235,10 +238,10 @@ void RasterizerOpenGL::SetupShaders() {
     shader_program_manager->UseTrivialGeometryShader();
 }
 
-size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
+std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
-    size_t size = 0;
+    std::size_t size = 0;
     for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
         if (!regs.vertex_array[index].IsEnabled())
             continue;
@@ -296,7 +299,7 @@ void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
 
 void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb,
                                              bool preserve_contents,
-                                             boost::optional<size_t> single_color_target) {
+                                             boost::optional<std::size_t> single_color_target) {
     MICROPROFILE_SCOPE(OpenGL_Framebuffer);
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
@@ -327,7 +330,7 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep
         } else {
             // Multiple color attachments are enabled
             std::array<GLenum, Maxwell::NumRenderTargets> buffers;
-            for (size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
+            for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
                 Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents);
                 buffers[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index);
                 glFramebufferTexture2D(
@@ -339,7 +342,7 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep
         }
     } else {
         // No color attachments are enabled - zero out all of them
-        for (size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
+        for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
             glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER,
                                    GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index), GL_TEXTURE_2D,
                                    0, 0);
@@ -459,15 +462,15 @@ void RasterizerOpenGL::DrawArrays() {
     state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
-    size_t buffer_size = CalculateVertexArraysSize();
+    std::size_t buffer_size = CalculateVertexArraysSize();
 
     if (is_indexed) {
-        buffer_size = Common::AlignUp<size_t>(buffer_size, 4) + index_buffer_size;
+        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + index_buffer_size;
     }
 
     // Uniform space for the 5 shader stages
     buffer_size =
-        Common::AlignUp<size_t>(buffer_size, 4) +
+        Common::AlignUp<std::size_t>(buffer_size, 4) +
         (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage;
 
     // Add space for at least 18 constant buffers
@@ -641,19 +644,30 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shad
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
-    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<size_t>(stage)];
+    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
     const auto& entries = shader->GetShaderEntries().const_buffer_entries;
 
+    constexpr u64 max_binds = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers;
+    std::array<GLuint, max_binds> bind_buffers;
+    std::array<GLintptr, max_binds> bind_offsets;
+    std::array<GLsizeiptr, max_binds> bind_sizes;
+
+    ASSERT_MSG(entries.size() <= max_binds, "Exceeded expected number of binding points.");
+
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& used_buffer = entries[bindpoint];
         const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
 
         if (!buffer.enabled) {
+            // With disabled buffers set values as zero to unbind them
+            bind_buffers[bindpoint] = 0;
+            bind_offsets[bindpoint] = 0;
+            bind_sizes[bindpoint] = 0;
             continue;
         }
 
-        size_t size = 0;
+        std::size_t size = 0;
 
         if (used_buffer.IsIndirect()) {
             // Buffer is accessed indirectly, so upload the entire thing
@@ -675,17 +689,22 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shad
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
         GLintptr const_buffer_offset = buffer_cache.UploadMemory(
-            buffer.address, size, static_cast<size_t>(uniform_buffer_alignment));
-
-        glBindBufferRange(GL_UNIFORM_BUFFER, current_bindpoint + bindpoint,
-                          buffer_cache.GetHandle(), const_buffer_offset, size);
+            buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment));
 
         // Now configure the bindpoint of the buffer inside the shader
         glUniformBlockBinding(shader->GetProgramHandle(),
                               shader->GetProgramResourceIndex(used_buffer),
                               current_bindpoint + bindpoint);
+
+        // Prepare values for multibind
+        bind_buffers[bindpoint] = buffer_cache.GetHandle();
+        bind_offsets[bindpoint] = const_buffer_offset;
+        bind_sizes[bindpoint] = size;
     }
 
+    glBindBuffersRange(GL_UNIFORM_BUFFER, current_bindpoint, static_cast<GLsizei>(entries.size()),
+                       bind_buffers.data(), bind_offsets.data(), bind_sizes.data());
+
     return current_bindpoint + static_cast<u32>(entries.size());
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 745c3dc0c..bf9560bdc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -73,7 +73,7 @@ public:
     };
 
     /// Maximum supported size that a constbuffer can have in bytes.
-    static constexpr size_t MaxConstbufferSize = 0x10000;
+    static constexpr std::size_t MaxConstbufferSize = 0x10000;
     static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
                   "The maximum size of a constbuffer must be a multiple of the size of GLvec4");
 
@@ -106,7 +106,7 @@ private:
      */
     void ConfigureFramebuffers(bool use_color_fb = true, bool using_depth_fb = true,
                                bool preserve_contents = true,
-                               boost::optional<size_t> single_color_target = {});
+                               boost::optional<std::size_t> single_color_target = {});
 
     /*
      * Configures the current constbuffers to use for the draw command.
@@ -159,6 +159,7 @@ private:
     void SyncLogicOpState();
 
     bool has_ARB_direct_state_access = false;
+    bool has_ARB_multi_bind = false;
     bool has_ARB_separate_shader_objects = false;
     bool has_ARB_vertex_attrib_binding = false;
 
@@ -179,12 +180,12 @@ private:
 
     std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
 
-    static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
+    static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
     OGLFramebuffer framebuffer;
     GLint uniform_buffer_alignment;
 
-    size_t CalculateVertexArraysSize() const;
+    std::size_t CalculateVertexArraysSize() const;
 
     void SetupVertexArrays();
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 29d61eccd..86682d7cb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -53,8 +53,6 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format));
     params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format));
     params.unaligned_height = config.tic.Height();
-    params.cache_width = Common::AlignUp(params.width, 8);
-    params.cache_height = Common::AlignUp(params.height, 8);
     params.target = SurfaceTargetFromTextureType(config.tic.texture_type);
 
     switch (params.target) {
@@ -77,7 +75,7 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     return params;
 }
 
-/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(size_t index) {
+/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(std::size_t index) {
     const auto& config{Core::System::GetInstance().GPU().Maxwell3D().regs.rt[index]};
     SurfaceParams params{};
     params.addr = TryGetCpuAddr(config.Address());
@@ -89,8 +87,6 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     params.width = config.width;
     params.height = config.height;
     params.unaligned_height = config.height;
-    params.cache_width = Common::AlignUp(params.width, 8);
-    params.cache_height = Common::AlignUp(params.height, 8);
     params.target = SurfaceTarget::Texture2D;
     params.depth = 1;
     params.size_in_bytes = params.SizeInBytes();
@@ -110,8 +106,6 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     params.width = zeta_width;
     params.height = zeta_height;
     params.unaligned_height = zeta_height;
-    params.cache_width = Common::AlignUp(params.width, 8);
-    params.cache_height = Common::AlignUp(params.height, 8);
     params.target = SurfaceTarget::Texture2D;
     params.depth = 1;
     params.size_in_bytes = params.SizeInBytes();
@@ -122,7 +116,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
     {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},   // ABGR8UI
-    {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5U
+    {GL_RGB8, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},   // B5G6R5U
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
      false}, // A2B10G10R10U
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5U
@@ -173,6 +167,7 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                                // RG8S
     {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // RG32UI
     {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // R32UI
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X8
 
     // Depth formats
     {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F
@@ -209,7 +204,7 @@ static GLenum SurfaceTargetToGL(SurfaceParams::SurfaceTarget target) {
 }
 
 static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
-    ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
+    ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
     auto& format = tex_format_tuples[static_cast<unsigned int>(pixel_format)];
     ASSERT(component_type == format.component_type);
 
@@ -219,6 +214,7 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType
 static bool IsPixelFormatASTC(PixelFormat format) {
     switch (format) {
     case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8:
         return true;
     default:
         return false;
@@ -229,6 +225,8 @@ static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) {
     switch (format) {
     case PixelFormat::ASTC_2D_4X4:
         return {4, 4};
+    case PixelFormat::ASTC_2D_8X8:
+        return {8, 8};
     default:
         LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
         UNREACHABLE();
@@ -262,7 +260,7 @@ static bool IsFormatBCn(PixelFormat format) {
 }
 
 template <bool morton_to_gl, PixelFormat format>
-void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, size_t gl_buffer_size,
+void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, std::size_t gl_buffer_size,
                 VAddr addr) {
     constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT;
     constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
@@ -273,7 +271,7 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, size_t
         const u32 tile_size{IsFormatBCn(format) ? 4U : 1U};
         const std::vector<u8> data = Tegra::Texture::UnswizzleTexture(
             addr, tile_size, bytes_per_pixel, stride, height, block_height);
-        const size_t size_to_copy{std::min(gl_buffer_size, data.size())};
+        const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())};
         memcpy(gl_buffer, data.data(), size_to_copy);
     } else {
         // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should
@@ -284,7 +282,7 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, size_t
     }
 }
 
-static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
+static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr),
                             SurfaceParams::MaxPixelFormat>
     morton_to_gl_fns = {
         // clang-format off
@@ -333,6 +331,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
         MortonCopy<true, PixelFormat::RG8S>,
         MortonCopy<true, PixelFormat::RG32UI>,
         MortonCopy<true, PixelFormat::R32UI>,
+        MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
         MortonCopy<true, PixelFormat::Z32F>,
         MortonCopy<true, PixelFormat::Z16>,
         MortonCopy<true, PixelFormat::Z24S8>,
@@ -341,7 +340,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
         // clang-format on
 };
 
-static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
+static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr),
                             SurfaceParams::MaxPixelFormat>
     gl_to_morton_fns = {
         // clang-format off
@@ -392,6 +391,7 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, size_t, VAddr),
         MortonCopy<false, PixelFormat::RG8S>,
         MortonCopy<false, PixelFormat::RG32UI>,
         MortonCopy<false, PixelFormat::R32UI>,
+        nullptr,
         MortonCopy<false, PixelFormat::Z32F>,
         MortonCopy<false, PixelFormat::Z16>,
         MortonCopy<false, PixelFormat::Z24S8>,
@@ -477,30 +477,27 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
         // Only pre-create the texture for non-compressed textures.
         switch (params.target) {
         case SurfaceParams::SurfaceTarget::Texture1D:
-            glTexImage1D(SurfaceTargetToGL(params.target), 0, format_tuple.internal_format,
-                         rect.GetWidth(), 0, format_tuple.format, format_tuple.type, nullptr);
+            glTexStorage1D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format,
+                           rect.GetWidth());
             break;
         case SurfaceParams::SurfaceTarget::Texture2D:
-            glTexImage2D(SurfaceTargetToGL(params.target), 0, format_tuple.internal_format,
-                         rect.GetWidth(), rect.GetHeight(), 0, format_tuple.format,
-                         format_tuple.type, nullptr);
+            glTexStorage2D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format,
+                           rect.GetWidth(), rect.GetHeight());
             break;
         case SurfaceParams::SurfaceTarget::Texture3D:
         case SurfaceParams::SurfaceTarget::Texture2DArray:
-            glTexImage3D(SurfaceTargetToGL(params.target), 0, format_tuple.internal_format,
-                         rect.GetWidth(), rect.GetHeight(), params.depth, 0, format_tuple.format,
-                         format_tuple.type, nullptr);
+            glTexStorage3D(SurfaceTargetToGL(params.target), 1, format_tuple.internal_format,
+                           rect.GetWidth(), rect.GetHeight(), params.depth);
             break;
         default:
             LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
                          static_cast<u32>(params.target));
             UNREACHABLE();
-            glTexImage2D(GL_TEXTURE_2D, 0, format_tuple.internal_format, rect.GetWidth(),
-                         rect.GetHeight(), 0, format_tuple.format, format_tuple.type, nullptr);
+            glTexStorage2D(GL_TEXTURE_2D, 1, format_tuple.internal_format, rect.GetWidth(),
+                           rect.GetHeight());
         }
     }
 
-    glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_MAX_LEVEL, 0);
     glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_MIN_FILTER, GL_LINEAR);
     glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
     glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
@@ -522,9 +519,9 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
     S8Z24 input_pixel{};
     Z24S8 output_pixel{};
     constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)};
-    for (size_t y = 0; y < height; ++y) {
-        for (size_t x = 0; x < width; ++x) {
-            const size_t offset{bpp * (y * width + x)};
+    for (std::size_t y = 0; y < height; ++y) {
+        for (std::size_t x = 0; x < width; ++x) {
+            const std::size_t offset{bpp * (y * width + x)};
             std::memcpy(&input_pixel, &data[offset], sizeof(S8Z24));
             output_pixel.s8.Assign(input_pixel.s8);
             output_pixel.z24.Assign(input_pixel.z24);
@@ -535,9 +532,9 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
 
 static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) {
     constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)};
-    for (size_t y = 0; y < height; ++y) {
-        for (size_t x = 0; x < width; ++x) {
-            const size_t offset{bpp * (y * width + x)};
+    for (std::size_t y = 0; y < height; ++y) {
+        for (std::size_t x = 0; x < width; ++x) {
+            const std::size_t offset{bpp * (y * width + x)};
             const u8 temp{data[offset]};
             data[offset] = data[offset + 1];
             data[offset + 1] = temp;
@@ -553,7 +550,8 @@ static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) {
 static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
                                                u32 width, u32 height) {
     switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4: {
+    case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8: {
         // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
         u32 block_width{};
         u32 block_height{};
@@ -600,13 +598,13 @@ void CachedSurface::LoadGLBuffer() {
             UNREACHABLE();
         }
 
-        gl_buffer.resize(static_cast<size_t>(params.depth) * copy_size);
-        morton_to_gl_fns[static_cast<size_t>(params.pixel_format)](
+        gl_buffer.resize(static_cast<std::size_t>(params.depth) * copy_size);
+        morton_to_gl_fns[static_cast<std::size_t>(params.pixel_format)](
             params.width, params.block_height, params.height, gl_buffer.data(), copy_size,
             params.addr);
     } else {
         const u8* const texture_src_data_end{texture_src_data +
-                                             (static_cast<size_t>(params.depth) * copy_size)};
+                                             (static_cast<std::size_t>(params.depth) * copy_size)};
         gl_buffer.assign(texture_src_data, texture_src_data_end);
     }
 
@@ -625,7 +623,7 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle
 
     MICROPROFILE_SCOPE(OpenGL_TextureUL);
 
-    ASSERT(gl_buffer.size() == static_cast<size_t>(params.width) * params.height *
+    ASSERT(gl_buffer.size() == static_cast<std::size_t>(params.width) * params.height *
                                    GetGLBytesPerPixel(params.pixel_format) * params.depth);
 
     const auto& rect{params.GetRect()};
@@ -633,8 +631,9 @@ void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle
     // Load data from memory to the surface
     const GLint x0 = static_cast<GLint>(rect.left);
     const GLint y0 = static_cast<GLint>(rect.bottom);
-    const size_t buffer_offset =
-        static_cast<size_t>(static_cast<size_t>(y0) * params.width + static_cast<size_t>(x0)) *
+    const std::size_t buffer_offset =
+        static_cast<std::size_t>(static_cast<std::size_t>(y0) * params.width +
+                                 static_cast<std::size_t>(x0)) *
         GetGLBytesPerPixel(params.pixel_format);
 
     const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
@@ -736,7 +735,7 @@ Surface RasterizerCacheOpenGL::GetDepthBufferSurface(bool preserve_contents) {
     return GetSurface(depth_params, preserve_contents);
 }
 
-Surface RasterizerCacheOpenGL::GetColorBufferSurface(size_t index, bool preserve_contents) {
+Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool preserve_contents) {
     const auto& regs{Core::System::GetInstance().GPU().Maxwell3D().regs};
 
     ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
@@ -817,20 +816,24 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface,
     // Get a new surface with the new parameters, and blit the previous surface to it
     Surface new_surface{GetUncachedSurface(new_params)};
 
-    // If format is unchanged, we can do a faster blit without reinterpreting pixel data
-    if (params.pixel_format == new_params.pixel_format) {
+    if (params.pixel_format == new_params.pixel_format ||
+        !Settings::values.use_accurate_framebuffers) {
+        // If the format is the same, just do a framebuffer blit. This is significantly faster than
+        // using PBOs. The is also likely less accurate, as textures will be converted rather than
+        // reinterpreted.
+
         BlitTextures(surface->Texture().handle, params.GetRect(), new_surface->Texture().handle,
                      params.GetRect(), params.type, read_framebuffer.handle,
                      draw_framebuffer.handle);
-        return new_surface;
-    }
+    } else {
+        // When use_accurate_framebuffers setting is enabled, perform a more accurate surface copy,
+        // where pixels are reinterpreted as a new format (without conversion). This code path uses
+        // OpenGL PBOs and is quite slow.
 
-    // When using accurate framebuffers, always copy old data to new surface, regardless of format
-    if (Settings::values.use_accurate_framebuffers) {
         auto source_format = GetFormatTuple(params.pixel_format, params.component_type);
         auto dest_format = GetFormatTuple(new_params.pixel_format, new_params.component_type);
 
-        size_t buffer_size = std::max(params.SizeInBytes(), new_params.SizeInBytes());
+        std::size_t buffer_size = std::max(params.SizeInBytes(), new_params.SizeInBytes());
 
         glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo.handle);
         glBufferData(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, GL_STREAM_DRAW_ARB);
@@ -854,7 +857,7 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface,
                 LOG_DEBUG(HW_GPU, "Trying to upload extra texture data from the CPU during "
                                   "reinterpretation but the texture is tiled.");
             }
-            size_t remaining_size = new_params.SizeInBytes() - params.SizeInBytes();
+            std::size_t remaining_size = new_params.SizeInBytes() - params.SizeInBytes();
             std::vector<u8> data(remaining_size);
             Memory::ReadBlock(new_params.addr + params.SizeInBytes(), data.data(), data.size());
             glBufferSubData(GL_PIXEL_PACK_BUFFER, params.SizeInBytes(), remaining_size,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index e660998d0..d7a4bc37f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -70,19 +70,20 @@ struct SurfaceParams {
         RG8S = 42,
         RG32UI = 43,
         R32UI = 44,
+        ASTC_2D_8X8 = 45,
 
         MaxColorFormat,
 
         // Depth formats
-        Z32F = 45,
-        Z16 = 46,
+        Z32F = 46,
+        Z16 = 47,
 
         MaxDepthFormat,
 
         // DepthStencil formats
-        Z24S8 = 47,
-        S8Z24 = 48,
-        Z32FS8 = 49,
+        Z24S8 = 48,
+        S8Z24 = 49,
+        Z32FS8 = 50,
 
         MaxDepthStencilFormat,
 
@@ -90,7 +91,7 @@ struct SurfaceParams {
         Invalid = 255,
     };
 
-    static constexpr size_t MaxPixelFormat = static_cast<size_t>(PixelFormat::Max);
+    static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max);
 
     enum class ComponentType {
         Invalid = 0,
@@ -192,6 +193,7 @@ struct SurfaceParams {
             1, // RG8S
             1, // RG32UI
             1, // R32UI
+            4, // ASTC_2D_8X8
             1, // Z32F
             1, // Z16
             1, // Z24S8
@@ -199,8 +201,8 @@ struct SurfaceParams {
             1, // Z32FS8
         }};
 
-        ASSERT(static_cast<size_t>(format) < compression_factor_table.size());
-        return compression_factor_table[static_cast<size_t>(format)];
+        ASSERT(static_cast<std::size_t>(format) < compression_factor_table.size());
+        return compression_factor_table[static_cast<std::size_t>(format)];
     }
 
     static constexpr u32 GetFormatBpp(PixelFormat format) {
@@ -253,6 +255,7 @@ struct SurfaceParams {
             16,  // RG8S
             64,  // RG32UI
             32,  // R32UI
+            16,  // ASTC_2D_8X8
             32,  // Z32F
             16,  // Z16
             32,  // Z24S8
@@ -260,8 +263,8 @@ struct SurfaceParams {
             64,  // Z32FS8
         }};
 
-        ASSERT(static_cast<size_t>(format) < bpp_table.size());
-        return bpp_table[static_cast<size_t>(format)];
+        ASSERT(static_cast<std::size_t>(format) < bpp_table.size());
+        return bpp_table[static_cast<std::size_t>(format)];
     }
 
     u32 GetFormatBpp() const {
@@ -316,6 +319,8 @@ struct SurfaceParams {
             return PixelFormat::R11FG11FB10F;
         case Tegra::RenderTargetFormat::B5G6R5_UNORM:
             return PixelFormat::B5G6R5U;
+        case Tegra::RenderTargetFormat::BGR5A1_UNORM:
+            return PixelFormat::A1B5G5R5U;
         case Tegra::RenderTargetFormat::RGBA32_UINT:
             return PixelFormat::RGBA32UI;
         case Tegra::RenderTargetFormat::R8_UNORM:
@@ -522,6 +527,8 @@ struct SurfaceParams {
             return PixelFormat::BC6H_SF16;
         case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
             return PixelFormat::ASTC_2D_4X4;
+        case Tegra::Texture::TextureFormat::ASTC_2D_8X8:
+            return PixelFormat::ASTC_2D_8X8;
         case Tegra::Texture::TextureFormat::R16_G16:
             switch (component_type) {
             case Tegra::Texture::ComponentType::FLOAT:
@@ -576,6 +583,7 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::RG16_UNORM:
         case Tegra::RenderTargetFormat::R16_UNORM:
         case Tegra::RenderTargetFormat::B5G6R5_UNORM:
+        case Tegra::RenderTargetFormat::BGR5A1_UNORM:
         case Tegra::RenderTargetFormat::RG8_UNORM:
         case Tegra::RenderTargetFormat::RGBA16_UNORM:
             return ComponentType::UNorm;
@@ -636,16 +644,18 @@ struct SurfaceParams {
     }
 
     static SurfaceType GetFormatType(PixelFormat pixel_format) {
-        if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxColorFormat)) {
+        if (static_cast<std::size_t>(pixel_format) <
+            static_cast<std::size_t>(PixelFormat::MaxColorFormat)) {
             return SurfaceType::ColorTexture;
         }
 
-        if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxDepthFormat)) {
+        if (static_cast<std::size_t>(pixel_format) <
+            static_cast<std::size_t>(PixelFormat::MaxDepthFormat)) {
             return SurfaceType::Depth;
         }
 
-        if (static_cast<size_t>(pixel_format) <
-            static_cast<size_t>(PixelFormat::MaxDepthStencilFormat)) {
+        if (static_cast<std::size_t>(pixel_format) <
+            static_cast<std::size_t>(PixelFormat::MaxDepthStencilFormat)) {
             return SurfaceType::DepthStencil;
         }
 
@@ -659,7 +669,7 @@ struct SurfaceParams {
     MathUtil::Rectangle<u32> GetRect() const;
 
     /// Returns the size of this surface in bytes, adjusted for compression
-    size_t SizeInBytes() const {
+    std::size_t SizeInBytes() const {
         const u32 compression_factor{GetCompressionFactor(pixel_format)};
         ASSERT(width % compression_factor == 0);
         ASSERT(height % compression_factor == 0);
@@ -671,7 +681,7 @@ struct SurfaceParams {
     static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config);
 
     /// Creates SurfaceParams from a framebuffer configuration
-    static SurfaceParams CreateForFramebuffer(size_t index);
+    static SurfaceParams CreateForFramebuffer(std::size_t index);
 
     /// Creates SurfaceParams for a depth buffer configuration
     static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,
@@ -680,8 +690,8 @@ struct SurfaceParams {
 
     /// Checks if surfaces are compatible for caching
     bool IsCompatibleSurface(const SurfaceParams& other) const {
-        return std::tie(pixel_format, type, cache_width, cache_height) ==
-               std::tie(other.pixel_format, other.type, other.cache_width, other.cache_height);
+        return std::tie(pixel_format, type, width, height) ==
+               std::tie(other.pixel_format, other.type, other.width, other.height);
     }
 
     VAddr addr;
@@ -694,12 +704,8 @@ struct SurfaceParams {
     u32 height;
     u32 depth;
     u32 unaligned_height;
-    size_t size_in_bytes;
+    std::size_t size_in_bytes;
     SurfaceTarget target;
-
-    // Parameters used for caching only
-    u32 cache_width;
-    u32 cache_height;
 };
 
 }; // namespace OpenGL
@@ -715,7 +721,7 @@ struct SurfaceReserveKey : Common::HashableStruct<OpenGL::SurfaceParams> {
 namespace std {
 template <>
 struct hash<SurfaceReserveKey> {
-    size_t operator()(const SurfaceReserveKey& k) const {
+    std::size_t operator()(const SurfaceReserveKey& k) const {
         return k.Hash();
     }
 };
@@ -731,7 +737,7 @@ public:
         return params.addr;
     }
 
-    size_t GetSizeInBytes() const {
+    std::size_t GetSizeInBytes() const {
         return params.size_in_bytes;
     }
 
@@ -779,7 +785,7 @@ public:
     Surface GetDepthBufferSurface(bool preserve_contents);
 
     /// Get the color surface based on the framebuffer configuration and the specified render target
-    Surface GetColorBufferSurface(size_t index, bool preserve_contents);
+    Surface GetColorBufferSurface(std::size_t index, bool preserve_contents);
 
     /// Flushes the surface to Switch memory
     void FlushSurface(const Surface& surface);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 61080f5cc..894fe6eae 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -14,7 +14,7 @@ namespace OpenGL {
 /// Gets the address for the specified shader stage program
 static VAddr GetShaderAddress(Maxwell::ShaderProgram program) {
     const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
-    const auto& shader_config = gpu.regs.shader_config[static_cast<size_t>(program)];
+    const auto& shader_config = gpu.regs.shader_config[static_cast<std::size_t>(program)];
     return *gpu.memory_manager.GpuToCpuAddress(gpu.regs.code_address.CodeAddress() +
                                                shader_config.offset);
 }
@@ -28,7 +28,7 @@ static GLShader::ProgramCode GetShaderCode(VAddr addr) {
 
 /// Helper function to set shader uniform block bindings for a single shader stage
 static void SetShaderUniformBlockBinding(GLuint shader, const char* name,
-                                         Maxwell::ShaderStage binding, size_t expected_size) {
+                                         Maxwell::ShaderStage binding, std::size_t expected_size) {
     const GLuint ub_index = glGetUniformBlockIndex(shader, name);
     if (ub_index == GL_INVALID_INDEX) {
         return;
@@ -36,7 +36,7 @@ static void SetShaderUniformBlockBinding(GLuint shader, const char* name,
 
     GLint ub_size = 0;
     glGetActiveUniformBlockiv(shader, ub_index, GL_UNIFORM_BLOCK_DATA_SIZE, &ub_size);
-    ASSERT_MSG(static_cast<size_t>(ub_size) == expected_size,
+    ASSERT_MSG(static_cast<std::size_t>(ub_size) == expected_size,
                "Uniform block size did not match! Got {}, expected {}", ub_size, expected_size);
     glUniformBlockBinding(shader, ub_index, static_cast<GLuint>(binding));
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 6e6febcbc..9bafe43a9 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -28,7 +28,7 @@ public:
     }
 
     /// Gets the size of the shader in guest memory, required for cache management
-    size_t GetSizeInBytes() const {
+    std::size_t GetSizeInBytes() const {
         return GLShader::MAX_PROGRAM_CODE_LENGTH * sizeof(u64);
     }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 434faf9d4..a1638c12e 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -190,7 +190,7 @@ public:
 
 private:
     void AppendIndentation() {
-        shader_source.append(static_cast<size_t>(scope) * 4, ' ');
+        shader_source.append(static_cast<std::size_t>(scope) * 4, ' ');
     }
 
     std::string shader_source;
@@ -209,7 +209,7 @@ public:
         UnsignedInteger,
     };
 
-    GLSLRegister(size_t index, const std::string& suffix) : index{index}, suffix{suffix} {}
+    GLSLRegister(std::size_t index, const std::string& suffix) : index{index}, suffix{suffix} {}
 
     /// Gets the GLSL type string for a register
     static std::string GetTypeString() {
@@ -227,12 +227,12 @@ public:
     }
 
     /// Returns the index of the register
-    size_t GetIndex() const {
+    std::size_t GetIndex() const {
         return index;
     }
 
 private:
-    const size_t index;
+    const std::size_t index;
     const std::string& suffix;
 };
 
@@ -469,7 +469,7 @@ public:
     /// necessary.
     std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type,
                               bool is_array) {
-        const size_t offset = static_cast<size_t>(sampler.index.Value());
+        const std::size_t offset = static_cast<std::size_t>(sampler.index.Value());
 
         // If this sampler has already been used, return the existing mapping.
         const auto itr =
@@ -482,7 +482,7 @@ public:
         }
 
         // Otherwise create a new mapping for this sampler
-        const size_t next_index = used_samplers.size();
+        const std::size_t next_index = used_samplers.size();
         const SamplerEntry entry{stage, offset, next_index, type, is_array};
         used_samplers.emplace_back(entry);
         return entry.GetName();
@@ -532,7 +532,7 @@ private:
     void BuildRegisterList() {
         regs.reserve(Register::NumRegisters);
 
-        for (size_t index = 0; index < Register::NumRegisters; ++index) {
+        for (std::size_t index = 0; index < Register::NumRegisters; ++index) {
             regs.emplace_back(index, suffix);
         }
     }
@@ -846,7 +846,7 @@ private:
      */
     bool IsSchedInstruction(u32 offset) const {
         // sched instructions appear once every 4 instructions.
-        static constexpr size_t SchedPeriod = 4;
+        static constexpr std::size_t SchedPeriod = 4;
         u32 absolute_offset = offset - main_offset;
 
         return (absolute_offset % SchedPeriod) == 0;
@@ -914,7 +914,7 @@ private:
         std::string result;
         result += '(';
 
-        for (size_t i = 0; i < shift_amounts.size(); ++i) {
+        for (std::size_t i = 0; i < shift_amounts.size(); ++i) {
             if (i)
                 result += '|';
             result += "(((" + imm_lut + " >> (((" + op_c + " >> " + shift_amounts[i] +
@@ -940,7 +940,7 @@ private:
 
         ASSERT_MSG(instr.texs.nodep == 0, "TEXS nodep not implemented");
 
-        size_t written_components = 0;
+        std::size_t written_components = 0;
         for (u32 component = 0; component < 4; ++component) {
             if (!instr.texs.IsComponentEnabled(component)) {
                 continue;
@@ -1487,6 +1487,71 @@ private:
                                           1, 1);
                 break;
             }
+            case OpCode::Id::LEA_R2:
+            case OpCode::Id::LEA_R1:
+            case OpCode::Id::LEA_IMM:
+            case OpCode::Id::LEA_RZ:
+            case OpCode::Id::LEA_HI: {
+                std::string op_c;
+
+                switch (opcode->GetId()) {
+                case OpCode::Id::LEA_R2: {
+                    op_a = regs.GetRegisterAsInteger(instr.gpr20);
+                    op_b = regs.GetRegisterAsInteger(instr.gpr39);
+                    op_c = std::to_string(instr.lea.r2.entry_a);
+                    break;
+                }
+
+                case OpCode::Id::LEA_R1: {
+                    const bool neg = instr.lea.r1.neg != 0;
+                    op_a = regs.GetRegisterAsInteger(instr.gpr8);
+                    if (neg)
+                        op_a = "-(" + op_a + ')';
+                    op_b = regs.GetRegisterAsInteger(instr.gpr20);
+                    op_c = std::to_string(instr.lea.r1.entry_a);
+                    break;
+                }
+
+                case OpCode::Id::LEA_IMM: {
+                    const bool neg = instr.lea.imm.neg != 0;
+                    op_b = regs.GetRegisterAsInteger(instr.gpr8);
+                    if (neg)
+                        op_b = "-(" + op_b + ')';
+                    op_a = std::to_string(instr.lea.imm.entry_a);
+                    op_c = std::to_string(instr.lea.imm.entry_b);
+                    break;
+                }
+
+                case OpCode::Id::LEA_RZ: {
+                    const bool neg = instr.lea.rz.neg != 0;
+                    op_b = regs.GetRegisterAsInteger(instr.gpr8);
+                    if (neg)
+                        op_b = "-(" + op_b + ')';
+                    op_a = regs.GetUniform(instr.lea.rz.cb_index, instr.lea.rz.cb_offset,
+                                           GLSLRegister::Type::Integer);
+                    op_c = std::to_string(instr.lea.rz.entry_a);
+
+                    break;
+                }
+
+                case OpCode::Id::LEA_HI:
+                default: {
+                    op_b = regs.GetRegisterAsInteger(instr.gpr8);
+                    op_a = std::to_string(instr.lea.imm.entry_a);
+                    op_c = std::to_string(instr.lea.imm.entry_b);
+                    LOG_CRITICAL(HW_GPU, "Unhandled LEA subinstruction: {}", opcode->GetName());
+                    UNREACHABLE();
+                }
+                }
+                if (instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex)) {
+                    LOG_ERROR(HW_GPU, "Unhandled LEA Predicate");
+                    UNREACHABLE();
+                }
+                const std::string value = '(' + op_a + " + (" + op_b + "*(1 << " + op_c + ")))";
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, value, 1, 1);
+
+                break;
+            }
             default: {
                 LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
                              opcode->GetName());
@@ -1687,13 +1752,34 @@ private:
         case OpCode::Type::Memory: {
             switch (opcode->GetId()) {
             case OpCode::Id::LD_A: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
                 // Note: Shouldn't this be interp mode flat? As in no interpolation made.
+                ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex,
+                           "Indirect attribute loads are not supported");
+                ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0,
+                           "Unaligned attribute loads are not supported");
 
                 Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Perspective,
                                                   Tegra::Shader::IpaSampleMode::Default};
-                regs.SetRegisterToInputAttibute(instr.gpr0, instr.attribute.fmt20.element,
-                                                instr.attribute.fmt20.index, input_mode);
+
+                u32 next_element = instr.attribute.fmt20.element;
+                u32 next_index = static_cast<u32>(instr.attribute.fmt20.index.Value());
+
+                const auto LoadNextElement = [&](u32 reg_offset) {
+                    regs.SetRegisterToInputAttibute(instr.gpr0.Value() + reg_offset, next_element,
+                                                    static_cast<Attribute::Index>(next_index),
+                                                    input_mode);
+
+                    // Load the next attribute element into the following register. If the element
+                    // to load goes beyond the vec4 size, load the first element of the next
+                    // attribute.
+                    next_element = (next_element + 1) % 4;
+                    next_index = next_index + (next_element == 0 ? 1 : 0);
+                };
+
+                const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
+                for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
+                    LoadNextElement(reg_offset);
+                }
                 break;
             }
             case OpCode::Id::LD_C: {
@@ -1735,9 +1821,31 @@ private:
                 break;
             }
             case OpCode::Id::ST_A: {
-                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
-                regs.SetOutputAttributeToRegister(instr.attribute.fmt20.index,
-                                                  instr.attribute.fmt20.element, instr.gpr0);
+                ASSERT_MSG(instr.gpr8.Value() == Register::ZeroIndex,
+                           "Indirect attribute loads are not supported");
+                ASSERT_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) == 0,
+                           "Unaligned attribute loads are not supported");
+
+                u32 next_element = instr.attribute.fmt20.element;
+                u32 next_index = static_cast<u32>(instr.attribute.fmt20.index.Value());
+
+                const auto StoreNextElement = [&](u32 reg_offset) {
+                    regs.SetOutputAttributeToRegister(static_cast<Attribute::Index>(next_index),
+                                                      next_element,
+                                                      instr.gpr0.Value() + reg_offset);
+
+                    // Load the next attribute element into the following register. If the element
+                    // to load goes beyond the vec4 size, load the first element of the next
+                    // attribute.
+                    next_element = (next_element + 1) % 4;
+                    next_index = next_index + (next_element == 0 ? 1 : 0);
+                };
+
+                const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
+                for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
+                    StoreNextElement(reg_offset);
+                }
+
                 break;
             }
             case OpCode::Id::TEX: {
@@ -1768,17 +1876,49 @@ private:
                     coord = "vec2 coords = vec2(" + x + ", " + y + ");";
                     texture_type = Tegra::Shader::TextureType::Texture2D;
                 }
+                // TODO: make sure coordinates are always indexed to gpr8 and gpr20 is always bias
+                // or lod.
+                const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20);
 
                 const std::string sampler = GetSampler(instr.sampler, texture_type, false);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
+
                 shader.AddLine("{");
                 ++shader.scope;
                 shader.AddLine(coord);
-                const std::string texture = "texture(" + sampler + ", coords)";
+                std::string texture;
 
-                size_t dest_elem{};
-                for (size_t elem = 0; elem < 4; ++elem) {
+                switch (instr.tex.process_mode) {
+                case Tegra::Shader::TextureProcessMode::None: {
+                    texture = "texture(" + sampler + ", coords)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LZ: {
+                    texture = "textureLod(" + sampler + ", coords, 0.0)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LB:
+                case Tegra::Shader::TextureProcessMode::LBA: {
+                    // TODO: Figure if A suffix changes the equation at all.
+                    texture = "texture(" + sampler + ", coords, " + op_c + ')';
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LL:
+                case Tegra::Shader::TextureProcessMode::LLA: {
+                    // TODO: Figure if A suffix changes the equation at all.
+                    texture = "textureLod(" + sampler + ", coords, " + op_c + ')';
+                    break;
+                }
+                default: {
+                    texture = "texture(" + sampler + ", coords)";
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}",
+                                 static_cast<u32>(instr.tex.process_mode.Value()));
+                    UNREACHABLE();
+                }
+                }
+                std::size_t dest_elem{};
+                for (std::size_t elem = 0; elem < 4; ++elem) {
                     if (!instr.tex.IsComponentEnabled(elem)) {
                         // Skip disabled components
                         continue;
@@ -1882,8 +2022,8 @@ private:
                 const std::string texture = "textureGather(" + sampler + ", coords, " +
                                             std::to_string(instr.tld4.component) + ')';
 
-                size_t dest_elem{};
-                for (size_t elem = 0; elem < 4; ++elem) {
+                std::size_t dest_elem{};
+                for (std::size_t elem = 0; elem < 4; ++elem) {
                     if (!instr.tex.IsComponentEnabled(elem)) {
                         // Skip disabled components
                         continue;
@@ -2069,6 +2209,30 @@ private:
             }
             break;
         }
+        case OpCode::Type::PredicateSetRegister: {
+            const std::string op_a =
+                GetPredicateCondition(instr.pset.pred12, instr.pset.neg_pred12 != 0);
+            const std::string op_b =
+                GetPredicateCondition(instr.pset.pred29, instr.pset.neg_pred29 != 0);
+
+            const std::string second_pred =
+                GetPredicateCondition(instr.pset.pred39, instr.pset.neg_pred39 != 0);
+
+            const std::string combiner = GetPredicateCombiner(instr.pset.op);
+
+            const std::string predicate =
+                '(' + op_a + ") " + GetPredicateCombiner(instr.pset.cond) + " (" + op_b + ')';
+            const std::string result = '(' + predicate + ") " + combiner + " (" + second_pred + ')';
+            if (instr.pset.bf == 0) {
+                const std::string value = '(' + result + ") ? 0xFFFFFFFF : 0";
+                regs.SetRegisterToInteger(instr.gpr0, false, 0, value, 1, 1);
+            } else {
+                const std::string value = '(' + result + ") ? 1.0 : 0.0";
+                regs.SetRegisterToFloat(instr.gpr0, 0, value, 1, 1);
+            }
+
+            break;
+        }
         case OpCode::Type::PredicateSetPredicate: {
             const std::string op_a =
                 GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0);
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index a43e2997b..d53b93ad5 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -13,7 +13,7 @@
 
 namespace OpenGL::GLShader {
 
-constexpr size_t MAX_PROGRAM_CODE_LENGTH{0x1000};
+constexpr std::size_t MAX_PROGRAM_CODE_LENGTH{0x1000};
 using ProgramCode = std::vector<u64>;
 
 class ConstBufferEntry {
@@ -51,7 +51,7 @@ public:
     }
 
     std::string GetName() const {
-        return BufferBaseNames[static_cast<size_t>(stage)] + std::to_string(index);
+        return BufferBaseNames[static_cast<std::size_t>(stage)] + std::to_string(index);
     }
 
     u32 GetHash() const {
@@ -74,15 +74,15 @@ class SamplerEntry {
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 public:
-    SamplerEntry(Maxwell::ShaderStage stage, size_t offset, size_t index,
+    SamplerEntry(Maxwell::ShaderStage stage, std::size_t offset, std::size_t index,
                  Tegra::Shader::TextureType type, bool is_array)
         : offset(offset), stage(stage), sampler_index(index), type(type), is_array(is_array) {}
 
-    size_t GetOffset() const {
+    std::size_t GetOffset() const {
         return offset;
     }
 
-    size_t GetIndex() const {
+    std::size_t GetIndex() const {
         return sampler_index;
     }
 
@@ -91,7 +91,7 @@ public:
     }
 
     std::string GetName() const {
-        return std::string(TextureSamplerNames[static_cast<size_t>(stage)]) + '_' +
+        return std::string(TextureSamplerNames[static_cast<std::size_t>(stage)]) + '_' +
                std::to_string(sampler_index);
     }
 
@@ -133,7 +133,7 @@ public:
     }
 
     static std::string GetArrayName(Maxwell::ShaderStage stage) {
-        return TextureSamplerNames[static_cast<size_t>(stage)];
+        return TextureSamplerNames[static_cast<std::size_t>(stage)];
     }
 
 private:
@@ -143,9 +143,9 @@ private:
 
     /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
     /// instruction.
-    size_t offset;
+    std::size_t offset;
     Maxwell::ShaderStage stage;      ///< Shader stage where this sampler was used.
-    size_t sampler_index;            ///< Value used to index into the generated GLSL sampler array.
+    std::size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
     Tegra::Shader::TextureType type; ///< The type used to sample this texture (Texture2D, etc)
     bool is_array; ///< Whether the texture is being sampled as an array texture or not.
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 533e42caa..b86cd96e8 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -12,7 +12,7 @@
 namespace OpenGL::GLShader {
 
 /// Number of OpenGL texture samplers that can be used in the fragment shader
-static constexpr size_t NumTextureSamplers = 32;
+static constexpr std::size_t NumTextureSamplers = 32;
 
 using Tegra::Engines::Maxwell3D;
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 6f70deb96..af99132ba 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -272,7 +272,7 @@ void OpenGLState::Apply() const {
     }
 
     // Clip distance
-    for (size_t i = 0; i < clip_distance.size(); ++i) {
+    for (std::size_t i = 0; i < clip_distance.size(); ++i) {
         if (clip_distance[i] != cur_state.clip_distance[i]) {
             if (clip_distance[i]) {
                 glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index aadf68f16..664f3ca20 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -61,7 +61,7 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
     mapped_size = size;
 
     if (alignment > 0) {
-        buffer_pos = Common::AlignUp<size_t>(buffer_pos, alignment);
+        buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
     }
 
     bool invalidate = false;
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index ccff3e342..96d916b07 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -369,6 +369,12 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
  * Draws the emulated screens to the emulator window.
  */
 void RendererOpenGL::DrawScreen() {
+    if (renderer_settings.set_background_color) {
+        // Update background color before drawing
+        glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
+                     0.0f);
+    }
+
     const auto& layout = render_window.GetFramebufferLayout();
     const auto& screen = layout.screen;
 
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 272294c62..20ba6d4f6 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -46,6 +46,48 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_
     }
 }
 
+template <std::size_t N, std::size_t M>
+struct alignas(64) SwizzleTable {
+    constexpr SwizzleTable() {
+        for (u32 y = 0; y < N; ++y) {
+            for (u32 x = 0; x < M; ++x) {
+                const u32 x2 = x * 16;
+                values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
+                                                ((x2 % 32) / 16) * 32 + (y % 2) * 16);
+            }
+        }
+    }
+    const std::array<u16, M>& operator[](std::size_t index) const {
+        return values[index];
+    }
+    std::array<std::array<u16, M>, N> values{};
+};
+
+constexpr auto swizzle_table = SwizzleTable<8, 4>();
+
+void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_data,
+                     u8* unswizzled_data, bool unswizzle, u32 block_height) {
+    std::array<u8*, 2> data_ptrs;
+    const std::size_t stride{width * bytes_per_pixel};
+    const std::size_t image_width_in_gobs{(stride + 63) / 64};
+    const std::size_t copy_size{16};
+    for (std::size_t y = 0; y < height; ++y) {
+        const std::size_t initial_gob =
+            (y / (8 * block_height)) * 512 * block_height * image_width_in_gobs +
+            (y % (8 * block_height) / 8) * 512;
+        const std::size_t pixel_base{y * width * bytes_per_pixel};
+        const auto& table = swizzle_table[y % 8];
+        for (std::size_t xb = 0; xb < stride; xb += copy_size) {
+            const std::size_t gob_address{initial_gob + (xb / 64) * 512 * block_height};
+            const std::size_t swizzle_offset{gob_address + table[(xb / 16) % 4]};
+            const std::size_t pixel_index{xb + pixel_base};
+            data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
+            data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
+            std::memcpy(data_ptrs[0], data_ptrs[1], copy_size);
+        }
+    }
+}
+
 u32 BytesPerPixel(TextureFormat format) {
     switch (format) {
     case TextureFormat::DXT1:
@@ -63,6 +105,7 @@ u32 BytesPerPixel(TextureFormat format) {
     case TextureFormat::R32_G32_B32:
         return 12;
     case TextureFormat::ASTC_2D_4X4:
+    case TextureFormat::ASTC_2D_8X8:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::BF10GF11RF11:
@@ -91,8 +134,13 @@ u32 BytesPerPixel(TextureFormat format) {
 std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width,
                                  u32 height, u32 block_height) {
     std::vector<u8> unswizzled_data(width * height * bytes_per_pixel);
-    CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel,
-                     Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
+    if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % 16 == 0) {
+        FastSwizzleData(width / tile_size, height / tile_size, bytes_per_pixel,
+                        Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
+    } else {
+        CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel,
+                         Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
+    }
     return unswizzled_data;
 }
 
@@ -111,6 +159,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
     case TextureFormat::BC6H_UF16:
     case TextureFormat::BC6H_SF16:
     case TextureFormat::ASTC_2D_4X4:
+    case TextureFormat::ASTC_2D_8X8:
     case TextureFormat::A8R8G8B8:
     case TextureFormat::A2B10G10R10:
     case TextureFormat::A1B5G5R5: