59 files changed, 1372 insertions, 448 deletions
diff --git a/.gitmodules b/.gitmodules
index bf3b80d59..2ec9dda62 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -28,3 +28,6 @@
 [submodule "libzip"]
     path = externals/libzip/libzip
     url = https://github.com/nih-at/libzip.git
+[submodule "xbyak"]
+    path = externals/xbyak
+    url = https://github.com/herumi/xbyak.git
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 0b40cd1b0..df7a5e0a9 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -75,3 +75,11 @@ if (ENABLE_WEB_SERVICE)
     target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
     target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
 endif()
+
+if (NOT TARGET xbyak)
+    if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+        add_library(xbyak INTERFACE)
+        target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+    endif()
+endif()
diff --git a/externals/sirit b/externals/sirit
-Subproject 414fc4dbd28d8fe48f735a0c389db8a234f733c
+Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c
diff --git a/externals/xbyak b/externals/xbyak
new file mode 160000
+Subproject 82b70e665918efc2ee348091742fd0237b3b68c
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index e6769a5f3..24b7a083c 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -123,6 +123,8 @@ add_library(common STATIC
     lz4_compression.cpp
     lz4_compression.h
     math_util.h
+    memory_detect.cpp
+    memory_detect.h
     memory_hook.cpp
     memory_hook.h
     microprofile.cpp
@@ -169,10 +171,12 @@ if(ARCHITECTURE_x86_64)
         PRIVATE
             x64/cpu_detect.cpp
             x64/cpu_detect.h
+            x64/xbyak_abi.h
+            x64/xbyak_util.h
     )
 endif()
 
 create_target_directory_groups(common)
 
 target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
-target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
+target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
diff --git a/src/common/memory_detect.cpp b/src/common/memory_detect.cpp
new file mode 100644
index 000000000..3fdc309a2
--- /dev/null
+++ b/src/common/memory_detect.cpp
@@ -0,0 +1,60 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#ifdef _WIN32
+// clang-format off
+#include <windows.h>
+#include <sysinfoapi.h>
+// clang-format on
+#else
+#include <sys/types.h>
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+#include "common/memory_detect.h"
+
+namespace Common {
+
+// Detects the RAM and Swapfile sizes
+static MemoryInfo Detect() {
+    MemoryInfo mem_info{};
+
+#ifdef _WIN32
+    MEMORYSTATUSEX memorystatus;
+    memorystatus.dwLength = sizeof(memorystatus);
+    GlobalMemoryStatusEx(&memorystatus);
+    mem_info.TotalPhysicalMemory = memorystatus.ullTotalPhys;
+    mem_info.TotalSwapMemory = memorystatus.ullTotalPageFile - mem_info.TotalPhysicalMemory;
+#elif defined(__APPLE__)
+    u64 ramsize;
+    struct xsw_usage vmusage;
+    std::size_t sizeof_ramsize = sizeof(ramsize);
+    std::size_t sizeof_vmusage = sizeof(vmusage);
+    // hw and vm are defined in sysctl.h
+    // https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
+    // sysctlbyname(const char *, void *, size_t *, void *, size_t);
+    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
+    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
+    mem_info.TotalPhysicalMemory = ramsize;
+    mem_info.TotalSwapMemory = vmusage.xsu_total;
+#else
+    struct sysinfo meminfo;
+    sysinfo(&meminfo);
+    mem_info.TotalPhysicalMemory = meminfo.totalram;
+    mem_info.TotalSwapMemory = meminfo.totalswap;
+#endif
+
+    return mem_info;
+}
+
+const MemoryInfo& GetMemInfo() {
+    static MemoryInfo mem_info = Detect();
+    return mem_info;
+}
+
+} // namespace Common
+\ No newline at end of file
diff --git a/src/common/memory_detect.h b/src/common/memory_detect.h
new file mode 100644
index 000000000..a73c0f3f4
--- /dev/null
+++ b/src/common/memory_detect.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Common {
+
+struct MemoryInfo {
+    u64 TotalPhysicalMemory{};
+    u64 TotalSwapMemory{};
+};
+
+/**
+ * Gets the memory info of the host system
+ * @return Reference to a MemoryInfo struct with the physical and swap memory sizes in bytes
+ */
+const MemoryInfo& GetMemInfo();
+
+} // namespace Common
+\ No newline at end of file
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
new file mode 100644
index 000000000..794da8a52
--- /dev/null
+++ b/src/common/x64/xbyak_abi.h
@@ -0,0 +1,266 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <bitset>
+#include <initializer_list>
+#include <xbyak.h>
+#include "common/assert.h"
+
+namespace Common::X64 {
+
+inline int RegToIndex(const Xbyak::Reg& reg) {
+    using Kind = Xbyak::Reg::Kind;
+    ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
+               "RegSet only support GPRs and XMM registers.");
+    ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
+    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
+}
+
+inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+    ASSERT(reg_index < 16);
+    return Xbyak::Reg64(reg_index);
+}
+
+inline Xbyak::Xmm IndexToXmm(int reg_index) {
+    ASSERT(reg_index >= 16 && reg_index < 32);
+    return Xbyak::Xmm(reg_index - 16);
+}
+
+inline Xbyak::Reg IndexToReg(int reg_index) {
+    if (reg_index < 16) {
+        return IndexToReg64(reg_index);
+    } else {
+        return IndexToXmm(reg_index);
+    }
+}
+
+inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
+    std::bitset<32> bits;
+    for (const Xbyak::Reg& reg : regs) {
+        bits[RegToIndex(reg)] = true;
+    }
+    return bits;
+}
+
+const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
+const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
+
+#ifdef _WIN32
+
+// Microsoft x64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rsi,
+    Xbyak::util::rdi,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+    // XMMs
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0x20;
+
+#else
+
+// System V x86-64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::rdi,
+    Xbyak::util::rsi,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0;
+
+#endif
+
+inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                   size_t needed_frame_size, s32* out_subtraction,
+                                   s32* out_xmm_offset) {
+    const auto count = (regs & ABI_ALL_GPRS).count();
+    rsp_alignment -= count * 8;
+    size_t subtraction = 0;
+    const auto xmm_count = (regs & ABI_ALL_XMMS).count();
+    if (xmm_count) {
+        // If we have any XMMs to save, we must align the stack here.
+        subtraction = rsp_alignment & 0xF;
+    }
+    subtraction += 0x10 * xmm_count;
+    size_t xmm_base_subtraction = subtraction;
+    subtraction += needed_frame_size;
+    subtraction += ABI_SHADOW_SPACE;
+    // Final alignment.
+    rsp_alignment -= subtraction;
+    subtraction += rsp_alignment & 0xF;
+
+    *out_subtraction = (s32)subtraction;
+    *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+}
+
+inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(static_cast<int>(i)));
+        }
+    }
+    if (subtraction != 0) {
+        code.sub(code.rsp, subtraction);
+    }
+
+    for (int i = 0; i < regs.count(); i++) {
+        if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
+            code.push(IndexToReg64(i));
+        }
+    }
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
+            xmm_offset += 0x10;
+        }
+    }
+
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
+            xmm_offset += 0x10;
+        }
+    }
+
+    if (subtraction != 0) {
+        code.add(code.rsp, subtraction);
+    }
+
+    // GPRs need to be popped in reverse order
+    for (int i = 15; i >= 0; i--) {
+        if (regs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                                 size_t rsp_alignment,
+                                                 size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(static_cast<int>(i)));
+        }
+    }
+
+    if (subtraction != 0) {
+        code.sub(code.rsp, subtraction);
+    }
+
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    if (subtraction != 0) {
+        code.add(code.rsp, subtraction);
+    }
+
+    // GPRs need to be popped in reverse order
+    for (int i = 15; i >= 0; i--) {
+        if (regs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+} // namespace Common::X64
diff --git a/src/common/x64/xbyak_util.h b/src/common/x64/xbyak_util.h
new file mode 100644
index 000000000..df17f8cbe
--- /dev/null
+++ b/src/common/x64/xbyak_util.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include <xbyak.h>
+#include "common/x64/xbyak_abi.h"
+
+namespace Common::X64 {
+
+// Constants for use with cmpps/cmpss
+enum {
+    CMP_EQ = 0,
+    CMP_LT = 1,
+    CMP_LE = 2,
+    CMP_UNORD = 3,
+    CMP_NEQ = 4,
+    CMP_NLT = 5,
+    CMP_NLE = 6,
+    CMP_ORD = 7,
+};
+
+constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
+    const u64 distance = target - (ref + 5);
+    return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
+}
+
+inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
+    return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
+}
+
+template <typename T>
+inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
+    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
+    size_t addr = reinterpret_cast<size_t>(f);
+    if (IsWithin2G(code, addr)) {
+        code.call(f);
+    } else {
+        // ABI_RETURN is a safe temp register to use before a call
+        code.mov(ABI_RETURN, addr);
+        code.call(ABI_RETURN);
+    }
+}
+
+} // namespace Common::X64
diff --git a/src/core/file_sys/patch_manager.cpp b/src/core/file_sys/patch_manager.cpp
index b93aa6935..c47ff863e 100644
--- a/src/core/file_sys/patch_manager.cpp
+++ b/src/core/file_sys/patch_manager.cpp
@@ -10,6 +10,7 @@
 #include "common/file_util.h"
 #include "common/hex_util.h"
 #include "common/logging/log.h"
+#include "common/string_util.h"
 #include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
@@ -48,6 +49,23 @@ std::string FormatTitleVersion(u32 version, TitleVersionFormat format) {
     return fmt::format("v{}.{}.{}", bytes[3], bytes[2], bytes[1]);
 }
 
+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name) {
+#ifdef _WIN32
+    return dir->GetSubdirectory(name);
+#else
+    const auto subdirs = dir->GetSubdirectories();
+    for (const auto& subdir : subdirs) {
+        std::string dir_name = Common::ToLower(subdir->GetName());
+        if (dir_name == name) {
+            return subdir;
+        }
+    }
+
+    return nullptr;
+#endif
+}
+
 PatchManager::PatchManager(u64 title_id) : title_id(title_id) {}
 
 PatchManager::~PatchManager() = default;
@@ -104,7 +122,7 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
             if (std::find(disabled.begin(), disabled.end(), subdir->GetName()) != disabled.end())
                 continue;
 
-            auto exefs_dir = subdir->GetSubdirectory("exefs");
+            auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
             if (exefs_dir != nullptr)
                 layers.push_back(std::move(exefs_dir));
         }
@@ -130,7 +148,7 @@ std::vector<VirtualFile> PatchManager::CollectPatches(const std::vector<VirtualD
         if (std::find(disabled.cbegin(), disabled.cend(), subdir->GetName()) != disabled.cend())
             continue;
 
-        auto exefs_dir = subdir->GetSubdirectory("exefs");
+        auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
         if (exefs_dir != nullptr) {
             for (const auto& file : exefs_dir->GetFiles()) {
                 if (file->GetExtension() == "ips") {
@@ -295,7 +313,7 @@ std::vector<Core::Memory::CheatEntry> PatchManager::CreateCheatList(
             continue;
         }
 
-        auto cheats_dir = subdir->GetSubdirectory("cheats");
+        auto cheats_dir = FindSubdirectoryCaseless(subdir, "cheats");
         if (cheats_dir != nullptr) {
             auto res = ReadCheatFileFromFolder(system, title_id, build_id_, cheats_dir, true);
             if (res.has_value()) {
@@ -340,11 +358,11 @@ static void ApplyLayeredFS(VirtualFile& romfs, u64 title_id, ContentRecordType t
             continue;
         }
 
-        auto romfs_dir = subdir->GetSubdirectory("romfs");
+        auto romfs_dir = FindSubdirectoryCaseless(subdir, "romfs");
         if (romfs_dir != nullptr)
             layers.push_back(std::move(romfs_dir));
 
-        auto ext_dir = subdir->GetSubdirectory("romfs_ext");
+        auto ext_dir = FindSubdirectoryCaseless(subdir, "romfs_ext");
         if (ext_dir != nullptr)
             layers_ext.push_back(std::move(ext_dir));
     }
@@ -470,7 +488,7 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
         for (const auto& mod : mod_dir->GetSubdirectories()) {
             std::string types;
 
-            const auto exefs_dir = mod->GetSubdirectory("exefs");
+            const auto exefs_dir = FindSubdirectoryCaseless(mod, "exefs");
             if (IsDirValidAndNonEmpty(exefs_dir)) {
                 bool ips = false;
                 bool ipswitch = false;
@@ -494,9 +512,9 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
                 if (layeredfs)
                     AppendCommaIfNotEmpty(types, "LayeredExeFS");
             }
-            if (IsDirValidAndNonEmpty(mod->GetSubdirectory("romfs")))
+            if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "romfs")))
                 AppendCommaIfNotEmpty(types, "LayeredFS");
-            if (IsDirValidAndNonEmpty(mod->GetSubdirectory("cheats")))
+            if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "cheats")))
                 AppendCommaIfNotEmpty(types, "Cheats");
 
             if (types.empty())
diff --git a/src/core/file_sys/patch_manager.h b/src/core/file_sys/patch_manager.h
index ec6db524d..f4cb918dd 100644
--- a/src/core/file_sys/patch_manager.h
+++ b/src/core/file_sys/patch_manager.h
@@ -29,6 +29,11 @@ enum class TitleVersionFormat : u8 {
 std::string FormatTitleVersion(u32 version,
                                TitleVersionFormat format = TitleVersionFormat::ThreeElements);
 
+// Returns a directory with name matching name case-insensitive. Returns nullptr if directory
+// doesn't have a directory with name.
+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name);
+
 // A centralized class to manage patches to games.
 class PatchManager {
 public:
diff --git a/src/core/file_sys/system_archive/system_version.cpp b/src/core/file_sys/system_archive/system_version.cpp
index 6e22f97b0..aa313de66 100644
--- a/src/core/file_sys/system_archive/system_version.cpp
+++ b/src/core/file_sys/system_archive/system_version.cpp
@@ -12,17 +12,17 @@ namespace SystemVersionData {
 // This section should reflect the best system version to describe yuzu's HLE api.
 // TODO(DarkLordZach): Update when HLE gets better.
 
-constexpr u8 VERSION_MAJOR = 5;
-constexpr u8 VERSION_MINOR = 1;
-constexpr u8 VERSION_MICRO = 0;
+constexpr u8 VERSION_MAJOR = 10;
+constexpr u8 VERSION_MINOR = 0;
+constexpr u8 VERSION_MICRO = 2;
 
-constexpr u8 REVISION_MAJOR = 3;
+constexpr u8 REVISION_MAJOR = 1;
 constexpr u8 REVISION_MINOR = 0;
 
 constexpr char PLATFORM_STRING[] = "NX";
-constexpr char VERSION_HASH[] = "23f9df53e25709d756e0c76effcb2473bd3447dd";
-constexpr char DISPLAY_VERSION[] = "5.1.0";
-constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 5.1.0-3.0";
+constexpr char VERSION_HASH[] = "f90143fa8bbc061d4f68c35f95f04f8080c0ecdc";
+constexpr char DISPLAY_VERSION[] = "10.0.2";
+constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 10.0.2-1.0";
 
 } // namespace SystemVersionData
 
diff --git a/src/core/hle/service/hid/controllers/keyboard.cpp b/src/core/hle/service/hid/controllers/keyboard.cpp
index 358cb9329..9a8d354ba 100644
--- a/src/core/hle/service/hid/controllers/keyboard.cpp
+++ b/src/core/hle/service/hid/controllers/keyboard.cpp
@@ -38,10 +38,11 @@ void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing,
     cur_entry.sampling_number = last_entry.sampling_number + 1;
     cur_entry.sampling_number2 = cur_entry.sampling_number;
 
+    cur_entry.key.fill(0);
+    cur_entry.modifier = 0;
+
     for (std::size_t i = 0; i < keyboard_keys.size(); ++i) {
-        for (std::size_t k = 0; k < KEYS_PER_BYTE; ++k) {
-            cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << k);
-        }
+        cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << (i % KEYS_PER_BYTE));
     }
 
     for (std::size_t i = 0; i < keyboard_mods.size(); ++i) {
diff --git a/src/core/hle/service/nifm/nifm.cpp b/src/core/hle/service/nifm/nifm.cpp
index 767158444..01ddcdbd6 100644
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -177,7 +177,8 @@ private:
     void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
         LOG_DEBUG(Service_NIFM, "called");
 
-        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size");
+        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
+                   "SfNetworkProfileData is not the correct size");
         u128 uuid{};
         auto buffer = ctx.ReadBuffer();
         std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index da53cde05..4edff9cd8 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -112,6 +112,7 @@ void LogSettings() {
     LogSetting("Renderer_UseAsynchronousGpuEmulation",
                Settings::values.use_asynchronous_gpu_emulation);
     LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
+    LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
     LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
     LogSetting("Audio_OutputEngine", Settings::values.sink_id);
     LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
diff --git a/src/core/settings.h b/src/core/settings.h
index c1266b341..78eb33737 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -446,6 +446,7 @@ struct Values {
     GPUAccuracy gpu_accuracy;
     bool use_asynchronous_gpu_emulation;
     bool use_vsync;
+    bool use_assembly_shaders;
     bool force_30fps_mode;
     bool use_fast_gpu_time;
 
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index 1c3b03a1c..c781b3cfc 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
     AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
              Settings::values.use_asynchronous_gpu_emulation);
     AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
+    AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
     AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
 }
 
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d23c53843..d6ee82836 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_library(video_core STATIC
     buffer_cache/buffer_block.h
     buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.cpp
     buffer_cache/map_interval.h
     dirty_flags.cpp
     dirty_flags.h
@@ -228,7 +229,7 @@ endif()
 create_target_directory_groups(video_core)
 
 target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad)
+target_link_libraries(video_core PRIVATE glad xbyak)
 
 if (ENABLE_VULKAN)
     target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 56e570994..d9a4a1b4d 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -12,11 +12,12 @@
 #include <utility>
 #include <vector>
 
-#include <boost/icl/interval_map.hpp>
+#include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_set.hpp>
-#include <boost/range/iterator_range.hpp>
+#include <boost/intrusive/set.hpp>
 
 #include "common/alignment.h"
+#include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/core.h"
@@ -29,10 +30,12 @@
 
 namespace VideoCommon {
 
-using MapInterval = std::shared_ptr<MapIntervalBase>;
-
 template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
 class BufferCache {
+    using IntervalSet = boost::icl::interval_set<VAddr>;
+    using IntervalType = typename IntervalSet::interval_type;
+    using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+
 public:
     using BufferInfo = std::pair<BufferType, u64>;
 
@@ -40,14 +43,12 @@ public:
                             bool is_written = false, bool use_fast_cbuf = false) {
         std::lock_guard lock{mutex};
 
-        const std::optional<VAddr> cpu_addr_opt =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
-
+        const auto& memory_manager = system.GPU().MemoryManager();
+        const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
         if (!cpu_addr_opt) {
             return {GetEmptyBuffer(size), 0};
         }
-
-        VAddr cpu_addr = *cpu_addr_opt;
+        const VAddr cpu_addr = *cpu_addr_opt;
 
         // Cache management is a big overhead, so only cache entries with a given size.
         // TODO: Figure out which size is the best for given games.
@@ -77,16 +78,19 @@ public:
             }
         }
 
-        auto block = GetBlock(cpu_addr, size);
-        auto map = MapAddress(block, gpu_addr, cpu_addr, size);
+        OwnerBuffer block = GetBlock(cpu_addr, size);
+        MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
+        if (!map) {
+            return {GetEmptyBuffer(size), 0};
+        }
         if (is_written) {
             map->MarkAsModified(true, GetModifiedTicks());
             if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
                 MarkForAsyncFlush(map);
             }
-            if (!map->IsWritten()) {
-                map->MarkAsWritten(true);
-                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+            if (!map->is_written) {
+                map->is_written = true;
+                MarkRegionAsWritten(map->start, map->end - 1);
             }
         }
 
@@ -132,12 +136,11 @@ public:
     void FlushRegion(VAddr addr, std::size_t size) {
         std::lock_guard lock{mutex};
 
-        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
-            return a->GetModificationTick() < b->GetModificationTick();
-        });
-        for (auto& object : objects) {
-            if (object->IsModified() && object->IsRegistered()) {
+        VectorMapInterval objects = GetMapsInRange(addr, size);
+        std::sort(objects.begin(), objects.end(),
+                  [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
+        for (MapInterval* object : objects) {
+            if (object->is_modified && object->is_registered) {
                 mutex.unlock();
                 FlushMap(object);
                 mutex.lock();
@@ -148,9 +151,9 @@ public:
     bool MustFlushRegion(VAddr addr, std::size_t size) {
         std::lock_guard lock{mutex};
 
-        const std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) {
-            return map->IsModified() && map->IsRegistered();
+        const VectorMapInterval objects = GetMapsInRange(addr, size);
+        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
+            return map->is_modified && map->is_registered;
         });
     }
 
@@ -158,9 +161,8 @@ public:
     void InvalidateRegion(VAddr addr, u64 size) {
         std::lock_guard lock{mutex};
 
-        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        for (auto& object : objects) {
-            if (object->IsRegistered()) {
+        for (auto& object : GetMapsInRange(addr, size)) {
+            if (object->is_registered) {
                 Unregister(object);
             }
         }
@@ -169,10 +171,10 @@ public:
     void OnCPUWrite(VAddr addr, std::size_t size) {
         std::lock_guard lock{mutex};
 
-        for (const auto& object : GetMapsInRange(addr, size)) {
-            if (object->IsMemoryMarked() && object->IsRegistered()) {
+        for (MapInterval* object : GetMapsInRange(addr, size)) {
+            if (object->is_memory_marked && object->is_registered) {
                 UnmarkMemory(object);
-                object->SetSyncPending(true);
+                object->is_sync_pending = true;
                 marked_for_unregister.emplace_back(object);
             }
         }
@@ -181,9 +183,9 @@ public:
     void SyncGuestHost() {
         std::lock_guard lock{mutex};
 
-        for (const auto& object : marked_for_unregister) {
-            if (object->IsRegistered()) {
-                object->SetSyncPending(false);
+        for (auto& object : marked_for_unregister) {
+            if (object->is_registered) {
+                object->is_sync_pending = false;
                 Unregister(object);
             }
         }
@@ -192,9 +194,9 @@ public:
 
     void CommitAsyncFlushes() {
         if (uncommitted_flushes) {
-            auto commit_list = std::make_shared<std::list<MapInterval>>();
-            for (auto& map : *uncommitted_flushes) {
-                if (map->IsRegistered() && map->IsModified()) {
+            auto commit_list = std::make_shared<std::list<MapInterval*>>();
+            for (MapInterval* map : *uncommitted_flushes) {
+                if (map->is_registered && map->is_modified) {
                     // TODO(Blinkhawk): Implement backend asynchronous flushing
                     // AsyncFlushMap(map)
                     commit_list->push_back(map);
@@ -228,8 +230,8 @@ public:
             committed_flushes.pop_front();
             return;
         }
-        for (MapInterval& map : *flush_list) {
-            if (map->IsRegistered()) {
+        for (MapInterval* map : *flush_list) {
+            if (map->is_registered) {
                 // TODO(Blinkhawk): Replace this for reading the asynchronous flush
                 FlushMap(map);
             }
@@ -265,61 +267,60 @@ protected:
     }
 
     /// Register an object into the cache
-    void Register(const MapInterval& new_map, bool inherit_written = false) {
-        const VAddr cpu_addr = new_map->GetStart();
+    MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
+        const VAddr cpu_addr = new_map.start;
         if (!cpu_addr) {
             LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
-                         new_map->GetGpuAddress());
-            return;
+                         new_map.gpu_addr);
+            return nullptr;
         }
-        const std::size_t size = new_map->GetEnd() - new_map->GetStart();
-        new_map->MarkAsRegistered(true);
-        const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
-        mapped_addresses.insert({interval, new_map});
+        const std::size_t size = new_map.end - new_map.start;
+        new_map.is_registered = true;
         rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
-        new_map->SetMemoryMarked(true);
+        new_map.is_memory_marked = true;
         if (inherit_written) {
-            MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
-            new_map->MarkAsWritten(true);
+            MarkRegionAsWritten(new_map.start, new_map.end - 1);
+            new_map.is_written = true;
         }
+        MapInterval* const storage = mapped_addresses_allocator.Allocate();
+        *storage = new_map;
+        mapped_addresses.insert(*storage);
+        return storage;
     }
 
-    void UnmarkMemory(const MapInterval& map) {
-        if (!map->IsMemoryMarked()) {
+    void UnmarkMemory(MapInterval* map) {
+        if (!map->is_memory_marked) {
             return;
         }
-        const std::size_t size = map->GetEnd() - map->GetStart();
-        rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1);
-        map->SetMemoryMarked(false);
+        const std::size_t size = map->end - map->start;
+        rasterizer.UpdatePagesCachedCount(map->start, size, -1);
+        map->is_memory_marked = false;
     }
 
     /// Unregisters an object from the cache
-    void Unregister(const MapInterval& map) {
+    void Unregister(MapInterval* map) {
         UnmarkMemory(map);
-        map->MarkAsRegistered(false);
-        if (map->IsSyncPending()) {
+        map->is_registered = false;
+        if (map->is_sync_pending) {
+            map->is_sync_pending = false;
             marked_for_unregister.remove(map);
-            map->SetSyncPending(false);
         }
-        if (map->IsWritten()) {
-            UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+        if (map->is_written) {
+            UnmarkRegionAsWritten(map->start, map->end - 1);
         }
-        const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
-        mapped_addresses.erase(delete_interval);
+        const auto it = mapped_addresses.find(*map);
+        ASSERT(it != mapped_addresses.end());
+        mapped_addresses.erase(it);
+        mapped_addresses_allocator.Release(map);
     }
 
 private:
-    MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) {
-        return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
-    }
-
-    MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
-                           const std::size_t size) {
-        std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
+    MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
+                            std::size_t size) {
+        const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
         if (overlaps.empty()) {
             auto& memory_manager = system.GPU().MemoryManager();
             const VAddr cpu_addr_end = cpu_addr + size;
-            MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
             if (memory_manager.IsGranularRange(gpu_addr, size)) {
                 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
                 UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
@@ -328,13 +329,12 @@ private:
                 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
                 UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
             }
-            Register(new_map);
-            return new_map;
+            return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
         }
 
         const VAddr cpu_addr_end = cpu_addr + size;
         if (overlaps.size() == 1) {
-            MapInterval& current_map = overlaps[0];
+            MapInterval* const current_map = overlaps[0];
             if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
                 return current_map;
             }
@@ -344,35 +344,39 @@ private:
         bool write_inheritance = false;
         bool modified_inheritance = false;
         // Calculate new buffer parameters
-        for (auto& overlap : overlaps) {
-            new_start = std::min(overlap->GetStart(), new_start);
-            new_end = std::max(overlap->GetEnd(), new_end);
-            write_inheritance |= overlap->IsWritten();
-            modified_inheritance |= overlap->IsModified();
+        for (MapInterval* overlap : overlaps) {
+            new_start = std::min(overlap->start, new_start);
+            new_end = std::max(overlap->end, new_end);
+            write_inheritance |= overlap->is_written;
+            modified_inheritance |= overlap->is_modified;
         }
         GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
         for (auto& overlap : overlaps) {
             Unregister(overlap);
         }
         UpdateBlock(block, new_start, new_end, overlaps);
-        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
+
+        const MapInterval new_map{new_start, new_end, new_gpu_addr};
+        MapInterval* const map = Register(new_map, write_inheritance);
+        if (!map) {
+            return nullptr;
+        }
         if (modified_inheritance) {
-            new_map->MarkAsModified(true, GetModifiedTicks());
+            map->MarkAsModified(true, GetModifiedTicks());
             if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
-                MarkForAsyncFlush(new_map);
+                MarkForAsyncFlush(map);
             }
         }
-        Register(new_map, write_inheritance);
-        return new_map;
+        return map;
     }
 
     void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
-                     std::vector<MapInterval>& overlaps) {
+                     const VectorMapInterval& overlaps) {
         const IntervalType base_interval{start, end};
         IntervalSet interval_set{};
         interval_set.add(base_interval);
         for (auto& overlap : overlaps) {
-            const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
+            const IntervalType subtract{overlap->start, overlap->end};
             interval_set.subtract(subtract);
         }
         for (auto& interval : interval_set) {
@@ -386,18 +390,24 @@ private:
         }
     }
 
-    std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) {
+    VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
+        VectorMapInterval result;
         if (size == 0) {
-            return {};
+            return result;
         }
 
-        std::vector<MapInterval> objects{};
-        const IntervalType interval{addr, addr + size};
-        for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
-            objects.push_back(pair.second);
+        const VAddr addr_end = addr + size;
+        auto it = mapped_addresses.lower_bound(addr);
+        if (it != mapped_addresses.begin()) {
+            --it;
         }
-
-        return objects;
+        while (it != mapped_addresses.end() && it->start < addr_end) {
+            if (it->Overlaps(addr, addr_end)) {
+                result.push_back(&*it);
+            }
+            ++it;
+        }
+        return result;
     }
 
     /// Returns a ticks counter used for tracking when cached objects were last modified
@@ -405,12 +415,12 @@ private:
         return ++modified_ticks;
     }
 
-    void FlushMap(MapInterval map) {
-        std::size_t size = map->GetEnd() - map->GetStart();
-        OwnerBuffer block = blocks[map->GetStart() >> block_page_bits];
+    void FlushMap(MapInterval* map) {
+        const std::size_t size = map->end - map->start;
+        OwnerBuffer block = blocks[map->start >> block_page_bits];
         staging_buffer.resize(size);
-        DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data());
-        system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size);
+        DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
+        system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
         map->MarkAsModified(false, 0);
     }
 
@@ -515,7 +525,7 @@ private:
             } else {
                 written_pages[page_start] = 1;
             }
-            page_start++;
+            ++page_start;
         }
     }
 
@@ -531,7 +541,7 @@ private:
                     written_pages.erase(it);
                 }
             }
-            page_start++;
+            ++page_start;
         }
     }
 
@@ -542,14 +552,14 @@ private:
             if (written_pages.count(page_start) > 0) {
                 return true;
             }
-            page_start++;
+            ++page_start;
         }
         return false;
     }
 
-    void MarkForAsyncFlush(MapInterval& map) {
+    void MarkForAsyncFlush(MapInterval* map) {
         if (!uncommitted_flushes) {
-            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>();
+            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
         }
         uncommitted_flushes->insert(map);
     }
@@ -566,10 +576,9 @@ private:
     u64 buffer_offset = 0;
     u64 buffer_offset_base = 0;
 
-    using IntervalSet = boost::icl::interval_set<VAddr>;
-    using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>;
-    using IntervalType = typename IntervalCache::interval_type;
-    IntervalCache mapped_addresses;
+    MapIntervalAllocator mapped_addresses_allocator;
+    boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
+        mapped_addresses;
 
     static constexpr u64 write_page_bit = 11;
     std::unordered_map<u64, u32> written_pages;
@@ -583,10 +592,10 @@ private:
     u64 modified_ticks = 0;
 
     std::vector<u8> staging_buffer;
-    std::list<MapInterval> marked_for_unregister;
+    std::list<MapInterval*> marked_for_unregister;
 
-    std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{};
-    std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes;
+    std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
+    std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
 
     std::recursive_mutex mutex;
 };
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
new file mode 100644
index 000000000..62587e18a
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.cpp
@@ -0,0 +1,33 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "video_core/buffer_cache/map_interval.h"
+
+namespace VideoCommon {
+
+MapIntervalAllocator::MapIntervalAllocator() {
+    FillFreeList(first_chunk);
+}
+
+MapIntervalAllocator::~MapIntervalAllocator() = default;
+
+void MapIntervalAllocator::AllocateNewChunk() {
+    *new_chunk = std::make_unique<Chunk>();
+    FillFreeList(**new_chunk);
+    new_chunk = &(*new_chunk)->next;
+}
+
+void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
+    const std::size_t old_size = free_list.size();
+    free_list.resize(old_size + chunk.data.size());
+    std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
+                   [](MapInterval& interval) { return &interval; });
+}
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
index 29d8b26f3..fe0bcd1d8 100644
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -4,104 +4,89 @@
 
 #pragma once
 
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <boost/intrusive/set_hook.hpp>
+
 #include "common/common_types.h"
 #include "video_core/gpu.h"
 
 namespace VideoCommon {
 
-class MapIntervalBase {
-public:
-    MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
-        : start{start}, end{end}, gpu_addr{gpu_addr} {}
-
-    void SetCpuAddress(VAddr new_cpu_addr) {
-        cpu_addr = new_cpu_addr;
-    }
-
-    VAddr GetCpuAddress() const {
-        return cpu_addr;
-    }
-
-    GPUVAddr GetGpuAddress() const {
-        return gpu_addr;
-    }
-
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
-        return (start <= other_start && other_end <= end);
-    }
-
-    bool operator==(const MapIntervalBase& rhs) const {
-        return std::tie(start, end) == std::tie(rhs.start, rhs.end);
-    }
-
-    bool operator!=(const MapIntervalBase& rhs) const {
-        return !operator==(rhs);
-    }
+struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
+    MapInterval() = default;
 
-    void MarkAsRegistered(const bool registered) {
-        is_registered = registered;
-    }
+    /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
 
-    bool IsRegistered() const {
-        return is_registered;
-    }
+    explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
+        : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
 
-    void SetMemoryMarked(bool is_memory_marked_) {
-        is_memory_marked = is_memory_marked_;
+    bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
+        return start <= other_start && other_end <= end;
     }
 
-    bool IsMemoryMarked() const {
-        return is_memory_marked;
+    bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
+        return start < other_end && other_start < end;
     }
 
-    void SetSyncPending(bool is_sync_pending_) {
-        is_sync_pending = is_sync_pending_;
-    }
+    void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
+        is_modified = is_modified_;
+        ticks = ticks_;
+    }
+
+    boost::intrusive::set_member_hook<> member_hook_;
+    VAddr start = 0;
+    VAddr end = 0;
+    GPUVAddr gpu_addr = 0;
+    u64 ticks = 0;
+    bool is_written = false;
+    bool is_modified = false;
+    bool is_registered = false;
+    bool is_memory_marked = false;
+    bool is_sync_pending = false;
+};
 
-    bool IsSyncPending() const {
-        return is_sync_pending;
+struct MapIntervalCompare {
+    constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
+        return lhs.start < rhs.start;
     }
+};
 
-    VAddr GetStart() const {
-        return start;
-    }
+class MapIntervalAllocator {
+public:
+    MapIntervalAllocator();
+    ~MapIntervalAllocator();
 
-    VAddr GetEnd() const {
-        return end;
+    MapInterval* Allocate() {
+        if (free_list.empty()) {
+            AllocateNewChunk();
+        }
+        MapInterval* const interval = free_list.back();
+        free_list.pop_back();
+        return interval;
     }
 
-    void MarkAsModified(const bool is_modified_, const u64 tick) {
-        is_modified = is_modified_;
-        ticks = tick;
+    void Release(MapInterval* interval) {
+        free_list.push_back(interval);
     }
 
-    bool IsModified() const {
-        return is_modified;
-    }
+private:
+    struct Chunk {
+        std::unique_ptr<Chunk> next;
+        std::array<MapInterval, 0x8000> data;
+    };
 
-    u64 GetModificationTick() const {
-        return ticks;
-    }
+    void AllocateNewChunk();
 
-    void MarkAsWritten(const bool is_written_) {
-        is_written = is_written_;
-    }
+    void FillFreeList(Chunk& chunk);
 
-    bool IsWritten() const {
-        return is_written;
-    }
+    std::vector<MapInterval*> free_list;
+    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
 
-private:
-    VAddr start;
-    VAddr end;
-    GPUVAddr gpu_addr;
-    VAddr cpu_addr{};
-    bool is_written{};
-    bool is_modified{};
-    bool is_registered{};
-    bool is_memory_marked{};
-    bool is_sync_pending{};
-    u64 ticks{};
+    Chunk first_chunk;
 };
 
 } // namespace VideoCommon
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 024c9e43b..004f6b261 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -457,8 +457,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 
 void Maxwell3D::ProcessQueryGet() {
     // TODO(Subv): Support the other query units.
-    ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
-               "Units other than CROP are unimplemented");
+    if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
+        LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
+    }
 
     switch (regs.query.query_get.operation) {
     case Regs::QueryOperation::Release:
@@ -534,8 +535,8 @@ void Maxwell3D::ProcessCounterReset() {
         rasterizer.ResetCounter(QueryType::SamplesPassed);
         break;
     default:
-        LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
-                    static_cast<int>(regs.counter_reset));
+        LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
+                  static_cast<int>(regs.counter_reset));
         break;
     }
 }
@@ -592,8 +593,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
                          system.GPU().GetTicks());
         return {};
     default:
-        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
-                          static_cast<u32>(regs.query.query_get.select.Value()));
+        LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
+                  static_cast<u32>(regs.query.query_get.select.Value()));
         return 1;
     }
 }
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index 22987751e..096ee337c 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -56,9 +56,27 @@ public:
         last_modified_ticks = cache.GetModifiedTicks();
     }
 
+    void SetMemoryMarked(bool is_memory_marked_) {
+        is_memory_marked = is_memory_marked_;
+    }
+
+    bool IsMemoryMarked() const {
+        return is_memory_marked;
+    }
+
+    void SetSyncPending(bool is_sync_pending_) {
+        is_sync_pending = is_sync_pending_;
+    }
+
+    bool IsSyncPending() const {
+        return is_sync_pending;
+    }
+
 private:
     bool is_registered{};      ///< Whether the object is currently registered with the cache
     bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)
+    bool is_memory_marked{};   ///< Whether the object is marking rasterizer memory.
+    bool is_sync_pending{};    ///< Whether the object is pending deletion.
     u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
     VAddr cpu_addr{};          ///< Cpu address memory, unique from emulated virtual address space
 };
@@ -94,6 +112,30 @@ public:
         }
     }
 
+    void OnCPUWrite(VAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+
+        for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
+            if (object->IsRegistered()) {
+                UnmarkMemory(object);
+                object->SetSyncPending(true);
+                marked_for_unregister.emplace_back(object);
+            }
+        }
+    }
+
+    void SyncGuestHost() {
+        std::lock_guard lock{mutex};
+
+        for (const auto& object : marked_for_unregister) {
+            if (object->IsRegistered()) {
+                object->SetSyncPending(false);
+                Unregister(object);
+            }
+        }
+        marked_for_unregister.clear();
+    }
+
     /// Invalidates everything in the cache
     void InvalidateAll() {
         std::lock_guard lock{mutex};
@@ -120,19 +162,32 @@ protected:
         interval_cache.add({GetInterval(object), ObjectSet{object}});
         map_cache.insert({object->GetCpuAddr(), object});
         rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
+        object->SetMemoryMarked(true);
     }
 
     /// Unregisters an object from the cache
     virtual void Unregister(const T& object) {
         std::lock_guard lock{mutex};
 
+        UnmarkMemory(object);
         object->SetIsRegistered(false);
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
+        if (object->IsSyncPending()) {
+            marked_for_unregister.remove(object);
+            object->SetSyncPending(false);
+        }
         const VAddr addr = object->GetCpuAddr();
         interval_cache.subtract({GetInterval(object), ObjectSet{object}});
         map_cache.erase(addr);
     }
 
+    void UnmarkMemory(const T& object) {
+        if (!object->IsMemoryMarked()) {
+            return;
+        }
+        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
+        object->SetMemoryMarked(false);
+    }
+
     /// Returns a ticks counter used for tracking when cached objects were last modified
     u64 GetModifiedTicks() {
         std::lock_guard lock{mutex};
@@ -194,4 +249,5 @@ private:
     IntervalCache interval_cache; ///< Cache of objects
     u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
     VideoCore::RasterizerInterface& rasterizer;
+    std::list<T> marked_for_unregister;
 };
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index d2cab50bd..9964ea894 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,6 +8,7 @@
 
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index d83dca25a..e1b245288 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -13,6 +13,7 @@
 
 #include "common/logging/log.h"
 #include "common/scope_exit.h"
+#include "core/settings.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
@@ -165,8 +166,6 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
-    const bool is_intel = vendor == "Intel";
-    const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
 
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -181,12 +180,17 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_broken_compute = is_intel_proprietary;
     has_fast_buffer_sub_data = is_nvidia;
+    use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
+                           GLAD_GL_NV_compute_program5;
 
     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
     LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
     LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
+
+    if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
+        LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
+    }
 }
 
 Device::Device(std::nullptr_t) {
@@ -199,7 +203,6 @@ Device::Device(std::nullptr_t) {
     has_image_load_formatted = true;
     has_variable_aoffi = true;
     has_component_indexing_bug = false;
-    has_broken_compute = false;
     has_precise_bug = false;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index a55050cb5..683ed9002 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -80,14 +80,14 @@ public:
         return has_precise_bug;
     }
 
-    bool HasBrokenCompute() const {
-        return has_broken_compute;
-    }
-
     bool HasFastBufferSubData() const {
         return has_fast_buffer_sub_data;
     }
 
+    bool UseAssemblyShaders() const {
+        return use_assembly_shaders;
+    }
+
 private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
@@ -105,8 +105,8 @@ private:
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
     bool has_precise_bug{};
-    bool has_broken_compute{};
     bool has_fast_buffer_sub_data{};
+    bool use_assembly_shaders{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
index 99ddcb3f8..ec5421afa 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -4,6 +4,7 @@
 
 #include "common/assert.h"
 
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_fence_manager.h"
 
 namespace OpenGL {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 69dcf952f..61cf99b9d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -94,17 +94,30 @@ void oglEnable(GLenum cap, bool state) {
 } // Anonymous namespace
 
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                                   ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                                   StateTracker& state_tracker)
-    : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker},
+                                   const Device& device, ScreenInfo& info,
+                                   ProgramManager& program_manager, StateTracker& state_tracker)
+    : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
+                                                                            state_tracker},
       shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
       buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
       fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
       screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
     CheckExtensions();
+
+    if (device.UseAssemblyShaders()) {
+        glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+        for (const GLuint cbuf : staging_cbufs) {
+            glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
+                                 nullptr, 0);
+        }
+    }
 }
 
-RasterizerOpenGL::~RasterizerOpenGL() {}
+RasterizerOpenGL::~RasterizerOpenGL() {
+    if (device.UseAssemblyShaders()) {
+        glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+    }
+}
 
 void RasterizerOpenGL::CheckExtensions() {
     if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -230,6 +243,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
     MICROPROFILE_SCOPE(OpenGL_Shader);
     auto& gpu = system.GPU().Maxwell3D();
+    std::size_t num_ssbos = 0;
     u32 clip_distances = 0;
 
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -261,6 +275,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
 
         Shader shader{shader_cache.GetStageProgram(program)};
 
+        if (device.UseAssemblyShaders()) {
+            // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
+            // all stages share the same bindings.
+            const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
+            ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
+            num_ssbos += num_stage_ssbos;
+        }
+
         // Stage indices are 0 - 5
         const std::size_t stage = index == 0 ? 0 : index - 1;
         SetupDrawConstBuffers(stage, shader);
@@ -526,6 +548,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     SyncFramebufferSRGB();
 
     buffer_cache.Acquire();
+    current_cbuf = 0;
 
     std::size_t buffer_size = CalculateVertexArraysSize();
 
@@ -535,9 +558,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     }
 
     // Uniform space for the 5 shader stages
-    buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
-                  (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
-                      Maxwell::MaxShaderStage;
+    buffer_size =
+        Common::AlignUp<std::size_t>(buffer_size, 4) +
+        (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
 
     // Add space for at least 18 constant buffers
     buffer_size += Maxwell::MaxConstBuffers *
@@ -558,12 +581,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     }
 
     // Setup emulation uniform buffer.
-    GLShader::MaxwellUniformData ubo;
-    ubo.SetFromRegs(gpu);
-    const auto [buffer, offset] =
-        buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-    glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
-                      static_cast<GLsizeiptr>(sizeof(ubo)));
+    if (!device.UseAssemblyShaders()) {
+        MaxwellUniformData ubo;
+        ubo.SetFromRegs(gpu);
+        const auto [buffer, offset] =
+            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+                          static_cast<GLsizeiptr>(sizeof(ubo)));
+    }
 
     // Setup shaders and their used resources.
     texture_cache.GuardSamplers(true);
@@ -630,16 +655,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 }
 
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    if (device.HasBrokenCompute()) {
-        return;
-    }
-
     buffer_cache.Acquire();
+    current_cbuf = 0;
 
     auto kernel = shader_cache.GetComputeKernel(code_addr);
     SetupComputeTextures(kernel);
     SetupComputeImages(kernel);
-    program_manager.BindComputeShader(kernel->GetHandle());
 
     const std::size_t buffer_size =
         Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -652,6 +673,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     buffer_cache.Unmap();
 
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    program_manager.BindCompute(kernel->GetHandle());
     glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
     ++num_queued_commands;
 }
@@ -701,15 +723,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
         return;
     }
     texture_cache.OnCPUWrite(addr, size);
-    shader_cache.InvalidateRegion(addr, size);
+    shader_cache.OnCPUWrite(addr, size);
     buffer_cache.OnCPUWrite(addr, size);
-    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerOpenGL::SyncGuestHost() {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
     texture_cache.SyncGuestHost();
     buffer_cache.SyncGuestHost();
+    shader_cache.SyncGuestHost();
 }
 
 void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -812,14 +834,20 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 }
 
 void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+    static constexpr std::array PARAMETER_LUT = {
+        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
+
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
     const auto& shader_stage = stages[stage_index];
 
-    u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
+    u32 binding =
+        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
     for (const auto& entry : shader->GetEntries().const_buffers) {
         const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
     }
 }
 
@@ -835,16 +863,21 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
         buffer.address = config.Address();
         buffer.size = config.size;
         buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
     }
 }
 
-void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
+void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
+                                        const Tegra::Engines::ConstBufferInfo& buffer,
                                         const ConstBufferEntry& entry) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
-                          sizeof(float));
+        if (device.UseAssemblyShaders()) {
+            glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
+        } else {
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
+                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+        }
         return;
     }
 
@@ -853,9 +886,19 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
     const auto alignment = device.GetUniformBufferAlignment();
-    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                          device.HasFastBufferSubData());
-    glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
+                                                    device.HasFastBufferSubData());
+    if (!device.UseAssemblyShaders()) {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+        return;
+    }
+    if (offset != 0) {
+        const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+        glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
+        cbuf = staging_cbuf;
+        offset = 0;
+    }
+    glBindBufferRangeNV(stage, binding, cbuf, offset, size);
 }
 
 void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
@@ -863,7 +906,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
 
-    u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
+    u32 binding =
+        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
     for (const auto& entry : shader->GetEntries().global_memory_entries) {
         const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
         const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
@@ -929,16 +973,12 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
         glBindTextureUnit(binding, 0);
         return;
     }
-    glBindTextureUnit(binding, view->GetTexture());
-
-    if (view->GetSurfaceParams().IsBuffer()) {
-        return;
+    const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
+                                           texture.tic.z_source, texture.tic.w_source);
+    glBindTextureUnit(binding, handle);
+    if (!view->GetSurfaceParams().IsBuffer()) {
+        glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
     }
-    // Apply swizzle to textures that are not buffers.
-    view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
-                       texture.tic.w_source);
-
-    glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
 }
 
 void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
@@ -967,14 +1007,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
         glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
         return;
     }
-    if (!tic.IsBuffer()) {
-        view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
-    }
     if (entry.is_written) {
         view->MarkAsModified(texture_cache.Tick());
     }
-    glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
-                       view->GetFormat());
+    const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
 }
 
 void RasterizerOpenGL::SyncViewport() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index b94c65907..87f7fe159 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -56,8 +56,8 @@ struct DrawParameters;
 class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
     explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                              ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                              StateTracker& state_tracker);
+                              const Device& device, ScreenInfo& info,
+                              ProgramManager& program_manager, StateTracker& state_tracker);
     ~RasterizerOpenGL() override;
 
     void Draw(bool is_indexed, bool is_instanced) override;
@@ -106,7 +106,7 @@ private:
     void SetupComputeConstBuffers(const Shader& kernel);
 
     /// Configures a constant buffer.
-    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
+    void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
                           const ConstBufferEntry& entry);
 
     /// Configures the current global memory entries to use for the draw command.
@@ -224,7 +224,7 @@ private:
 
     void SetupShaders(GLenum primitive_mode);
 
-    const Device device;
+    const Device& device;
 
     TextureCacheOpenGL texture_cache;
     ShaderCacheOpenGL shader_cache;
@@ -236,7 +236,7 @@ private:
 
     Core::System& system;
     ScreenInfo& screen_info;
-    GLShader::ProgramManager& program_manager;
+    ProgramManager& program_manager;
     StateTracker& state_tracker;
 
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@@ -248,6 +248,12 @@ private:
     std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
         enabled_transform_feedback_buffers;
 
+    static constexpr std::size_t NUM_CONSTANT_BUFFERS =
+        Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+        Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+    std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
+    std::size_t current_cbuf = 0;
+
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 97803d480..a787e27d2 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -125,6 +125,15 @@ void OGLProgram::Release() {
     handle = 0;
 }
 
+void OGLAssemblyProgram::Release() {
+    if (handle == 0) {
+        return;
+    }
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteProgramsARB(1, &handle);
+    handle = 0;
+}
+
 void OGLPipeline::Create() {
     if (handle != 0)
         return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index de93f4212..f8b322227 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -167,6 +167,22 @@ public:
     GLuint handle = 0;
 };
 
+class OGLAssemblyProgram : private NonCopyable {
+public:
+    OGLAssemblyProgram() = default;
+
+    OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLAssemblyProgram() {
+        Release();
+    }
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 class OGLPipeline : private NonCopyable {
 public:
     OGLPipeline() = default;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 9759a7078..4cd0f36cf 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
     return {};
 }
 
+constexpr GLenum AssemblyEnum(ShaderType shader_type) {
+    switch (shader_type) {
+    case ShaderType::Vertex:
+        return GL_VERTEX_PROGRAM_NV;
+    case ShaderType::TesselationControl:
+        return GL_TESS_CONTROL_PROGRAM_NV;
+    case ShaderType::TesselationEval:
+        return GL_TESS_EVALUATION_PROGRAM_NV;
+    case ShaderType::Geometry:
+        return GL_GEOMETRY_PROGRAM_NV;
+    case ShaderType::Fragment:
+        return GL_FRAGMENT_PROGRAM_NV;
+    case ShaderType::Compute:
+        return GL_COMPUTE_PROGRAM_NV;
+    }
+    return {};
+}
+
 std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
     return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }
@@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
     return registry;
 }
 
-std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
-                                        u64 unique_identifier, const ShaderIR& ir,
-                                        const Registry& registry, bool hint_retrievable = false) {
+ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
+                             const ShaderIR& ir, const Registry& registry,
+                             bool hint_retrievable = false) {
     const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
     LOG_INFO(Render_OpenGL, "{}", shader_id);
 
-    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
-    OGLShader shader;
-    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+    auto program = std::make_shared<ProgramHandle>();
+
+    if (device.UseAssemblyShaders()) {
+        const std::string arb = "Not implemented";
+
+        GLuint& arb_prog = program->assembly_program.handle;
+
+// Commented out functions signal OpenGL errors but are compatible with apitrace.
+// Use them only to capture and replay on apitrace.
+#if 0
+        glGenProgramsNV(1, &arb_prog);
+        glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
+                        reinterpret_cast<const GLubyte*>(arb.data()));
+#else
+        glGenProgramsARB(1, &arb_prog);
+        glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
+                                static_cast<GLsizei>(arb.size()), arb.data());
+#endif
+        const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
+        if (err && *err) {
+            LOG_CRITICAL(Render_OpenGL, "{}", err);
+            LOG_INFO(Render_OpenGL, "\n{}", arb);
+        }
+    } else {
+        const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
+        OGLShader shader;
+        shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+
+        program->source_program.Create(true, hint_retrievable, shader.handle);
+    }
 
-    auto program = std::make_shared<OGLProgram>();
-    program->Create(true, hint_retrievable, shader.handle);
     return program;
 }
 
@@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
                            std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
+                           ShaderEntries entries, ProgramSharedPtr program_)
     : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program)} {}
+      size_in_bytes{size_in_bytes}, program{std::move(program_)} {
+    // Assign either the assembly program or source program. We can't have both.
+    handle = program->assembly_program.handle;
+    if (handle == 0) {
+        handle = program->source_program.handle;
+    }
+    ASSERT(handle != 0);
+}
 
 CachedShader::~CachedShader() = default;
 
 GLuint CachedShader::GetHandle() const {
     DEBUG_ASSERT(registry->IsConsistent());
-    return program->handle;
+    return handle;
 }
 
 Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
@@ -239,7 +289,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         return;
     }
 
-    const std::vector gl_cache = disk_cache.LoadPrecompiled();
+    std::vector<ShaderDiskCachePrecompiled> gl_cache;
+    if (!device.UseAssemblyShaders()) {
+        // Only load precompiled cache when we are not using assembly shaders
+        gl_cache = disk_cache.LoadPrecompiled();
+    }
     const auto supported_formats = GetSupportedFormats();
 
     // Track if precompiled cache was altered during loading to know if we have to
@@ -278,7 +332,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
             auto registry = MakeRegistry(entry);
             const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
 
-            std::shared_ptr<OGLProgram> program;
+            ProgramSharedPtr program;
             if (precompiled_entry) {
                 // If the shader is precompiled, attempt to load it with
                 program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -332,6 +386,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         return;
     }
 
+    if (device.UseAssemblyShaders()) {
+        // Don't store precompiled binaries for assembly shaders.
+        return;
+    }
+
     // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
     // before precompiling them
 
@@ -339,7 +398,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
         const u64 id = (*transferable)[i].unique_identifier;
         const auto it = find_precompiled(id);
         if (it == gl_cache.end()) {
-            const GLuint program = runtime_cache.at(id).program->handle;
+            const GLuint program = runtime_cache.at(id).program->source_program.handle;
             disk_cache.SavePrecompiled(id, program);
             precompiled_cache_altered = true;
         }
@@ -350,7 +409,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
     }
 }
 
-std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
     const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
     const std::unordered_set<GLenum>& supported_formats) {
     if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -358,15 +417,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
         return {};
     }
 
-    auto program = std::make_shared<OGLProgram>();
-    program->handle = glCreateProgram();
-    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(program->handle, precompiled_entry.binary_format,
-                    precompiled_entry.binary.data(),
+    auto program = std::make_shared<ProgramHandle>();
+    GLuint& handle = program->source_program.handle;
+    handle = glCreateProgram();
+    glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
                     static_cast<GLsizei>(precompiled_entry.binary.size()));
 
     GLint link_status;
-    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
+    glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
     if (link_status == GL_FALSE) {
         LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
         return {};
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 91690b470..b2ae8d7f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -43,8 +43,14 @@ struct UnspecializedShader;
 using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
+struct ProgramHandle {
+    OGLProgram source_program;
+    OGLAssemblyProgram assembly_program;
+};
+using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
+
 struct PrecompiledShader {
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
 };
@@ -87,12 +93,13 @@ public:
 private:
     explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);
+                          ShaderEntries entries, ProgramSharedPtr program);
 
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
     std::size_t size_in_bytes = 0;
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
+    GLuint handle = 0;
 };
 
 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -115,7 +122,7 @@ protected:
     void FlushObjectInner(const Shader& object) override {}
 
 private:
-    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+    ProgramSharedPtr GeneratePrecompiledProgram(
         const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
         const std::unordered_set<GLenum>& supported_formats);
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 960ebf1a1..9cb115959 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1538,7 +1538,9 @@ private:
         Expression target;
         if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
-                // Writing to Register::ZeroIndex is a no op
+                // Writing to Register::ZeroIndex is a no op but we still have to visit the source
+                // as it might have side effects.
+                code.AddLine("{};", Visit(src).GetCode());
                 return {};
             }
             target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -2309,6 +2311,18 @@ private:
         return {"gl_SubGroupInvocationARB", Type::Uint};
     }
 
+    template <const std::string_view& comparison>
+    Expression ThreadMask(Operation) {
+        if (device.HasWarpIntrinsics()) {
+            return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
+        }
+        if (device.HasShaderBallot()) {
+            return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
+        }
+        LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
+        return {"0U", Type::Uint};
+    }
+
     Expression ShuffleIndexed(Operation operation) {
         std::string value = VisitOperand(operation, 0).AsFloat();
 
@@ -2321,7 +2335,21 @@ private:
         return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
     }
 
-    Expression MemoryBarrierGL(Operation) {
+    Expression Barrier(Operation) {
+        if (!ir.IsDecompiled()) {
+            LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
+            return {};
+        }
+        code.AddLine("barrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGroup(Operation) {
+        code.AddLine("groupMemoryBarrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGlobal(Operation) {
         code.AddLine("memoryBarrier();");
         return {};
     }
@@ -2337,6 +2365,12 @@ private:
         static constexpr std::string_view NotEqual = "!=";
         static constexpr std::string_view GreaterEqual = ">=";
 
+        static constexpr std::string_view Eq = "Eq";
+        static constexpr std::string_view Ge = "Ge";
+        static constexpr std::string_view Gt = "Gt";
+        static constexpr std::string_view Le = "Le";
+        static constexpr std::string_view Lt = "Lt";
+
         static constexpr std::string_view Add = "Add";
         static constexpr std::string_view Min = "Min";
         static constexpr std::string_view Max = "Max";
@@ -2554,9 +2588,16 @@ private:
         &GLSLDecompiler::VoteEqual,
 
         &GLSLDecompiler::ThreadId,
+        &GLSLDecompiler::ThreadMask<Func::Eq>,
+        &GLSLDecompiler::ThreadMask<Func::Ge>,
+        &GLSLDecompiler::ThreadMask<Func::Gt>,
+        &GLSLDecompiler::ThreadMask<Func::Le>,
+        &GLSLDecompiler::ThreadMask<Func::Lt>,
         &GLSLDecompiler::ShuffleIndexed,
 
-        &GLSLDecompiler::MemoryBarrierGL,
+        &GLSLDecompiler::Barrier,
+        &GLSLDecompiler::MemoryBarrierGroup,
+        &GLSLDecompiler::MemoryBarrierGlobal,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 9c7b0adbd..8e754fa90 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -6,45 +6,109 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 
-namespace OpenGL::GLShader {
+namespace OpenGL {
 
-ProgramManager::ProgramManager() = default;
+ProgramManager::ProgramManager(const Device& device) {
+    use_assembly_programs = device.UseAssemblyShaders();
+    if (use_assembly_programs) {
+        glEnable(GL_COMPUTE_PROGRAM_NV);
+    } else {
+        graphics_pipeline.Create();
+        glBindProgramPipeline(graphics_pipeline.handle);
+    }
+}
 
 ProgramManager::~ProgramManager() = default;
 
-void ProgramManager::Create() {
-    graphics_pipeline.Create();
-    glBindProgramPipeline(graphics_pipeline.handle);
+void ProgramManager::BindCompute(GLuint program) {
+    if (use_assembly_programs) {
+        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
+    } else {
+        is_graphics_bound = false;
+        glUseProgram(program);
+    }
 }
 
 void ProgramManager::BindGraphicsPipeline() {
-    if (!is_graphics_bound) {
-        is_graphics_bound = true;
-        glUseProgram(0);
+    if (use_assembly_programs) {
+        UpdateAssemblyPrograms();
+    } else {
+        UpdateSourcePrograms();
     }
+}
 
-    // Avoid updating the pipeline when values have no changed
-    if (old_state == current_state) {
-        return;
+void ProgramManager::BindHostPipeline(GLuint pipeline) {
+    if (use_assembly_programs) {
+        if (geometry_enabled) {
+            geometry_enabled = false;
+            old_state.geometry = 0;
+            glDisable(GL_GEOMETRY_PROGRAM_NV);
+        }
+    } else {
+        if (!is_graphics_bound) {
+            glUseProgram(0);
+        }
     }
+    glBindProgramPipeline(pipeline);
+}
 
-    // Workaround for AMD bug
-    static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
-                                            GL_FRAGMENT_SHADER_BIT};
-    const GLuint handle = graphics_pipeline.handle;
-    glUseProgramStages(handle, all_used_stages, 0);
-    glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
-    glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
-    glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+void ProgramManager::RestoreGuestPipeline() {
+    if (use_assembly_programs) {
+        glBindProgramPipeline(0);
+    } else {
+        glBindProgramPipeline(graphics_pipeline.handle);
+    }
+}
+
+void ProgramManager::UpdateAssemblyPrograms() {
+    const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        if (current == 0) {
+            if (enabled) {
+                enabled = false;
+                glDisable(stage);
+            }
+            return;
+        }
+        if (!enabled) {
+            enabled = true;
+            glEnable(stage);
+        }
+        glBindProgramARB(stage, current);
+    };
+
+    update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
+                 old_state.geometry);
+    update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
+                 old_state.fragment);
 
     old_state = current_state;
 }
 
-void ProgramManager::BindComputeShader(GLuint program) {
-    is_graphics_bound = false;
-    glUseProgram(program);
+void ProgramManager::UpdateSourcePrograms() {
+    if (!is_graphics_bound) {
+        is_graphics_bound = true;
+        glUseProgram(0);
+    }
+
+    const GLuint handle = graphics_pipeline.handle;
+    const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        glUseProgramStages(handle, stage, current);
+    };
+    update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
+    update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
+
+    old_state = current_state;
 }
 
 void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
@@ -54,4 +118,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
     y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
 }
 
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index d2e47f2a9..0f03b4f12 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -11,7 +11,9 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 
-namespace OpenGL::GLShader {
+namespace OpenGL {
+
+class Device;
 
 /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
 /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
 
 class ProgramManager {
 public:
-    explicit ProgramManager();
+    explicit ProgramManager(const Device& device);
     ~ProgramManager();
 
-    void Create();
+    /// Binds a compute program
+    void BindCompute(GLuint program);
 
-    /// Updates the graphics pipeline and binds it.
+    /// Updates bound programs.
     void BindGraphicsPipeline();
 
-    /// Binds a compute shader.
-    void BindComputeShader(GLuint program);
+    /// Binds an OpenGL pipeline object unsynchronized with the guest state.
+    void BindHostPipeline(GLuint pipeline);
+
+    /// Rewinds BindHostPipeline state changes.
+    void RestoreGuestPipeline();
 
     void UseVertexShader(GLuint program) {
-        current_state.vertex_shader = program;
+        current_state.vertex = program;
     }
 
     void UseGeometryShader(GLuint program) {
-        current_state.geometry_shader = program;
+        current_state.geometry = program;
     }
 
     void UseFragmentShader(GLuint program) {
-        current_state.fragment_shader = program;
+        current_state.fragment = program;
     }
 
 private:
     struct PipelineState {
-        bool operator==(const PipelineState& rhs) const noexcept {
-            return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
-                   geometry_shader == rhs.geometry_shader;
-        }
-
-        bool operator!=(const PipelineState& rhs) const noexcept {
-            return !operator==(rhs);
-        }
-
-        GLuint vertex_shader = 0;
-        GLuint fragment_shader = 0;
-        GLuint geometry_shader = 0;
+        GLuint vertex = 0;
+        GLuint geometry = 0;
+        GLuint fragment = 0;
     };
 
+    /// Update NV_gpu_program5 programs.
+    void UpdateAssemblyPrograms();
+
+    /// Update GLSL programs.
+    void UpdateSourcePrograms();
+
     OGLPipeline graphics_pipeline;
-    OGLPipeline compute_pipeline;
+
     PipelineState current_state;
     PipelineState old_state;
+
+    bool use_assembly_programs = false;
+
     bool is_graphics_bound = true;
+
+    bool vertex_enabled = false;
+    bool geometry_enabled = false;
+    bool fragment_enabled = false;
 };
 
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 94fbd2a22..4faa8b90c 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
 namespace {
 
 struct FormatTuple {
-    GLint internal_format;
+    GLenum internal_format;
     GLenum format = GL_NONE;
     GLenum type = GL_NONE;
 };
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
     return texture;
 }
 
+constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
+                            SwizzleSource w_source) {
+    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
 } // Anonymous namespace
 
 CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
@@ -381,7 +387,7 @@ void CachedSurface::DecorateSurfaceName() {
 }
 
 void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
-    LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
+    LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
 }
 
 View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -397,14 +403,13 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
 }
 
 CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
-                                     const bool is_proxy)
-    : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
-    target = GetTextureTarget(params.target);
-    format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
+                                     bool is_proxy)
+    : VideoCommon::ViewBase(params), surface{surface},
+      format{GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format},
+      target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
     if (!is_proxy) {
-        texture_view = CreateTextureView();
+        main_view = CreateTextureView();
     }
-    swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
 }
 
 CachedSurfaceView::~CachedSurfaceView() = default;
@@ -447,27 +452,49 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     }
 }
 
-void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
+GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
                                      SwizzleSource z_source, SwizzleSource w_source) {
-    u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
-    if (new_swizzle == swizzle)
-        return;
-    swizzle = new_swizzle;
-    const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
-                                   GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
-    const GLuint handle = GetTexture();
-    const PixelFormat format = surface.GetSurfaceParams().pixel_format;
-    switch (format) {
+    if (GetSurfaceParams().IsBuffer()) {
+        return GetTexture();
+    }
+    const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+    if (current_swizzle == new_swizzle) {
+        return current_view;
+    }
+    current_swizzle = new_swizzle;
+
+    const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
+    OGLTextureView& view = entry->second;
+    if (!is_cache_miss) {
+        current_view = view.handle;
+        return view.handle;
+    }
+    view = CreateTextureView();
+    current_view = view.handle;
+
+    std::array swizzle{x_source, y_source, z_source, w_source};
+
+    switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
     case PixelFormat::Z24S8:
     case PixelFormat::Z32FS8:
     case PixelFormat::S8Z24:
-        glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
+        UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
+        glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
                             GetComponent(format, x_source == SwizzleSource::R));
+
+        // Make sure we sample the first component
+        std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
+            return value == SwizzleSource::G ? SwizzleSource::R : value;
+        });
+        [[fallthrough]];
+    default: {
+        const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
+                                       GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
+        glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
         break;
-    default:
-        glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
-        break;
     }
+    }
+    return view.handle;
 }
 
 OGLTextureView CachedSurfaceView::CreateTextureView() const {
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 02d9981a1..8a2ac8603 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -83,7 +83,7 @@ public:
     /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
     void Attach(GLenum attachment, GLenum target) const;
 
-    void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
+    GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
                       Tegra::Texture::SwizzleSource y_source,
                       Tegra::Texture::SwizzleSource z_source,
                       Tegra::Texture::SwizzleSource w_source);
@@ -98,7 +98,7 @@ public:
         if (is_proxy) {
             return surface.GetTexture();
         }
-        return texture_view.handle;
+        return main_view.handle;
     }
 
     GLenum GetFormat() const {
@@ -110,23 +110,19 @@ public:
     }
 
 private:
-    u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
-                      Tegra::Texture::SwizzleSource y_source,
-                      Tegra::Texture::SwizzleSource z_source,
-                      Tegra::Texture::SwizzleSource w_source) const {
-        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
-               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
-    }
-
     OGLTextureView CreateTextureView() const;
 
     CachedSurface& surface;
-    GLenum target{};
-    GLenum format{};
+    const GLenum format;
+    const GLenum target;
+    const bool is_proxy;
+
+    std::unordered_map<u32, OGLTextureView> view_cache;
+    OGLTextureView main_view;
 
-    OGLTextureView texture_view;
-    u32 swizzle{};
-    bool is_proxy{};
+    // Use an invalid default so it always fails the comparison test
+    u32 current_swizzle = 0xffffffff;
+    GLuint current_view = 0;
 };
 
 class TextureCacheOpenGL final : public TextureCacheBase {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index acabc3497..e7952924a 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -316,7 +316,7 @@ public:
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
                                Core::Frontend::GraphicsContext& context)
     : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
-      has_debug_tool{HasDebugTool()} {}
+      program_manager{device}, has_debug_tool{HasDebugTool()} {}
 
 RendererOpenGL::~RendererOpenGL() = default;
 
@@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
     vertex_program.Create(true, false, vertex_shader.handle);
     fragment_program.Create(true, false, fragment_shader.handle);
 
-    // Create program pipeline
-    program_manager.Create();
+    pipeline.Create();
+    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
+    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
 
     // Generate VBO handle for drawing
     vertex_buffer.Create();
@@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
     if (rasterizer) {
         return;
     }
-    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
+    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
                                                     program_manager, state_tracker);
 }
 
@@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     state_tracker.NotifyClipControl();
     state_tracker.NotifyAlphaTest();
 
-    program_manager.UseVertexShader(vertex_program.handle);
-    program_manager.UseGeometryShader(0);
-    program_manager.UseFragmentShader(fragment_program.handle);
-    program_manager.BindGraphicsPipeline();
+    program_manager.BindHostPipeline(pipeline.handle);
 
     glEnable(GL_CULL_FACE);
     if (screen_info.display_srgb) {
@@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
 
     glClear(GL_COLOR_BUFFER_BIT);
     glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+    program_manager.RestoreGuestPipeline();
 }
 
 bool RendererOpenGL::TryPresent(int timeout_ms) {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 50b647661..61bf507f4 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "common/math_util.h"
 #include "video_core/renderer_base.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -95,6 +96,7 @@ private:
     Core::Frontend::EmuWindow& emu_window;
     Core::System& system;
     Core::Frontend::GraphicsContext& context;
+    const Device device;
 
     StateTracker state_tracker{system};
 
@@ -102,13 +104,14 @@ private:
     OGLBuffer vertex_buffer;
     OGLProgram vertex_program;
     OGLProgram fragment_program;
+    OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
     /// Display information for Switch screen
     ScreenInfo screen_info;
 
     /// Global dummy shader pipeline
-    GLShader::ProgramManager program_manager;
+    ProgramManager program_manager;
 
     /// OpenGL framebuffer data
     std::vector<u8> gl_framebuffer_data;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 12be691a5..2871035f5 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -142,7 +142,7 @@ struct FormatTuple {
     {VK_FORMAT_BC6H_UFLOAT_BLOCK},                              // BC6H_UF16
     {VK_FORMAT_BC6H_SFLOAT_BLOCK},                              // BC6H_SF16
     {VK_FORMAT_ASTC_4x4_UNORM_BLOCK},                           // ASTC_2D_4X4
-    {VK_FORMAT_B8G8R8A8_UNORM},                                 // BGRA8
+    {VK_FORMAT_B8G8R8A8_UNORM, Attachable},                     // BGRA8
     {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage},      // RGBA32F
     {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage},            // RG32F
     {VK_FORMAT_R32_SFLOAT, Attachable | Storage},               // R32F
@@ -168,7 +168,7 @@ struct FormatTuple {
     {VK_FORMAT_ASTC_8x8_UNORM_BLOCK},                           // ASTC_2D_8X8
     {VK_FORMAT_UNDEFINED},                                      // ASTC_2D_8X5
     {VK_FORMAT_UNDEFINED},                                      // ASTC_2D_5X4
-    {VK_FORMAT_UNDEFINED},                                      // BGRA8_SRGB
+    {VK_FORMAT_B8G8R8A8_SRGB, Attachable},                      // BGRA8_SRGB
     {VK_FORMAT_BC1_RGBA_SRGB_BLOCK},                            // DXT1_SRGB
     {VK_FORMAT_BC2_SRGB_BLOCK},                                 // DXT23_SRGB
     {VK_FORMAT_BC3_SRGB_BLOCK},                                 // DXT45_SRGB
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5b494da8c..5f33d9e40 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -7,6 +7,7 @@
 #include <memory>
 
 #include "core/core.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index f0c491d00..750e5a0ca 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -104,6 +104,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
                                         VK_FORMAT_R16_SFLOAT,
                                         VK_FORMAT_R16G16B16A16_SFLOAT,
                                         VK_FORMAT_B8G8R8A8_UNORM,
+                                        VK_FORMAT_B8G8R8A8_SRGB,
                                         VK_FORMAT_R4G4B4A4_UNORM_PACK16,
                                         VK_FORMAT_D32_SFLOAT,
                                         VK_FORMAT_D16_UNORM,
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h
index 04d07fe6a..043fe7947 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -7,6 +7,7 @@
 #include <memory>
 
 #include "video_core/fence_manager.h"
+#include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/wrapper.h"
 
 namespace Core {
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index fe45ed269..a5c7b7945 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -329,8 +329,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
 
         const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
         const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        ASSERT(cpu_addr);
-        const auto shader = TryGet(*cpu_addr);
+        const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
         ASSERT(shader);
 
         const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 17a2efe8e..be5b77fae 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -532,14 +532,14 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
         return;
     }
     texture_cache.OnCPUWrite(addr, size);
-    pipeline_cache.InvalidateRegion(addr, size);
+    pipeline_cache.OnCPUWrite(addr, size);
     buffer_cache.OnCPUWrite(addr, size);
-    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerVulkan::SyncGuestHost() {
     texture_cache.SyncGuestHost();
     buffer_cache.SyncGuestHost();
+    pipeline_cache.SyncGuestHost();
 }
 
 void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 167e20e91..6f6dedd82 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -515,6 +515,16 @@ private:
     void DeclareCommon() {
         thread_id =
             DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
+        thread_masks[0] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
+        thread_masks[1] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
+        thread_masks[2] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
+        thread_masks[3] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
+        thread_masks[4] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
     }
 
     void DeclareVertex() {
@@ -1071,8 +1081,7 @@ private:
 
     void VisitBasicBlock(const NodeBlock& bb) {
         for (const auto& node : bb) {
-            [[maybe_unused]] const Type type = Visit(node).type;
-            ASSERT(type == Type::Void);
+            Visit(node);
         }
     }
 
@@ -1362,7 +1371,9 @@ private:
         Expression target{};
         if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
-                // Writing to Register::ZeroIndex is a no op
+                // Writing to Register::ZeroIndex is a no op but we still have to visit its source
+                // because it might have side effects.
+                Visit(src);
                 return {};
             }
             target = {registers.at(gpr->GetIndex()), Type::Float};
@@ -2175,14 +2186,37 @@ private:
         return {OpLoad(t_uint, thread_id), Type::Uint};
     }
 
+    template <std::size_t index>
+    Expression ThreadMask(Operation) {
+        // TODO(Rodrigo): Handle devices with different warp sizes
+        const Id mask = thread_masks[index];
+        return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
+    }
+
     Expression ShuffleIndexed(Operation operation) {
         const Id value = AsFloat(Visit(operation[0]));
         const Id index = AsUint(Visit(operation[1]));
         return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float};
     }
 
-    Expression MemoryBarrierGL(Operation) {
-        const auto scope = spv::Scope::Device;
+    Expression Barrier(Operation) {
+        if (!ir.IsDecompiled()) {
+            LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled");
+            return {};
+        }
+
+        const auto scope = spv::Scope::Workgroup;
+        const auto memory = spv::Scope::Workgroup;
+        const auto semantics =
+            spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
+        OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
+                         Constant(t_uint, static_cast<u32>(memory)),
+                         Constant(t_uint, static_cast<u32>(semantics)));
+        return {};
+    }
+
+    template <spv::Scope scope>
+    Expression MemoryBarrier(Operation) {
         const auto semantics =
             spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
             spv::MemorySemanticsMask::WorkgroupMemory |
@@ -2639,9 +2673,16 @@ private:
         &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,
 
         &SPIRVDecompiler::ThreadId,
+        &SPIRVDecompiler::ThreadMask<0>, // Eq
+        &SPIRVDecompiler::ThreadMask<1>, // Ge
+        &SPIRVDecompiler::ThreadMask<2>, // Gt
+        &SPIRVDecompiler::ThreadMask<3>, // Le
+        &SPIRVDecompiler::ThreadMask<4>, // Lt
         &SPIRVDecompiler::ShuffleIndexed,
 
-        &SPIRVDecompiler::MemoryBarrierGL,
+        &SPIRVDecompiler::Barrier,
+        &SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
+        &SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
@@ -2763,6 +2804,7 @@ private:
     Id workgroup_id{};
     Id local_invocation_id{};
     Id thread_id{};
+    std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt
 
     VertexIndices in_indices;
     VertexIndices out_indices;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 55f43e61b..2f1d5021d 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -354,26 +354,23 @@ CachedSurfaceView::~CachedSurfaceView() = default;
 
 VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
                                          SwizzleSource z_source, SwizzleSource w_source) {
-    const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
-    if (last_image_view && last_swizzle == swizzle) {
+    const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+    if (last_image_view && last_swizzle == new_swizzle) {
         return last_image_view;
     }
-    last_swizzle = swizzle;
+    last_swizzle = new_swizzle;
 
-    const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle);
+    const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
     auto& image_view = entry->second;
     if (!is_cache_miss) {
         return last_image_view = *image_view;
     }
 
-    auto swizzle_x = MaxwellToVK::SwizzleSource(x_source);
-    auto swizzle_y = MaxwellToVK::SwizzleSource(y_source);
-    auto swizzle_z = MaxwellToVK::SwizzleSource(z_source);
-    auto swizzle_w = MaxwellToVK::SwizzleSource(w_source);
-
+    std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source),
+                       MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)};
     if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) {
         // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here.
-        std::swap(swizzle_x, swizzle_z);
+        std::swap(swizzle[0], swizzle[2]);
     }
 
     // Games can sample depth or stencil values on textures. This is decided by the swizzle value on
@@ -395,11 +392,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
             UNIMPLEMENTED();
         }
 
-        // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity
-        swizzle_x = VK_COMPONENT_SWIZZLE_R;
-        swizzle_y = VK_COMPONENT_SWIZZLE_G;
-        swizzle_z = VK_COMPONENT_SWIZZLE_B;
-        swizzle_w = VK_COMPONENT_SWIZZLE_A;
+        // Make sure we sample the first component
+        std::transform(
+            swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) {
+                return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component;
+            });
     }
 
     VkImageViewCreateInfo ci;
@@ -409,7 +406,7 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
     ci.image = surface.GetImageHandle();
     ci.viewType = image_view_type;
     ci.format = surface.GetImage().GetFormat();
-    ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
+    ci.components = {swizzle[0], swizzle[1], swizzle[2], swizzle[3]};
     ci.subresourceRange.aspectMask = aspect;
     ci.subresourceRange.baseMipLevel = base_level;
     ci.subresourceRange.levelCount = num_levels;
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index 9392f065b..63adbc4a3 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
     }
     case OpCode::Id::RED: {
         UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
-        UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
         const auto [real_address, base_address, descriptor] =
             TrackGlobalMemory(bb, instr, true, true);
         if (!real_address || !base_address) {
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }
         Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
         Node value = GetRegister(instr.gpr0);
-        bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
+        bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
         break;
     }
     case OpCode::Id::ATOM: {
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d4f95b18c..d00e10913 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                 return Operation(OperationCode::WorkGroupIdY);
             case SystemVariable::CtaIdZ:
                 return Operation(OperationCode::WorkGroupIdZ);
+            case SystemVariable::EqMask:
+            case SystemVariable::LtMask:
+            case SystemVariable::LeMask:
+            case SystemVariable::GtMask:
+            case SystemVariable::GeMask:
+                uses_warps = true;
+                switch (instr.sys20) {
+                case SystemVariable::EqMask:
+                    return Operation(OperationCode::ThreadEqMask);
+                case SystemVariable::LtMask:
+                    return Operation(OperationCode::ThreadLtMask);
+                case SystemVariable::LeMask:
+                    return Operation(OperationCode::ThreadLeMask);
+                case SystemVariable::GtMask:
+                    return Operation(OperationCode::ThreadGtMask);
+                case SystemVariable::GeMask:
+                    return Operation(OperationCode::ThreadGeMask);
+                default:
+                    UNREACHABLE();
+                    return Immediate(0u);
+                }
             default:
                 UNIMPLEMENTED_MSG("Unhandled system move: {}",
                                   static_cast<u32>(instr.sys20.Value()));
@@ -272,10 +293,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
         break;
     }
+    case OpCode::Id::BAR: {
+        UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
+        bb.push_back(Operation(OperationCode::Barrier));
+        break;
+    }
     case OpCode::Id::MEMBAR: {
-        UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
         UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
-        bb.push_back(Operation(OperationCode::MemoryBarrierGL));
+        const OperationCode type = [instr] {
+            switch (instr.membar.type) {
+            case Tegra::Shader::MembarType::CTA:
+                return OperationCode::MemoryBarrierGroup;
+            case Tegra::Shader::MembarType::GL:
+                return OperationCode::MemoryBarrierGlobal;
+            default:
+                UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
+                return OperationCode::MemoryBarrierGlobal;
+            }
+        }();
+        bb.push_back(Operation(type));
         break;
     }
     case OpCode::Id::DEPBAR: {
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index f75b62240..c5e5165ff 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -226,9 +226,16 @@ enum class OperationCode {
     VoteEqual,    /// (bool) -> bool
 
     ThreadId,       /// () -> uint
+    ThreadEqMask,   /// () -> uint
+    ThreadGeMask,   /// () -> uint
+    ThreadGtMask,   /// () -> uint
+    ThreadLeMask,   /// () -> uint
+    ThreadLtMask,   /// () -> uint
     ShuffleIndexed, /// (uint value, uint index) -> uint
 
-    MemoryBarrierGL, /// () -> void
+    Barrier,             /// () -> void
+    MemoryBarrierGroup,  /// () -> void
+    MemoryBarrierGlobal, /// () -> void
 
     Amount,
 };
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d6efc34b2..45e3ddd2c 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -14,6 +14,7 @@
 #include <unordered_map>
 #include <vector>
 
+#include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_map.hpp>
 #include <boost/range/iterator_range.hpp>
 
@@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
 
 template <typename TSurface, typename TView>
 class TextureCache {
+    using VectorSurface = boost::container::small_vector<TSurface, 1>;
 
 public:
     void InvalidateRegion(VAddr addr, std::size_t size) {
@@ -308,18 +310,20 @@ public:
         dst_surface.first->MarkAsModified(true, Tick());
     }
 
-    TSurface TryFindFramebufferSurface(VAddr addr) {
+    TSurface TryFindFramebufferSurface(VAddr addr) const {
         if (!addr) {
             return nullptr;
         }
         const VAddr page = addr >> registry_page_bits;
-        std::vector<TSurface>& list = registry[page];
-        for (auto& surface : list) {
-            if (surface->GetCpuAddr() == addr) {
-                return surface;
-            }
+        const auto it = registry.find(page);
+        if (it == registry.end()) {
+            return nullptr;
         }
-        return nullptr;
+        const auto& list = it->second;
+        const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
+            return surface->GetCpuAddr() == addr;
+        });
+        return found != list.end() ? *found : nullptr;
     }
 
     u64 Tick() {
@@ -498,7 +502,7 @@ private:
      * @param untopological Indicates to the recycler that the texture has no way
      *                      to match the overlaps due to topological reasons.
      **/
-    RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params,
+    RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
                                  const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
         if (Settings::IsGPULevelExtreme()) {
             return RecycleStrategy::Flush;
@@ -538,9 +542,8 @@ private:
      * @param untopological     Indicates to the recycler that the texture has no way to match the
      *                          overlaps due to topological reasons.
      **/
-    std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
-                                              const SurfaceParams& params, const GPUVAddr gpu_addr,
-                                              const bool preserve_contents,
+    std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
+                                              const GPUVAddr gpu_addr, const bool preserve_contents,
                                               const MatchTopologyResult untopological) {
         const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
         for (auto& surface : overlaps) {
@@ -650,7 +653,7 @@ private:
      * @param params   The parameters on the new surface.
      * @param gpu_addr The starting address of the new surface.
      **/
-    std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps,
+    std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
                                                                     const SurfaceParams& params,
                                                                     const GPUVAddr gpu_addr) {
         if (params.target == SurfaceTarget::Texture3D) {
@@ -708,7 +711,7 @@ private:
      * @param preserve_contents Indicates that the new surface should be loaded from memory or
      *                          left blank.
      */
-    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
+    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
                                                                const SurfaceParams& params,
                                                                const GPUVAddr gpu_addr,
                                                                const VAddr cpu_addr,
@@ -810,7 +813,7 @@ private:
             TSurface& current_surface = iter->second;
             const auto topological_result = current_surface->MatchesTopology(params);
             if (topological_result != MatchTopologyResult::FullMatch) {
-                std::vector<TSurface> overlaps{current_surface};
+                VectorSurface overlaps{current_surface};
                 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                       topological_result);
             }
@@ -991,7 +994,9 @@ private:
         params.target = target;
         params.is_tiled = false;
         params.srgb_conversion = false;
-        params.is_layered = false;
+        params.is_layered =
+            target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray ||
+            target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray;
         params.block_width = 0;
         params.block_height = 0;
         params.block_depth = 0;
@@ -1124,23 +1129,25 @@ private:
         }
     }
 
-    std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
+    VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
         if (size == 0) {
             return {};
         }
         const VAddr cpu_addr_end = cpu_addr + size;
-        VAddr start = cpu_addr >> registry_page_bits;
         const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
-        std::vector<TSurface> surfaces;
-        while (start <= end) {
-            std::vector<TSurface>& list = registry[start];
-            for (auto& surface : list) {
-                if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) {
-                    surface->MarkAsPicked(true);
-                    surfaces.push_back(surface);
+        VectorSurface surfaces;
+        for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
+            const auto it = registry.find(start);
+            if (it == registry.end()) {
+                continue;
+            }
+            for (auto& surface : it->second) {
+                if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
+                    continue;
                 }
+                surface->MarkAsPicked(true);
+                surfaces.push_back(surface);
             }
-            start++;
         }
         for (auto& surface : surfaces) {
             surface->MarkAsPicked(false);
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 27775701d..b08b87426 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -643,6 +643,8 @@ void Config::ReadRendererValues() {
     Settings::values.use_asynchronous_gpu_emulation =
         ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
     Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
+    Settings::values.use_assembly_shaders =
+        ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
     Settings::values.use_fast_gpu_time =
         ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
     Settings::values.force_30fps_mode =
@@ -1090,6 +1092,8 @@ void Config::SaveRendererValues() {
     WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
                  Settings::values.use_asynchronous_gpu_emulation, false);
     WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
+    WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
+                 false);
     WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
     WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);
 
diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp
index 5bb2ae555..37aadf7f8 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.cpp
+++ b/src/yuzu/configuration/configure_graphics_advanced.cpp
@@ -12,6 +12,9 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
 
     ui->setupUi(this);
 
+    // TODO: Remove this after assembly shaders are fully integrated
+    ui->use_assembly_shaders->setVisible(false);
+
     SetConfiguration();
 }
 
@@ -22,6 +25,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() {
     ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
     ui->use_vsync->setEnabled(runtime_lock);
     ui->use_vsync->setChecked(Settings::values.use_vsync);
+    ui->use_assembly_shaders->setEnabled(runtime_lock);
+    ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
     ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
     ui->force_30fps_mode->setEnabled(runtime_lock);
     ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
@@ -33,6 +38,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() {
     auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
     Settings::values.gpu_accuracy = gpu_accuracy;
     Settings::values.use_vsync = ui->use_vsync->isChecked();
+    Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
     Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
     Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
     Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();
diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui
index 770b80c50..0021607ac 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.ui
+++ b/src/yuzu/configuration/configure_graphics_advanced.ui
@@ -63,6 +63,16 @@
          </widget>
         </item>
         <item>
+         <widget class="QCheckBox" name="use_assembly_shaders">
+          <property name="toolTip">
+           <string>Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental.</string>
+          </property>
+          <property name="text">
+           <string>Use assembly shaders (experimental, Nvidia OpenGL only)</string>
+          </property>
+         </widget>
+        </item>
+        <item>
          <widget class="QCheckBox" name="force_30fps_mode">
           <property name="text">
            <string>Force 30 FPS mode</string>
diff --git a/src/yuzu/discord_impl.cpp b/src/yuzu/discord_impl.cpp
index ea0079353..a93733b26 100644
--- a/src/yuzu/discord_impl.cpp
+++ b/src/yuzu/discord_impl.cpp
@@ -18,7 +18,7 @@ DiscordImpl::DiscordImpl() {
 
     // The number is the client ID for yuzu, it's used for images and the
     // application name
-    Discord_Initialize("471872241299226636", &handlers, 1, nullptr);
+    Discord_Initialize("712465656758665259", &handlers, 1, nullptr);
 }
 
 DiscordImpl::~DiscordImpl() {
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 0b291c7d0..270cccc77 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -65,6 +65,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
 #include "common/logging/backend.h"
 #include "common/logging/filter.h"
 #include "common/logging/log.h"
+#include "common/memory_detect.h"
 #include "common/microprofile.h"
 #include "common/scm_rev.h"
 #include "common/scope_exit.h"
@@ -219,6 +220,10 @@ GMainWindow::GMainWindow()
     LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string);
 #endif
     LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString());
+    LOG_INFO(Frontend, "Host RAM: {:.2f} GB",
+             Common::GetMemInfo().TotalPhysicalMemory / 1024.0f / 1024 / 1024);
+    LOG_INFO(Frontend, "Host Swap: {:.2f} GB",
+             Common::GetMemInfo().TotalSwapMemory / 1024.0f / 1024 / 1024);
     UpdateWindowTitle();
 
     show();
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index 2348e6e0d..c20d48c42 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -397,6 +397,8 @@ void Config::ReadValues() {
         sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
     Settings::values.use_vsync =
         static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
+    Settings::values.use_assembly_shaders =
+        sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
     Settings::values.use_fast_gpu_time =
         sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);
 
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index ae94b51c4..abc6e6e65 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -134,6 +134,10 @@ max_anisotropy =
 # 0 (default): Off, 1: On
 use_vsync =
 
+# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required.
+# 0 (default): Off, 1: On
+use_assembly_shaders =
+
 # Turns on the frame limiter, which will limit frames output to the target game speed
 # 0: Off, 1: On (default)
 use_frame_limit =